Merge branch 'develop' into release/v0.7.5
This commit is contained in:
@@ -103,7 +103,8 @@ from .browser_adapter import (
|
|||||||
|
|
||||||
from .utils import (
|
from .utils import (
|
||||||
start_colab_display_server,
|
start_colab_display_server,
|
||||||
setup_colab_environment
|
setup_colab_environment,
|
||||||
|
hooks_to_string
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -183,6 +184,7 @@ __all__ = [
|
|||||||
"ProxyConfig",
|
"ProxyConfig",
|
||||||
"start_colab_display_server",
|
"start_colab_display_server",
|
||||||
"setup_colab_environment",
|
"setup_colab_environment",
|
||||||
|
"hooks_to_string",
|
||||||
# C4A Script additions
|
# C4A Script additions
|
||||||
"c4a_compile",
|
"c4a_compile",
|
||||||
"c4a_validate",
|
"c4a_validate",
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Optional, Union, AsyncGenerator, Dict, Any
|
from typing import List, Optional, Union, AsyncGenerator, Dict, Any, Callable
|
||||||
import httpx
|
import httpx
|
||||||
import json
|
import json
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
@@ -7,6 +7,7 @@ import asyncio
|
|||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from .models import CrawlResult
|
from .models import CrawlResult
|
||||||
from .async_logger import AsyncLogger, LogLevel
|
from .async_logger import AsyncLogger, LogLevel
|
||||||
|
from .utils import hooks_to_string
|
||||||
|
|
||||||
|
|
||||||
class Crawl4aiClientError(Exception):
|
class Crawl4aiClientError(Exception):
|
||||||
@@ -70,17 +71,41 @@ class Crawl4aiDockerClient:
|
|||||||
self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
|
self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
|
||||||
raise ConnectionError(f"Cannot connect to server: {str(e)}")
|
raise ConnectionError(f"Cannot connect to server: {str(e)}")
|
||||||
|
|
||||||
def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
|
def _prepare_request(
|
||||||
crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
|
self,
|
||||||
|
urls: List[str],
|
||||||
|
browser_config: Optional[BrowserConfig] = None,
|
||||||
|
crawler_config: Optional[CrawlerRunConfig] = None,
|
||||||
|
hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
|
||||||
|
hooks_timeout: int = 30
|
||||||
|
) -> Dict[str, Any]:
|
||||||
"""Prepare request data from configs."""
|
"""Prepare request data from configs."""
|
||||||
if self._token:
|
if self._token:
|
||||||
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
|
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
|
||||||
return {
|
|
||||||
|
request_data = {
|
||||||
"urls": urls,
|
"urls": urls,
|
||||||
"browser_config": browser_config.dump() if browser_config else {},
|
"browser_config": browser_config.dump() if browser_config else {},
|
||||||
"crawler_config": crawler_config.dump() if crawler_config else {}
|
"crawler_config": crawler_config.dump() if crawler_config else {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Handle hooks if provided
|
||||||
|
if hooks:
|
||||||
|
# Check if hooks are already strings or need conversion
|
||||||
|
if any(callable(v) for v in hooks.values()):
|
||||||
|
# Convert function objects to strings
|
||||||
|
hooks_code = hooks_to_string(hooks)
|
||||||
|
else:
|
||||||
|
# Already in string format
|
||||||
|
hooks_code = hooks
|
||||||
|
|
||||||
|
request_data["hooks"] = {
|
||||||
|
"code": hooks_code,
|
||||||
|
"timeout": hooks_timeout
|
||||||
|
}
|
||||||
|
|
||||||
|
return request_data
|
||||||
|
|
||||||
async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
|
async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
|
||||||
"""Make an HTTP request with error handling."""
|
"""Make an HTTP request with error handling."""
|
||||||
url = urljoin(self.base_url, endpoint)
|
url = urljoin(self.base_url, endpoint)
|
||||||
@@ -102,16 +127,42 @@ class Crawl4aiDockerClient:
|
|||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
browser_config: Optional[BrowserConfig] = None,
|
browser_config: Optional[BrowserConfig] = None,
|
||||||
crawler_config: Optional[CrawlerRunConfig] = None
|
crawler_config: Optional[CrawlerRunConfig] = None,
|
||||||
|
hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
|
||||||
|
hooks_timeout: int = 30
|
||||||
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||||
"""Execute a crawl operation."""
|
"""
|
||||||
|
Execute a crawl operation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
urls: List of URLs to crawl
|
||||||
|
browser_config: Browser configuration
|
||||||
|
crawler_config: Crawler configuration
|
||||||
|
hooks: Optional hooks - can be either:
|
||||||
|
- Dict[str, Callable]: Function objects that will be converted to strings
|
||||||
|
- Dict[str, str]: Already stringified hook code
|
||||||
|
hooks_timeout: Timeout in seconds for each hook execution (1-120)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Single CrawlResult, list of results, or async generator for streaming
|
||||||
|
|
||||||
|
Example with function hooks:
|
||||||
|
>>> async def my_hook(page, context, **kwargs):
|
||||||
|
... await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
... return page
|
||||||
|
>>>
|
||||||
|
>>> result = await client.crawl(
|
||||||
|
... ["https://example.com"],
|
||||||
|
... hooks={"on_page_context_created": my_hook}
|
||||||
|
... )
|
||||||
|
"""
|
||||||
await self._check_server()
|
await self._check_server()
|
||||||
|
|
||||||
data = self._prepare_request(urls, browser_config, crawler_config)
|
data = self._prepare_request(urls, browser_config, crawler_config, hooks, hooks_timeout)
|
||||||
is_streaming = crawler_config and crawler_config.stream
|
is_streaming = crawler_config and crawler_config.stream
|
||||||
|
|
||||||
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
|
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
|
||||||
|
|
||||||
if is_streaming:
|
if is_streaming:
|
||||||
async def stream_results() -> AsyncGenerator[CrawlResult, None]:
|
async def stream_results() -> AsyncGenerator[CrawlResult, None]:
|
||||||
async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
|
async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
|
||||||
@@ -128,12 +179,12 @@ class Crawl4aiDockerClient:
|
|||||||
else:
|
else:
|
||||||
yield CrawlResult(**result)
|
yield CrawlResult(**result)
|
||||||
return stream_results()
|
return stream_results()
|
||||||
|
|
||||||
response = await self._request("POST", "/crawl", json=data)
|
response = await self._request("POST", "/crawl", json=data)
|
||||||
result_data = response.json()
|
result_data = response.json()
|
||||||
if not result_data.get("success", False):
|
if not result_data.get("success", False):
|
||||||
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
|
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
|
||||||
|
|
||||||
results = [CrawlResult(**r) for r in result_data.get("results", [])]
|
results = [CrawlResult(**r) for r in result_data.get("results", [])]
|
||||||
self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
|
self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
|
||||||
return results[0] if len(results) == 1 else results
|
return results[0] if len(results) == 1 else results
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ from urllib.parse import (
|
|||||||
urljoin, urlparse, urlunparse,
|
urljoin, urlparse, urlunparse,
|
||||||
parse_qsl, urlencode, quote, unquote
|
parse_qsl, urlencode, quote, unquote
|
||||||
)
|
)
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
|
||||||
# Monkey patch to fix wildcard handling in urllib.robotparser
|
# Monkey patch to fix wildcard handling in urllib.robotparser
|
||||||
@@ -3529,4 +3530,52 @@ def get_memory_stats() -> Tuple[float, float, float]:
|
|||||||
available_gb = get_true_available_memory_gb()
|
available_gb = get_true_available_memory_gb()
|
||||||
used_percent = get_true_memory_usage_percent()
|
used_percent = get_true_memory_usage_percent()
|
||||||
|
|
||||||
return used_percent, available_gb, total_gb
|
return used_percent, available_gb, total_gb
|
||||||
|
|
||||||
|
|
||||||
|
# Hook utilities for Docker API
|
||||||
|
def hooks_to_string(hooks: Dict[str, Callable]) -> Dict[str, str]:
|
||||||
|
"""
|
||||||
|
Convert hook function objects to string representations for Docker API.
|
||||||
|
|
||||||
|
This utility simplifies the process of using hooks with the Docker API by converting
|
||||||
|
Python function objects into the string format required by the API.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
hooks: Dictionary mapping hook point names to Python function objects.
|
||||||
|
Functions should be async and follow hook signature requirements.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary mapping hook point names to string representations of the functions.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> async def my_hook(page, context, **kwargs):
|
||||||
|
... await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
... return page
|
||||||
|
>>>
|
||||||
|
>>> hooks_dict = {"on_page_context_created": my_hook}
|
||||||
|
>>> api_hooks = hooks_to_string(hooks_dict)
|
||||||
|
>>> # api_hooks is now ready to use with Docker API
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If a hook is not callable or source cannot be extracted
|
||||||
|
"""
|
||||||
|
result = {}
|
||||||
|
|
||||||
|
for hook_name, hook_func in hooks.items():
|
||||||
|
if not callable(hook_func):
|
||||||
|
raise ValueError(f"Hook '{hook_name}' must be a callable function, got {type(hook_func)}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Get the source code of the function
|
||||||
|
source = inspect.getsource(hook_func)
|
||||||
|
# Remove any leading indentation to get clean source
|
||||||
|
source = textwrap.dedent(source)
|
||||||
|
result[hook_name] = source
|
||||||
|
except (OSError, TypeError) as e:
|
||||||
|
raise ValueError(
|
||||||
|
f"Cannot extract source code for hook '{hook_name}'. "
|
||||||
|
f"Make sure the function is defined in a file (not interactively). Error: {e}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|||||||
522
docs/examples/docker_client_hooks_example.py
Normal file
522
docs/examples/docker_client_hooks_example.py
Normal file
@@ -0,0 +1,522 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Comprehensive hooks examples using Docker Client with function objects.
|
||||||
|
|
||||||
|
This approach is recommended because:
|
||||||
|
- Write hooks as regular Python functions
|
||||||
|
- Full IDE support (autocomplete, type checking)
|
||||||
|
- Automatic conversion to API format
|
||||||
|
- Reusable and testable code
|
||||||
|
- Clean, readable syntax
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
# API_BASE_URL = "http://localhost:11235"
|
||||||
|
API_BASE_URL = "http://localhost:11234"
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Hook Function Definitions
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# --- All Hooks Demo ---
|
||||||
|
async def browser_created_hook(browser, **kwargs):
|
||||||
|
"""Called after browser is created"""
|
||||||
|
print("[HOOK] Browser created and ready")
|
||||||
|
return browser
|
||||||
|
|
||||||
|
|
||||||
|
async def page_context_hook(page, context, **kwargs):
|
||||||
|
"""Setup page environment"""
|
||||||
|
print("[HOOK] Setting up page environment")
|
||||||
|
|
||||||
|
# Set viewport
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
|
||||||
|
# Add cookies
|
||||||
|
await context.add_cookies([{
|
||||||
|
"name": "test_session",
|
||||||
|
"value": "abc123xyz",
|
||||||
|
"domain": ".httpbin.org",
|
||||||
|
"path": "/"
|
||||||
|
}])
|
||||||
|
|
||||||
|
# Block resources
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort())
|
||||||
|
await context.route("**/analytics/*", lambda route: route.abort())
|
||||||
|
|
||||||
|
print("[HOOK] Environment configured")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def user_agent_hook(page, context, user_agent, **kwargs):
|
||||||
|
"""Called when user agent is updated"""
|
||||||
|
print(f"[HOOK] User agent: {user_agent[:50]}...")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def before_goto_hook(page, context, url, **kwargs):
|
||||||
|
"""Called before navigating to URL"""
|
||||||
|
print(f"[HOOK] Navigating to: {url}")
|
||||||
|
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
"X-Custom-Header": "crawl4ai-test",
|
||||||
|
"Accept-Language": "en-US"
|
||||||
|
})
|
||||||
|
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def after_goto_hook(page, context, url, response, **kwargs):
|
||||||
|
"""Called after page loads"""
|
||||||
|
print(f"[HOOK] Page loaded: {url}")
|
||||||
|
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await page.wait_for_selector("body", timeout=2000)
|
||||||
|
print("[HOOK] Body element ready")
|
||||||
|
except:
|
||||||
|
print("[HOOK] Timeout, continuing")
|
||||||
|
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def execution_started_hook(page, context, **kwargs):
|
||||||
|
"""Called when custom JS execution starts"""
|
||||||
|
print("[HOOK] JS execution started")
|
||||||
|
await page.evaluate("console.log('[HOOK] Custom JS');")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def before_retrieve_hook(page, context, **kwargs):
|
||||||
|
"""Called before retrieving HTML"""
|
||||||
|
print("[HOOK] Preparing HTML retrieval")
|
||||||
|
|
||||||
|
# Scroll for lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
|
||||||
|
await page.wait_for_timeout(500)
|
||||||
|
await page.evaluate("window.scrollTo(0, 0);")
|
||||||
|
|
||||||
|
print("[HOOK] Scrolling complete")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def before_return_hook(page, context, html, **kwargs):
|
||||||
|
"""Called before returning HTML"""
|
||||||
|
print(f"[HOOK] HTML ready: {len(html)} chars")
|
||||||
|
|
||||||
|
metrics = await page.evaluate('''() => ({
|
||||||
|
images: document.images.length,
|
||||||
|
links: document.links.length,
|
||||||
|
scripts: document.scripts.length
|
||||||
|
})''')
|
||||||
|
|
||||||
|
print(f"[HOOK] Metrics - Images: {metrics['images']}, Links: {metrics['links']}")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
# --- Authentication Hooks ---
|
||||||
|
async def auth_context_hook(page, context, **kwargs):
|
||||||
|
"""Setup authentication context"""
|
||||||
|
print("[HOOK] Setting up authentication")
|
||||||
|
|
||||||
|
# Add auth cookies
|
||||||
|
await context.add_cookies([{
|
||||||
|
"name": "auth_token",
|
||||||
|
"value": "fake_jwt_token",
|
||||||
|
"domain": ".httpbin.org",
|
||||||
|
"path": "/",
|
||||||
|
"httpOnly": True
|
||||||
|
}])
|
||||||
|
|
||||||
|
# Set localStorage
|
||||||
|
await page.evaluate('''
|
||||||
|
localStorage.setItem('user_id', '12345');
|
||||||
|
localStorage.setItem('auth_time', new Date().toISOString());
|
||||||
|
''')
|
||||||
|
|
||||||
|
print("[HOOK] Auth context ready")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def auth_headers_hook(page, context, url, **kwargs):
|
||||||
|
"""Add authentication headers"""
|
||||||
|
print(f"[HOOK] Adding auth headers for {url}")
|
||||||
|
|
||||||
|
import base64
|
||||||
|
credentials = base64.b64encode(b"user:passwd").decode('ascii')
|
||||||
|
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
'Authorization': f'Basic {credentials}',
|
||||||
|
'X-API-Key': 'test-key-123'
|
||||||
|
})
|
||||||
|
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
# --- Performance Optimization Hooks ---
|
||||||
|
async def performance_hook(page, context, **kwargs):
|
||||||
|
"""Optimize page for performance"""
|
||||||
|
print("[HOOK] Optimizing for performance")
|
||||||
|
|
||||||
|
# Block resource-heavy content
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp,svg}", lambda r: r.abort())
|
||||||
|
await context.route("**/*.{woff,woff2,ttf}", lambda r: r.abort())
|
||||||
|
await context.route("**/*.{mp4,webm,ogg}", lambda r: r.abort())
|
||||||
|
await context.route("**/googletagmanager.com/*", lambda r: r.abort())
|
||||||
|
await context.route("**/google-analytics.com/*", lambda r: r.abort())
|
||||||
|
await context.route("**/facebook.com/*", lambda r: r.abort())
|
||||||
|
|
||||||
|
# Disable animations
|
||||||
|
await page.add_style_tag(content='''
|
||||||
|
*, *::before, *::after {
|
||||||
|
animation-duration: 0s !important;
|
||||||
|
transition-duration: 0s !important;
|
||||||
|
}
|
||||||
|
''')
|
||||||
|
|
||||||
|
print("[HOOK] Optimizations applied")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def cleanup_hook(page, context, **kwargs):
|
||||||
|
"""Clean page before extraction"""
|
||||||
|
print("[HOOK] Cleaning page")
|
||||||
|
|
||||||
|
await page.evaluate('''() => {
|
||||||
|
const selectors = [
|
||||||
|
'.ad', '.ads', '.advertisement',
|
||||||
|
'.popup', '.modal', '.overlay',
|
||||||
|
'.cookie-banner', '.newsletter'
|
||||||
|
];
|
||||||
|
|
||||||
|
selectors.forEach(sel => {
|
||||||
|
document.querySelectorAll(sel).forEach(el => el.remove());
|
||||||
|
});
|
||||||
|
|
||||||
|
document.querySelectorAll('script, style').forEach(el => el.remove());
|
||||||
|
}''')
|
||||||
|
|
||||||
|
print("[HOOK] Page cleaned")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
# --- Content Extraction Hooks ---
|
||||||
|
async def wait_dynamic_content_hook(page, context, url, response, **kwargs):
|
||||||
|
"""Wait for dynamic content to load"""
|
||||||
|
print(f"[HOOK] Waiting for dynamic content on {url}")
|
||||||
|
|
||||||
|
await page.wait_for_timeout(2000)
|
||||||
|
|
||||||
|
# Click "Load More" if exists
|
||||||
|
try:
|
||||||
|
load_more = await page.query_selector('[class*="load-more"], button:has-text("Load More")')
|
||||||
|
if load_more:
|
||||||
|
await load_more.click()
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
print("[HOOK] Clicked 'Load More'")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_metadata_hook(page, context, **kwargs):
|
||||||
|
"""Extract page metadata"""
|
||||||
|
print("[HOOK] Extracting metadata")
|
||||||
|
|
||||||
|
metadata = await page.evaluate('''() => {
|
||||||
|
const getMeta = (name) => {
|
||||||
|
const el = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
|
||||||
|
return el ? el.getAttribute('content') : null;
|
||||||
|
};
|
||||||
|
|
||||||
|
return {
|
||||||
|
title: document.title,
|
||||||
|
description: getMeta('description'),
|
||||||
|
author: getMeta('author'),
|
||||||
|
keywords: getMeta('keywords'),
|
||||||
|
};
|
||||||
|
}''')
|
||||||
|
|
||||||
|
print(f"[HOOK] Metadata: {metadata}")
|
||||||
|
|
||||||
|
# Infinite scroll
|
||||||
|
for i in range(3):
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
print(f"[HOOK] Scroll {i+1}/3")
|
||||||
|
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
# --- Multi-URL Hooks ---
|
||||||
|
async def url_specific_hook(page, context, url, **kwargs):
|
||||||
|
"""Apply URL-specific logic"""
|
||||||
|
print(f"[HOOK] Processing URL: {url}")
|
||||||
|
|
||||||
|
# URL-specific headers
|
||||||
|
if 'html' in url:
|
||||||
|
await page.set_extra_http_headers({"X-Type": "HTML"})
|
||||||
|
elif 'json' in url:
|
||||||
|
await page.set_extra_http_headers({"X-Type": "JSON"})
|
||||||
|
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def track_progress_hook(page, context, url, response, **kwargs):
|
||||||
|
"""Track crawl progress"""
|
||||||
|
status = response.status if response else 'unknown'
|
||||||
|
print(f"[HOOK] Loaded {url} - Status: {status}")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Test Functions
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def test_all_hooks_comprehensive():
|
||||||
|
"""Test all 8 hook types"""
|
||||||
|
print("=" * 70)
|
||||||
|
print("Test 1: All Hooks Comprehensive Demo (Docker Client)")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
|
||||||
|
print("\nCrawling with all 8 hooks...")
|
||||||
|
|
||||||
|
# Define hooks with function objects
|
||||||
|
hooks = {
|
||||||
|
"on_browser_created": browser_created_hook,
|
||||||
|
"on_page_context_created": page_context_hook,
|
||||||
|
"on_user_agent_updated": user_agent_hook,
|
||||||
|
"before_goto": before_goto_hook,
|
||||||
|
"after_goto": after_goto_hook,
|
||||||
|
"on_execution_started": execution_started_hook,
|
||||||
|
"before_retrieve_html": before_retrieve_hook,
|
||||||
|
"before_return_html": before_return_hook
|
||||||
|
}
|
||||||
|
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://httpbin.org/html"],
|
||||||
|
hooks=hooks,
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n✅ Success!")
|
||||||
|
print(f" URL: {result.url}")
|
||||||
|
print(f" Success: {result.success}")
|
||||||
|
print(f" HTML: {len(result.html)} chars")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_authentication_workflow():
|
||||||
|
"""Test authentication with hooks"""
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("Test 2: Authentication Workflow (Docker Client)")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
|
||||||
|
print("\nTesting authentication...")
|
||||||
|
|
||||||
|
hooks = {
|
||||||
|
"on_page_context_created": auth_context_hook,
|
||||||
|
"before_goto": auth_headers_hook
|
||||||
|
}
|
||||||
|
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://httpbin.org/basic-auth/user/passwd"],
|
||||||
|
hooks=hooks,
|
||||||
|
hooks_timeout=15
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n✅ Authentication completed")
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
if '"authenticated"' in result.html and 'true' in result.html:
|
||||||
|
print(" ✅ Basic auth successful!")
|
||||||
|
else:
|
||||||
|
print(" ⚠️ Auth status unclear")
|
||||||
|
else:
|
||||||
|
print(f" ❌ Failed: {result.error_message}")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_performance_optimization():
|
||||||
|
"""Test performance optimization"""
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("Test 3: Performance Optimization (Docker Client)")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
|
||||||
|
print("\nTesting performance hooks...")
|
||||||
|
|
||||||
|
hooks = {
|
||||||
|
"on_page_context_created": performance_hook,
|
||||||
|
"before_retrieve_html": cleanup_hook
|
||||||
|
}
|
||||||
|
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://httpbin.org/html"],
|
||||||
|
hooks=hooks,
|
||||||
|
hooks_timeout=10
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n✅ Optimization completed")
|
||||||
|
print(f" HTML size: {len(result.html):,} chars")
|
||||||
|
print(" Resources blocked, ads removed")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_content_extraction():
|
||||||
|
"""Test content extraction"""
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("Test 4: Content Extraction (Docker Client)")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
|
||||||
|
print("\nTesting extraction hooks...")
|
||||||
|
|
||||||
|
hooks = {
|
||||||
|
"after_goto": wait_dynamic_content_hook,
|
||||||
|
"before_retrieve_html": extract_metadata_hook
|
||||||
|
}
|
||||||
|
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://www.kidocode.com/"],
|
||||||
|
hooks=hooks,
|
||||||
|
hooks_timeout=20
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n✅ Extraction completed")
|
||||||
|
print(f" URL: {result.url}")
|
||||||
|
print(f" Success: {result.success}")
|
||||||
|
print(f" Metadata: {result.metadata}")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_multi_url_crawl():
|
||||||
|
"""Test hooks with multiple URLs"""
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("Test 5: Multi-URL Crawl (Docker Client)")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
|
||||||
|
print("\nCrawling multiple URLs...")
|
||||||
|
|
||||||
|
hooks = {
|
||||||
|
"before_goto": url_specific_hook,
|
||||||
|
"after_goto": track_progress_hook
|
||||||
|
}
|
||||||
|
|
||||||
|
results = await client.crawl(
|
||||||
|
[
|
||||||
|
"https://httpbin.org/html",
|
||||||
|
"https://httpbin.org/json",
|
||||||
|
"https://httpbin.org/xml"
|
||||||
|
],
|
||||||
|
hooks=hooks,
|
||||||
|
hooks_timeout=15
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n✅ Multi-URL crawl completed")
|
||||||
|
print(f"\n Crawled {len(results)} URLs:")
|
||||||
|
for i, result in enumerate(results, 1):
|
||||||
|
status = "✅" if result.success else "❌"
|
||||||
|
print(f" {status} {i}. {result.url}")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_reusable_hook_library():
|
||||||
|
"""Test using reusable hook library"""
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("Test 6: Reusable Hook Library (Docker Client)")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
# Create a library of reusable hooks
|
||||||
|
class HookLibrary:
|
||||||
|
@staticmethod
|
||||||
|
async def block_images(page, context, **kwargs):
|
||||||
|
"""Block all images"""
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif}", lambda r: r.abort())
|
||||||
|
print("[LIBRARY] Images blocked")
|
||||||
|
return page
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def block_analytics(page, context, **kwargs):
|
||||||
|
"""Block analytics"""
|
||||||
|
await context.route("**/analytics/*", lambda r: r.abort())
|
||||||
|
await context.route("**/google-analytics.com/*", lambda r: r.abort())
|
||||||
|
print("[LIBRARY] Analytics blocked")
|
||||||
|
return page
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def scroll_infinite(page, context, **kwargs):
|
||||||
|
"""Handle infinite scroll"""
|
||||||
|
for i in range(5):
|
||||||
|
prev = await page.evaluate("document.body.scrollHeight")
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
curr = await page.evaluate("document.body.scrollHeight")
|
||||||
|
if curr == prev:
|
||||||
|
break
|
||||||
|
print("[LIBRARY] Infinite scroll complete")
|
||||||
|
return page
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
|
||||||
|
print("\nUsing hook library...")
|
||||||
|
|
||||||
|
hooks = {
|
||||||
|
"on_page_context_created": HookLibrary.block_images,
|
||||||
|
"before_retrieve_html": HookLibrary.scroll_infinite
|
||||||
|
}
|
||||||
|
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://www.kidocode.com/"],
|
||||||
|
hooks=hooks,
|
||||||
|
hooks_timeout=20
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n✅ Library hooks completed")
|
||||||
|
print(f" Success: {result.success}")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Main
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
"""Run all Docker client hook examples"""
|
||||||
|
print("🔧 Crawl4AI Docker Client - Hooks Examples (Function-Based)")
|
||||||
|
print("Using Python function objects with automatic conversion")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
tests = [
|
||||||
|
("All Hooks Demo", test_all_hooks_comprehensive),
|
||||||
|
("Authentication", test_authentication_workflow),
|
||||||
|
("Performance", test_performance_optimization),
|
||||||
|
("Extraction", test_content_extraction),
|
||||||
|
("Multi-URL", test_multi_url_crawl),
|
||||||
|
("Hook Library", test_reusable_hook_library)
|
||||||
|
]
|
||||||
|
|
||||||
|
for i, (name, test_func) in enumerate(tests, 1):
|
||||||
|
try:
|
||||||
|
await test_func()
|
||||||
|
print(f"\n✅ Test {i}/{len(tests)}: {name} completed\n")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"\n❌ Test {i}/{len(tests)}: {name} failed: {e}\n")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
|
||||||
|
print("=" * 70)
|
||||||
|
print("🎉 All Docker client hook examples completed!")
|
||||||
|
print("\n💡 Key Benefits of Function-Based Hooks:")
|
||||||
|
print(" • Write as regular Python functions")
|
||||||
|
print(" • Full IDE support (autocomplete, types)")
|
||||||
|
print(" • Automatic conversion to API format")
|
||||||
|
print(" • Reusable across projects")
|
||||||
|
print(" • Clean, readable code")
|
||||||
|
print(" • Easy to test and debug")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
@@ -6,18 +6,6 @@
|
|||||||
- [Option 1: Using Pre-built Docker Hub Images (Recommended)](#option-1-using-pre-built-docker-hub-images-recommended)
|
- [Option 1: Using Pre-built Docker Hub Images (Recommended)](#option-1-using-pre-built-docker-hub-images-recommended)
|
||||||
- [Option 2: Using Docker Compose](#option-2-using-docker-compose)
|
- [Option 2: Using Docker Compose](#option-2-using-docker-compose)
|
||||||
- [Option 3: Manual Local Build & Run](#option-3-manual-local-build--run)
|
- [Option 3: Manual Local Build & Run](#option-3-manual-local-build--run)
|
||||||
- [Dockerfile Parameters](#dockerfile-parameters)
|
|
||||||
- [Using the API](#using-the-api)
|
|
||||||
- [Playground Interface](#playground-interface)
|
|
||||||
- [Python SDK](#python-sdk)
|
|
||||||
- [Understanding Request Schema](#understanding-request-schema)
|
|
||||||
- [REST API Examples](#rest-api-examples)
|
|
||||||
- [Additional API Endpoints](#additional-api-endpoints)
|
|
||||||
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
|
||||||
- [Screenshot Endpoint](#screenshot-endpoint)
|
|
||||||
- [PDF Export Endpoint](#pdf-export-endpoint)
|
|
||||||
- [JavaScript Execution Endpoint](#javascript-execution-endpoint)
|
|
||||||
- [Library Context Endpoint](#library-context-endpoint)
|
|
||||||
- [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
|
- [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
|
||||||
- [What is MCP?](#what-is-mcp)
|
- [What is MCP?](#what-is-mcp)
|
||||||
- [Connecting via MCP](#connecting-via-mcp)
|
- [Connecting via MCP](#connecting-via-mcp)
|
||||||
@@ -25,9 +13,28 @@
|
|||||||
- [Available MCP Tools](#available-mcp-tools)
|
- [Available MCP Tools](#available-mcp-tools)
|
||||||
- [Testing MCP Connections](#testing-mcp-connections)
|
- [Testing MCP Connections](#testing-mcp-connections)
|
||||||
- [MCP Schemas](#mcp-schemas)
|
- [MCP Schemas](#mcp-schemas)
|
||||||
|
- [Additional API Endpoints](#additional-api-endpoints)
|
||||||
|
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
||||||
|
- [Screenshot Endpoint](#screenshot-endpoint)
|
||||||
|
- [PDF Export Endpoint](#pdf-export-endpoint)
|
||||||
|
- [JavaScript Execution Endpoint](#javascript-execution-endpoint)
|
||||||
|
- [User-Provided Hooks API](#user-provided-hooks-api)
|
||||||
|
- [Hook Information Endpoint](#hook-information-endpoint)
|
||||||
|
- [Available Hook Points](#available-hook-points)
|
||||||
|
- [Using Hooks in Requests](#using-hooks-in-requests)
|
||||||
|
- [Hook Examples with Real URLs](#hook-examples-with-real-urls)
|
||||||
|
- [Security Best Practices](#security-best-practices)
|
||||||
|
- [Hook Response Information](#hook-response-information)
|
||||||
|
- [Error Handling](#error-handling)
|
||||||
|
- [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
|
||||||
|
- [Dockerfile Parameters](#dockerfile-parameters)
|
||||||
|
- [Using the API](#using-the-api)
|
||||||
|
- [Playground Interface](#playground-interface)
|
||||||
|
- [Python SDK](#python-sdk)
|
||||||
|
- [Understanding Request Schema](#understanding-request-schema)
|
||||||
|
- [REST API Examples](#rest-api-examples)
|
||||||
|
- [LLM Configuration Examples](#llm-configuration-examples)
|
||||||
- [Metrics & Monitoring](#metrics--monitoring)
|
- [Metrics & Monitoring](#metrics--monitoring)
|
||||||
- [Deployment Scenarios](#deployment-scenarios)
|
|
||||||
- [Complete Examples](#complete-examples)
|
|
||||||
- [Server Configuration](#server-configuration)
|
- [Server Configuration](#server-configuration)
|
||||||
- [Understanding config.yml](#understanding-configyml)
|
- [Understanding config.yml](#understanding-configyml)
|
||||||
- [JWT Authentication](#jwt-authentication)
|
- [JWT Authentication](#jwt-authentication)
|
||||||
@@ -832,6 +839,275 @@ else:
|
|||||||
|
|
||||||
> 💡 **Remember**: Always test your hooks on safe, known websites first before using them on production sites. Never crawl sites that you don't have permission to access or that might be malicious.
|
> 💡 **Remember**: Always test your hooks on safe, known websites first before using them on production sites. Never crawl sites that you don't have permission to access or that might be malicious.
|
||||||
|
|
||||||
|
### Hooks Utility: Function-Based Approach (Python)
|
||||||
|
|
||||||
|
For Python developers, Crawl4AI provides a more convenient way to work with hooks using the `hooks_to_string()` utility function and Docker client integration.
|
||||||
|
|
||||||
|
#### Why Use Function-Based Hooks?
|
||||||
|
|
||||||
|
**String-Based Approach (shown above)**:
|
||||||
|
```python
|
||||||
|
hooks_code = {
|
||||||
|
"on_page_context_created": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Function-Based Approach (recommended for Python)**:
|
||||||
|
```python
|
||||||
|
from crawl4ai import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
async def my_hook(page, context, **kwargs):
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://example.com"],
|
||||||
|
hooks={"on_page_context_created": my_hook}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits**:
|
||||||
|
- ✅ Write hooks as regular Python functions
|
||||||
|
- ✅ Full IDE support (autocomplete, syntax highlighting, type checking)
|
||||||
|
- ✅ Easy to test and debug
|
||||||
|
- ✅ Reusable hook libraries
|
||||||
|
- ✅ Automatic conversion to API format
|
||||||
|
|
||||||
|
#### Using the Hooks Utility
|
||||||
|
|
||||||
|
The `hooks_to_string()` utility converts Python function objects to the string format required by the API:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import hooks_to_string
|
||||||
|
|
||||||
|
# Define your hooks as functions
|
||||||
|
async def setup_hook(page, context, **kwargs):
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
await context.add_cookies([{
|
||||||
|
"name": "session",
|
||||||
|
"value": "token",
|
||||||
|
"domain": ".example.com"
|
||||||
|
}])
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def scroll_hook(page, context, **kwargs):
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Convert to string format
|
||||||
|
hooks_dict = {
|
||||||
|
"on_page_context_created": setup_hook,
|
||||||
|
"before_retrieve_html": scroll_hook
|
||||||
|
}
|
||||||
|
hooks_string = hooks_to_string(hooks_dict)
|
||||||
|
|
||||||
|
# Now use with REST API or Docker client
|
||||||
|
# hooks_string contains the string representations
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Docker Client with Automatic Conversion
|
||||||
|
|
||||||
|
The Docker client automatically detects and converts function objects:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
async def auth_hook(page, context, **kwargs):
|
||||||
|
"""Add authentication cookies"""
|
||||||
|
await context.add_cookies([{
|
||||||
|
"name": "auth_token",
|
||||||
|
"value": "your_token",
|
||||||
|
"domain": ".example.com"
|
||||||
|
}])
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def performance_hook(page, context, **kwargs):
|
||||||
|
"""Block unnecessary resources"""
|
||||||
|
await context.route("**/*.{png,jpg,gif}", lambda r: r.abort())
|
||||||
|
await context.route("**/analytics/*", lambda r: r.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
|
||||||
|
# Pass functions directly - automatic conversion!
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://example.com"],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": performance_hook,
|
||||||
|
"before_goto": auth_hook
|
||||||
|
},
|
||||||
|
hooks_timeout=30 # Optional timeout in seconds (1-120)
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"Success: {result.success}")
|
||||||
|
print(f"HTML: {len(result.html)} chars")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Creating Reusable Hook Libraries
|
||||||
|
|
||||||
|
Build collections of reusable hooks:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# hooks_library.py
|
||||||
|
class CrawlHooks:
|
||||||
|
"""Reusable hook collection for common crawling tasks"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def block_images(page, context, **kwargs):
|
||||||
|
"""Block all images to speed up crawling"""
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda r: r.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def block_analytics(page, context, **kwargs):
|
||||||
|
"""Block analytics and tracking scripts"""
|
||||||
|
tracking_domains = [
|
||||||
|
"**/google-analytics.com/*",
|
||||||
|
"**/googletagmanager.com/*",
|
||||||
|
"**/facebook.com/tr/*",
|
||||||
|
"**/doubleclick.net/*"
|
||||||
|
]
|
||||||
|
for domain in tracking_domains:
|
||||||
|
await context.route(domain, lambda r: r.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def scroll_infinite(page, context, **kwargs):
|
||||||
|
"""Handle infinite scroll to load more content"""
|
||||||
|
previous_height = 0
|
||||||
|
for i in range(5): # Max 5 scrolls
|
||||||
|
current_height = await page.evaluate("document.body.scrollHeight")
|
||||||
|
if current_height == previous_height:
|
||||||
|
break
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
previous_height = current_height
|
||||||
|
return page
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def wait_for_dynamic_content(page, context, url, response, **kwargs):
|
||||||
|
"""Wait for dynamic content to load"""
|
||||||
|
await page.wait_for_timeout(2000)
|
||||||
|
try:
|
||||||
|
# Click "Load More" if present
|
||||||
|
load_more = await page.query_selector('[class*="load-more"]')
|
||||||
|
if load_more:
|
||||||
|
await load_more.click()
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Use in your application
|
||||||
|
from hooks_library import CrawlHooks
|
||||||
|
from crawl4ai import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
async def crawl_with_optimizations(url):
|
||||||
|
async with Crawl4aiDockerClient() as client:
|
||||||
|
result = await client.crawl(
|
||||||
|
[url],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": CrawlHooks.block_images,
|
||||||
|
"before_retrieve_html": CrawlHooks.scroll_infinite
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Choosing the Right Approach
|
||||||
|
|
||||||
|
| Approach | Best For | IDE Support | Language |
|
||||||
|
|----------|----------|-------------|----------|
|
||||||
|
| **String-based** | Non-Python clients, REST APIs, other languages | ❌ None | Any |
|
||||||
|
| **Function-based** | Python applications, local development | ✅ Full | Python only |
|
||||||
|
| **Docker Client** | Python apps with automatic conversion | ✅ Full | Python only |
|
||||||
|
|
||||||
|
**Recommendation**:
|
||||||
|
- **Python applications**: Use the Docker client with function objects (easiest)
|
||||||
|
- **Non-Python or REST API**: Use string-based hooks (most flexible)
|
||||||
|
- **Manual control**: Use the `hooks_to_string()` utility (middle ground)
|
||||||
|
|
||||||
|
#### Complete Example with Function Hooks
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import Crawl4aiDockerClient, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
# Define hooks as regular Python functions
|
||||||
|
async def setup_environment(page, context, **kwargs):
|
||||||
|
"""Setup crawling environment"""
|
||||||
|
# Set viewport
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
|
||||||
|
# Block resources for speed
|
||||||
|
await context.route("**/*.{png,jpg,gif}", lambda r: r.abort())
|
||||||
|
|
||||||
|
# Add custom headers
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
"Accept-Language": "en-US",
|
||||||
|
"X-Custom-Header": "Crawl4AI"
|
||||||
|
})
|
||||||
|
|
||||||
|
print("[HOOK] Environment configured")
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def extract_content(page, context, **kwargs):
|
||||||
|
"""Extract and prepare content"""
|
||||||
|
# Scroll to load lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
|
||||||
|
# Extract metadata
|
||||||
|
metadata = await page.evaluate('''() => ({
|
||||||
|
title: document.title,
|
||||||
|
links: document.links.length,
|
||||||
|
images: document.images.length
|
||||||
|
})''')
|
||||||
|
|
||||||
|
print(f"[HOOK] Page metadata: {metadata}")
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
|
||||||
|
# Configure crawl
|
||||||
|
browser_config = BrowserConfig(headless=True)
|
||||||
|
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
# Crawl with hooks
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://httpbin.org/html"],
|
||||||
|
browser_config=browser_config,
|
||||||
|
crawler_config=crawler_config,
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": setup_environment,
|
||||||
|
"before_retrieve_html": extract_content
|
||||||
|
},
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
print(f"✅ Crawl successful!")
|
||||||
|
print(f" URL: {result.url}")
|
||||||
|
print(f" HTML: {len(result.html)} chars")
|
||||||
|
print(f" Markdown: {len(result.markdown)} chars")
|
||||||
|
else:
|
||||||
|
print(f"❌ Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import asyncio
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Additional Resources
|
||||||
|
|
||||||
|
- **Comprehensive Examples**: See `/docs/examples/hooks_docker_client_example.py` for Python function-based examples
|
||||||
|
- **REST API Examples**: See `/docs/examples/hooks_rest_api_example.py` for string-based examples
|
||||||
|
- **Comparison Guide**: See `/docs/examples/README_HOOKS.md` for detailed comparison
|
||||||
|
- **Utility Documentation**: See `/docs/hooks-utility-guide.md` for complete guide
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Dockerfile Parameters
|
## Dockerfile Parameters
|
||||||
@@ -892,10 +1168,12 @@ This is the easiest way to translate Python configuration to JSON requests when
|
|||||||
|
|
||||||
Install the SDK: `pip install crawl4ai`
|
Install the SDK: `pip install crawl4ai`
|
||||||
|
|
||||||
|
The Python SDK provides a convenient way to interact with the Docker API, including **automatic hook conversion** when using function objects.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
|
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# Point to the correct server port
|
# Point to the correct server port
|
||||||
@@ -907,23 +1185,22 @@ async def main():
|
|||||||
print("--- Running Non-Streaming Crawl ---")
|
print("--- Running Non-Streaming Crawl ---")
|
||||||
results = await client.crawl(
|
results = await client.crawl(
|
||||||
["https://httpbin.org/html"],
|
["https://httpbin.org/html"],
|
||||||
browser_config=BrowserConfig(headless=True), # Use library classes for config aid
|
browser_config=BrowserConfig(headless=True),
|
||||||
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
)
|
)
|
||||||
if results: # client.crawl returns None on failure
|
if results:
|
||||||
print(f"Non-streaming results success: {results.success}")
|
print(f"Non-streaming results success: {results.success}")
|
||||||
if results.success:
|
if results.success:
|
||||||
for result in results: # Iterate through the CrawlResultContainer
|
for result in results:
|
||||||
print(f"URL: {result.url}, Success: {result.success}")
|
print(f"URL: {result.url}, Success: {result.success}")
|
||||||
else:
|
else:
|
||||||
print("Non-streaming crawl failed.")
|
print("Non-streaming crawl failed.")
|
||||||
|
|
||||||
|
|
||||||
# Example Streaming crawl
|
# Example Streaming crawl
|
||||||
print("\n--- Running Streaming Crawl ---")
|
print("\n--- Running Streaming Crawl ---")
|
||||||
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
|
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
|
||||||
try:
|
try:
|
||||||
async for result in await client.crawl( # client.crawl returns an async generator for streaming
|
async for result in await client.crawl(
|
||||||
["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
|
["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
|
||||||
browser_config=BrowserConfig(headless=True),
|
browser_config=BrowserConfig(headless=True),
|
||||||
crawler_config=stream_config
|
crawler_config=stream_config
|
||||||
@@ -932,17 +1209,56 @@ async def main():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Streaming crawl failed: {e}")
|
print(f"Streaming crawl failed: {e}")
|
||||||
|
|
||||||
|
# Example with hooks (Python function objects)
|
||||||
|
print("\n--- Crawl with Hooks ---")
|
||||||
|
|
||||||
|
async def my_hook(page, context, **kwargs):
|
||||||
|
"""Custom hook to optimize performance"""
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
await context.route("**/*.{png,jpg}", lambda r: r.abort())
|
||||||
|
print("[HOOK] Page optimized")
|
||||||
|
return page
|
||||||
|
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://httpbin.org/html"],
|
||||||
|
browser_config=BrowserConfig(headless=True),
|
||||||
|
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
|
||||||
|
hooks={"on_page_context_created": my_hook}, # Pass function directly!
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
print(f"Crawl with hooks success: {result.success}")
|
||||||
|
|
||||||
# Example Get schema
|
# Example Get schema
|
||||||
print("\n--- Getting Schema ---")
|
print("\n--- Getting Schema ---")
|
||||||
schema = await client.get_schema()
|
schema = await client.get_schema()
|
||||||
print(f"Schema received: {bool(schema)}") # Print whether schema was received
|
print(f"Schema received: {bool(schema)}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
*(SDK parameters like timeout, verify_ssl etc. remain the same)*
|
#### SDK Parameters
|
||||||
|
|
||||||
|
The Docker client supports the following parameters:
|
||||||
|
|
||||||
|
**Client Initialization**:
|
||||||
|
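- `base_url` (str): URL of the Docker server (default: `http://localhost:8000`). The examples in this guide target a server at `http://localhost:11235`, so pass `base_url` explicitly when your server does not listen on port 8000.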
|
||||||
|
- `timeout` (float): Request timeout in seconds (default: 30.0)
|
||||||
|
- `verify_ssl` (bool): Verify SSL certificates (default: True)
|
||||||
|
- `verbose` (bool): Enable verbose logging (default: True)
|
||||||
|
- `log_file` (Optional[str]): Path to log file (default: None)
|
||||||
|
|
||||||
|
**crawl() Method**:
|
||||||
|
- `urls` (List[str]): List of URLs to crawl
|
||||||
|
- `browser_config` (Optional[BrowserConfig]): Browser configuration
|
||||||
|
- `crawler_config` (Optional[CrawlerRunConfig]): Crawler configuration
|
||||||
|
- `hooks` (Optional[Dict]): Mapping of hook point name (e.g. `"on_page_context_created"`) to either a Python function or a hook source string - **function objects are converted automatically!**
|
||||||
|
- `hooks_timeout` (int): Timeout for each hook execution in seconds (default: 30; valid range: 1-120)
|
||||||
|
|
||||||
|
**Returns**:
|
||||||
|
- Single URL: `CrawlResult` object
|
||||||
|
- Multiple URLs: `List[CrawlResult]`
|
||||||
|
- Streaming (`CrawlerRunConfig(stream=True)`): `AsyncGenerator[CrawlResult]`, consumed with `async for`
|
||||||
|
|
||||||
### Second Approach: Direct API Calls
|
### Second Approach: Direct API Calls
|
||||||
|
|
||||||
@@ -1352,19 +1668,40 @@ We're here to help you succeed with Crawl4AI! Here's how to get support:
|
|||||||
|
|
||||||
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
||||||
- Building and running the Docker container
|
- Building and running the Docker container
|
||||||
- Configuring the environment
|
- Configuring the environment
|
||||||
- Using the interactive playground for testing
|
- Using the interactive playground for testing
|
||||||
- Making API requests with proper typing
|
- Making API requests with proper typing
|
||||||
- Using the Python SDK
|
- Using the Python SDK with **automatic hook conversion**
|
||||||
|
- **Working with hooks** - both string-based (REST API) and function-based (Python SDK)
|
||||||
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
||||||
- Connecting via the Model Context Protocol (MCP)
|
- Connecting via the Model Context Protocol (MCP)
|
||||||
- Monitoring your deployment
|
- Monitoring your deployment
|
||||||
|
|
||||||
The new playground interface at `http://localhost:11235/playground` makes it much easier to test configurations and generate the corresponding JSON for API requests.
|
### Key Features
|
||||||
|
|
||||||
For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
|
**Hooks Support**: Crawl4AI offers two approaches for working with hooks:
|
||||||
|
- **String-based** (REST API): Works with any language, requires manual string formatting
|
||||||
|
- **Function-based** (Python SDK): Write hooks as regular Python functions with full IDE support and automatic conversion
|
||||||
|
|
||||||
Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
|
**Playground Interface**: The built-in playground at `http://localhost:11235/playground` makes it easy to test configurations and generate the corresponding JSON for API requests.
|
||||||
|
|
||||||
|
**MCP Integration**: For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
|
||||||
|
|
||||||
|
### Next Steps
|
||||||
|
|
||||||
|
1. **Explore Examples**: Check out the comprehensive examples in:
|
||||||
|
- `/docs/examples/hooks_docker_client_example.py` - Python function-based hooks
|
||||||
|
- `/docs/examples/hooks_rest_api_example.py` - REST API string-based hooks
|
||||||
|
- `/docs/examples/README_HOOKS.md` - Comparison and guide
|
||||||
|
|
||||||
|
2. **Read Documentation**:
|
||||||
|
- `/docs/hooks-utility-guide.md` - Complete hooks utility guide
|
||||||
|
- API documentation for detailed configuration options
|
||||||
|
|
||||||
|
3. **Join the Community**:
|
||||||
|
- GitHub: Report issues and contribute
|
||||||
|
- Discord: Get help and share your experiences
|
||||||
|
- Documentation: Comprehensive guides and tutorials
|
||||||
|
|
||||||
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
|
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
|
||||||
|
|
||||||
|
|||||||
@@ -30,10 +30,10 @@ const { API_BASE, API_ORIGIN } = (() => {
|
|||||||
|
|
||||||
if (origin) {
|
if (origin) {
|
||||||
const normalized = cleanOrigin(origin);
|
const normalized = cleanOrigin(origin);
|
||||||
return { API_BASE: `${normalized}/api`, API_ORIGIN: normalized };
|
return { API_BASE: `${normalized}/marketplace/api`, API_ORIGIN: normalized };
|
||||||
}
|
}
|
||||||
|
|
||||||
return { API_BASE: '/api', API_ORIGIN: '' };
|
return { API_BASE: '/marketplace/api', API_ORIGIN: '' };
|
||||||
})();
|
})();
|
||||||
|
|
||||||
const resolveAssetUrl = (path) => {
|
const resolveAssetUrl = (path) => {
|
||||||
|
|||||||
@@ -210,6 +210,6 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script src="admin.js?v=1759334000"></script>
|
<script src="admin.js?v=1759335000"></script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
@@ -197,6 +197,41 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Navigation Tabs */
|
/* Navigation Tabs */
|
||||||
|
.tabs {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: row;
|
||||||
|
gap: 0;
|
||||||
|
border-bottom: 2px solid var(--border-color);
|
||||||
|
margin-bottom: 0;
|
||||||
|
background: var(--bg-tertiary);
|
||||||
|
}
|
||||||
|
|
||||||
|
.tab-btn {
|
||||||
|
padding: 1rem 2rem;
|
||||||
|
background: transparent;
|
||||||
|
border: none;
|
||||||
|
border-bottom: 3px solid transparent;
|
||||||
|
color: var(--text-secondary);
|
||||||
|
cursor: pointer;
|
||||||
|
transition: all 0.2s;
|
||||||
|
font-family: inherit;
|
||||||
|
font-size: 0.95rem;
|
||||||
|
margin-bottom: -2px;
|
||||||
|
white-space: nowrap;
|
||||||
|
font-weight: 500;
|
||||||
|
}
|
||||||
|
|
||||||
|
.tab-btn:hover {
|
||||||
|
color: var(--primary-cyan);
|
||||||
|
background: rgba(80, 255, 255, 0.05);
|
||||||
|
}
|
||||||
|
|
||||||
|
.tab-btn.active {
|
||||||
|
color: var(--primary-cyan);
|
||||||
|
border-bottom-color: var(--primary-cyan);
|
||||||
|
background: var(--bg-secondary);
|
||||||
|
}
|
||||||
|
|
||||||
.app-nav {
|
.app-nav {
|
||||||
max-width: 1800px;
|
max-width: 1800px;
|
||||||
margin: 2rem auto 0;
|
margin: 2rem auto 0;
|
||||||
@@ -228,34 +263,167 @@
|
|||||||
border-bottom-color: var(--primary-cyan);
|
border-bottom-color: var(--primary-cyan);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Content Sections */
|
/* Main Content Wrapper */
|
||||||
.app-content {
|
.app-main {
|
||||||
max-width: 1800px;
|
max-width: 1800px;
|
||||||
margin: 2rem auto;
|
margin: 2rem auto;
|
||||||
padding: 0 2rem;
|
padding: 0 2rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Content Sections */
|
||||||
|
.app-content {
|
||||||
|
background: var(--bg-secondary);
|
||||||
|
border: 1px solid var(--border-color);
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
|
||||||
.tab-content {
|
.tab-content {
|
||||||
display: none;
|
display: none;
|
||||||
|
padding: 2rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.tab-content.active {
|
.tab-content.active {
|
||||||
display: block;
|
display: block;
|
||||||
}
|
}
|
||||||
|
|
||||||
.docs-content {
|
/* Overview Layout */
|
||||||
max-width: 1200px;
|
.overview-columns {
|
||||||
padding: 2rem;
|
display: grid;
|
||||||
|
grid-template-columns: 2fr 1fr;
|
||||||
|
gap: 2rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.overview-main h2, .overview-main h3 {
|
||||||
|
color: var(--primary-cyan);
|
||||||
|
margin-top: 2rem;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.overview-main h2:first-child {
|
||||||
|
margin-top: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.overview-main h2 {
|
||||||
|
font-size: 1.8rem;
|
||||||
|
border-bottom: 2px solid var(--border-color);
|
||||||
|
padding-bottom: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.overview-main h3 {
|
||||||
|
font-size: 1.3rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.features-list {
|
||||||
|
list-style: none;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.features-list li {
|
||||||
|
padding: 0.5rem 0;
|
||||||
|
padding-left: 1.5rem;
|
||||||
|
position: relative;
|
||||||
|
color: var(--text-secondary);
|
||||||
|
}
|
||||||
|
|
||||||
|
.features-list li:before {
|
||||||
|
content: "▸";
|
||||||
|
position: absolute;
|
||||||
|
left: 0;
|
||||||
|
color: var(--primary-cyan);
|
||||||
|
}
|
||||||
|
|
||||||
|
.use-cases p {
|
||||||
|
color: var(--text-secondary);
|
||||||
|
line-height: 1.6;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Sidebar */
|
||||||
|
.sidebar {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.sidebar-card {
|
||||||
background: var(--bg-secondary);
|
background: var(--bg-secondary);
|
||||||
border: 1px solid var(--border-color);
|
border: 1px solid var(--border-color);
|
||||||
|
padding: 1.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.sidebar-card h3 {
|
||||||
|
font-size: 1.1rem;
|
||||||
|
color: var(--primary-cyan);
|
||||||
|
margin: 0 0 1rem 0;
|
||||||
|
border-bottom: 1px solid var(--border-color);
|
||||||
|
padding-bottom: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.stats-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: 1fr 1fr;
|
||||||
|
gap: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.stats-grid > div {
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metadata {
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metadata div {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
padding: 0.75rem 0;
|
||||||
|
border-bottom: 1px solid var(--border-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.metadata dt {
|
||||||
|
color: var(--text-tertiary);
|
||||||
|
font-weight: normal;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metadata dd {
|
||||||
|
color: var(--text-primary);
|
||||||
|
margin: 0;
|
||||||
|
font-weight: 600;
|
||||||
|
}
|
||||||
|
|
||||||
|
.sidebar-card p {
|
||||||
|
color: var(--text-secondary);
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Integration Content */
|
||||||
|
.integration-content {
|
||||||
|
max-width: 100%;
|
||||||
|
}
|
||||||
|
|
||||||
|
.integration-content h2 {
|
||||||
|
font-size: 1.8rem;
|
||||||
|
color: var(--primary-cyan);
|
||||||
|
margin: 0 0 2rem 0;
|
||||||
|
padding-bottom: 0.5rem;
|
||||||
|
border-bottom: 2px solid var(--border-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.integration-content h3 {
|
||||||
|
font-size: 1.3rem;
|
||||||
|
color: var(--text-primary);
|
||||||
|
margin: 2rem 0 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.docs-content {
|
||||||
|
max-width: 100%;
|
||||||
}
|
}
|
||||||
|
|
||||||
.docs-content h2 {
|
.docs-content h2 {
|
||||||
font-size: 1.8rem;
|
font-size: 1.8rem;
|
||||||
color: var(--primary-cyan);
|
color: var(--primary-cyan);
|
||||||
margin-bottom: 1rem;
|
margin: 0 0 1.5rem 0;
|
||||||
padding-bottom: 0.5rem;
|
padding-bottom: 0.5rem;
|
||||||
border-bottom: 1px solid var(--border-color);
|
border-bottom: 2px solid var(--border-color);
|
||||||
}
|
}
|
||||||
|
|
||||||
.docs-content h3 {
|
.docs-content h3 {
|
||||||
@@ -290,6 +458,7 @@
|
|||||||
border: 1px solid var(--border-color);
|
border: 1px solid var(--border-color);
|
||||||
margin: 1rem 0;
|
margin: 1rem 0;
|
||||||
overflow: hidden;
|
overflow: hidden;
|
||||||
|
position: relative;
|
||||||
}
|
}
|
||||||
|
|
||||||
.code-header {
|
.code-header {
|
||||||
@@ -308,18 +477,23 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
.copy-btn {
|
.copy-btn {
|
||||||
padding: 0.25rem 0.5rem;
|
position: absolute;
|
||||||
background: transparent;
|
top: 0.5rem;
|
||||||
|
right: 0.5rem;
|
||||||
|
padding: 0.4rem 0.8rem;
|
||||||
|
background: var(--bg-tertiary);
|
||||||
border: 1px solid var(--border-color);
|
border: 1px solid var(--border-color);
|
||||||
color: var(--text-secondary);
|
color: var(--text-secondary);
|
||||||
cursor: pointer;
|
cursor: pointer;
|
||||||
font-size: 0.75rem;
|
font-size: 0.75rem;
|
||||||
transition: all 0.2s;
|
transition: all 0.2s;
|
||||||
|
z-index: 10;
|
||||||
}
|
}
|
||||||
|
|
||||||
.copy-btn:hover {
|
.copy-btn:hover {
|
||||||
border-color: var(--primary-cyan);
|
border-color: var(--primary-cyan);
|
||||||
color: var(--primary-cyan);
|
color: var(--primary-cyan);
|
||||||
|
background: var(--bg-secondary);
|
||||||
}
|
}
|
||||||
|
|
||||||
.code-block pre {
|
.code-block pre {
|
||||||
@@ -435,6 +609,10 @@
|
|||||||
.app-stats {
|
.app-stats {
|
||||||
justify-content: space-around;
|
justify-content: space-around;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.overview-columns {
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@media (max-width: 768px) {
|
@media (max-width: 768px) {
|
||||||
@@ -446,6 +624,16 @@
|
|||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.tabs {
|
||||||
|
overflow-x: auto;
|
||||||
|
-webkit-overflow-scrolling: touch;
|
||||||
|
}
|
||||||
|
|
||||||
|
.tab-btn {
|
||||||
|
padding: 0.75rem 1.5rem;
|
||||||
|
font-size: 0.875rem;
|
||||||
|
}
|
||||||
|
|
||||||
.app-nav {
|
.app-nav {
|
||||||
overflow-x: auto;
|
overflow-x: auto;
|
||||||
gap: 0;
|
gap: 0;
|
||||||
@@ -459,4 +647,12 @@
|
|||||||
.support-grid {
|
.support-grid {
|
||||||
grid-template-columns: 1fr;
|
grid-template-columns: 1fr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.tab-content {
|
||||||
|
padding: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.app-main {
|
||||||
|
padding: 0 1rem;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
209
docs/md_v2/marketplace/app-detail.html
Normal file
209
docs/md_v2/marketplace/app-detail.html
Normal file
@@ -0,0 +1,209 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en" data-theme="dark">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>App Details - Crawl4AI Marketplace</title>
|
||||||
|
<link rel="stylesheet" href="marketplace.css">
|
||||||
|
<link rel="stylesheet" href="app-detail.css">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="app-detail-container">
|
||||||
|
<!-- Header -->
|
||||||
|
<header class="marketplace-header">
|
||||||
|
<div class="header-content">
|
||||||
|
<div class="header-left">
|
||||||
|
<div class="logo-title">
|
||||||
|
<img src="../assets/images/logo.png" alt="Crawl4AI" class="header-logo">
|
||||||
|
<h1>
|
||||||
|
<span class="ascii-border">[</span>
|
||||||
|
Marketplace
|
||||||
|
<span class="ascii-border">]</span>
|
||||||
|
</h1>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="header-nav">
|
||||||
|
<a href="index.html" class="back-btn">← Back to Marketplace</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<!-- App Hero Section -->
|
||||||
|
<section class="app-hero">
|
||||||
|
<div class="app-hero-content">
|
||||||
|
<div class="app-hero-image" id="app-image">
|
||||||
|
<!-- Dynamic image -->
|
||||||
|
</div>
|
||||||
|
<div class="app-hero-info">
|
||||||
|
<div class="app-badges">
|
||||||
|
<span class="app-badge" id="app-type">Open Source</span>
|
||||||
|
<span class="app-badge featured" id="app-featured" style="display:none">FEATURED</span>
|
||||||
|
<span class="app-badge sponsored" id="app-sponsored" style="display:none">SPONSORED</span>
|
||||||
|
</div>
|
||||||
|
<h1 id="app-name">App Name</h1>
|
||||||
|
<p id="app-description" class="app-tagline">App description goes here</p>
|
||||||
|
|
||||||
|
<div class="app-stats">
|
||||||
|
<div class="stat">
|
||||||
|
<span class="stat-value" id="app-rating">★★★★★</span>
|
||||||
|
<span class="stat-label">Rating</span>
|
||||||
|
</div>
|
||||||
|
<div class="stat">
|
||||||
|
<span class="stat-value" id="app-downloads">0</span>
|
||||||
|
<span class="stat-label">Downloads</span>
|
||||||
|
</div>
|
||||||
|
<div class="stat">
|
||||||
|
<span class="stat-value" id="app-category">Category</span>
|
||||||
|
<span class="stat-label">Category</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="app-actions">
|
||||||
|
<a href="#" id="app-website" class="action-btn primary" target="_blank">Visit Website</a>
|
||||||
|
<a href="#" id="app-github" class="action-btn" target="_blank">View GitHub</a>
|
||||||
|
<a href="#" id="app-demo" class="action-btn" target="_blank" style="display:none">Live Demo</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- App Details Section -->
|
||||||
|
<main class="app-main">
|
||||||
|
<div class="app-content">
|
||||||
|
<div class="tabs">
|
||||||
|
<button class="tab-btn active" data-tab="overview">Overview</button>
|
||||||
|
<button class="tab-btn" data-tab="integration">Integration</button>
|
||||||
|
<button class="tab-btn" data-tab="docs">Documentation</button>
|
||||||
|
<button class="tab-btn" data-tab="support">Support</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<section id="overview-tab" class="tab-content active">
|
||||||
|
<div class="overview-columns">
|
||||||
|
<div class="overview-main">
|
||||||
|
<h2>Overview</h2>
|
||||||
|
<div id="app-overview">Overview content goes here.</div>
|
||||||
|
|
||||||
|
<h3>Key Features</h3>
|
||||||
|
<ul id="app-features" class="features-list">
|
||||||
|
<li>Feature 1</li>
|
||||||
|
<li>Feature 2</li>
|
||||||
|
<li>Feature 3</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<h3>Use Cases</h3>
|
||||||
|
<div id="app-use-cases" class="use-cases">
|
||||||
|
<p>Describe how this app can help your workflow.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<aside class="sidebar">
|
||||||
|
<div class="sidebar-card">
|
||||||
|
<h3>Download Stats</h3>
|
||||||
|
<div class="stats-grid">
|
||||||
|
<div>
|
||||||
|
<span class="stat-value" id="sidebar-downloads">0</span>
|
||||||
|
<span class="stat-label">Downloads</span>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<span class="stat-value" id="sidebar-rating">0.0</span>
|
||||||
|
<span class="stat-label">Rating</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="sidebar-card">
|
||||||
|
<h3>App Metadata</h3>
|
||||||
|
<dl class="metadata">
|
||||||
|
<div>
|
||||||
|
<dt>Category</dt>
|
||||||
|
<dd id="sidebar-category">-</dd>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<dt>Type</dt>
|
||||||
|
<dd id="sidebar-type">-</dd>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<dt>Status</dt>
|
||||||
|
<dd id="sidebar-status">Active</dd>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<dt>Pricing</dt>
|
||||||
|
<dd id="sidebar-pricing">-</dd>
|
||||||
|
</div>
|
||||||
|
</dl>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="sidebar-card">
|
||||||
|
<h3>Contact</h3>
|
||||||
|
<p id="sidebar-contact">contact@example.com</p>
|
||||||
|
</div>
|
||||||
|
</aside>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section id="integration-tab" class="tab-content">
|
||||||
|
<div class="integration-content">
|
||||||
|
<h2>Integration Guide</h2>
|
||||||
|
|
||||||
|
<h3>Installation</h3>
|
||||||
|
<div class="code-block">
|
||||||
|
<pre><code id="install-code"># Installation instructions will appear here</code></pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>Basic Usage</h3>
|
||||||
|
<div class="code-block">
|
||||||
|
<pre><code id="usage-code"># Usage example will appear here</code></pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>Complete Integration Example</h3>
|
||||||
|
<div class="code-block">
|
||||||
|
<button class="copy-btn" id="copy-integration">Copy</button>
|
||||||
|
<pre><code id="integration-code"># Complete integration guide will appear here</code></pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section id="docs-tab" class="tab-content">
|
||||||
|
<div class="docs-content">
|
||||||
|
<h2>Documentation</h2>
|
||||||
|
<div id="app-docs" class="doc-sections">
|
||||||
|
<p>Documentation coming soon.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section id="support-tab" class="tab-content">
|
||||||
|
<div class="docs-content">
|
||||||
|
<h2>Support</h2>
|
||||||
|
<div class="support-grid">
|
||||||
|
<div class="support-card">
|
||||||
|
<h3>📧 Contact</h3>
|
||||||
|
<p id="app-contact">contact@example.com</p>
|
||||||
|
</div>
|
||||||
|
<div class="support-card">
|
||||||
|
<h3>🐛 Report Issues</h3>
|
||||||
|
<p>Found a bug? Report it on GitHub Issues.</p>
|
||||||
|
</div>
|
||||||
|
<div class="support-card">
|
||||||
|
<h3>💬 Community</h3>
|
||||||
|
<p>Join our Discord for help and discussions.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</main>
|
||||||
|
|
||||||
|
<!-- Related Apps -->
|
||||||
|
<section class="related-apps">
|
||||||
|
<h2>Related Apps</h2>
|
||||||
|
<div id="related-apps-grid" class="related-grid">
|
||||||
|
<!-- Dynamic related apps -->
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script src="app-detail.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -1,5 +1,15 @@
|
|||||||
// App Detail Page JavaScript
|
// App Detail Page JavaScript
|
||||||
const API_BASE = '/api';
|
const { API_BASE, API_ORIGIN } = (() => {
|
||||||
|
const { hostname, port, protocol } = window.location;
|
||||||
|
const isLocalHost = ['localhost', '127.0.0.1', '0.0.0.0'].includes(hostname);
|
||||||
|
|
||||||
|
if (isLocalHost && port && port !== '8100') {
|
||||||
|
const origin = `${protocol}//127.0.0.1:8100`;
|
||||||
|
return { API_BASE: `${origin}/marketplace/api`, API_ORIGIN: origin };
|
||||||
|
}
|
||||||
|
|
||||||
|
return { API_BASE: '/marketplace/api', API_ORIGIN: '' };
|
||||||
|
})();
|
||||||
|
|
||||||
class AppDetailPage {
|
class AppDetailPage {
|
||||||
constructor() {
|
constructor() {
|
||||||
@@ -70,7 +80,6 @@ class AppDetailPage {
|
|||||||
document.getElementById('app-description').textContent = this.appData.description;
|
document.getElementById('app-description').textContent = this.appData.description;
|
||||||
document.getElementById('app-type').textContent = this.appData.type || 'Open Source';
|
document.getElementById('app-type').textContent = this.appData.type || 'Open Source';
|
||||||
document.getElementById('app-category').textContent = this.appData.category;
|
document.getElementById('app-category').textContent = this.appData.category;
|
||||||
document.getElementById('app-pricing').textContent = this.appData.pricing || 'Free';
|
|
||||||
|
|
||||||
// Badges
|
// Badges
|
||||||
if (this.appData.featured) {
|
if (this.appData.featured) {
|
||||||
@@ -105,6 +114,15 @@ class AppDetailPage {
|
|||||||
// Contact
|
// Contact
|
||||||
document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available';
|
document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available';
|
||||||
|
|
||||||
|
// Sidebar info
|
||||||
|
document.getElementById('sidebar-downloads').textContent = this.formatNumber(this.appData.downloads || 0);
|
||||||
|
document.getElementById('sidebar-rating').textContent = (this.appData.rating || 0).toFixed(1);
|
||||||
|
document.getElementById('sidebar-category').textContent = this.appData.category || '-';
|
||||||
|
document.getElementById('sidebar-type').textContent = this.appData.type || '-';
|
||||||
|
document.getElementById('sidebar-status').textContent = this.appData.status || 'Active';
|
||||||
|
document.getElementById('sidebar-pricing').textContent = this.appData.pricing || 'Free';
|
||||||
|
document.getElementById('sidebar-contact').textContent = this.appData.contact_email || 'contact@example.com';
|
||||||
|
|
||||||
// Integration guide
|
// Integration guide
|
||||||
this.renderIntegrationGuide();
|
this.renderIntegrationGuide();
|
||||||
}
|
}
|
||||||
@@ -112,24 +130,27 @@ class AppDetailPage {
|
|||||||
renderIntegrationGuide() {
|
renderIntegrationGuide() {
|
||||||
// Installation code
|
// Installation code
|
||||||
const installCode = document.getElementById('install-code');
|
const installCode = document.getElementById('install-code');
|
||||||
if (this.appData.type === 'Open Source' && this.appData.github_url) {
|
if (installCode) {
|
||||||
installCode.textContent = `# Clone from GitHub
|
if (this.appData.type === 'Open Source' && this.appData.github_url) {
|
||||||
|
installCode.textContent = `# Clone from GitHub
|
||||||
git clone ${this.appData.github_url}
|
git clone ${this.appData.github_url}
|
||||||
|
|
||||||
# Install dependencies
|
# Install dependencies
|
||||||
pip install -r requirements.txt`;
|
pip install -r requirements.txt`;
|
||||||
} else if (this.appData.name.toLowerCase().includes('api')) {
|
} else if (this.appData.name.toLowerCase().includes('api')) {
|
||||||
installCode.textContent = `# Install via pip
|
installCode.textContent = `# Install via pip
|
||||||
pip install ${this.appData.slug}
|
pip install ${this.appData.slug}
|
||||||
|
|
||||||
# Or install from source
|
# Or install from source
|
||||||
pip install git+${this.appData.github_url || 'https://github.com/example/repo'}`;
|
pip install git+${this.appData.github_url || 'https://github.com/example/repo'}`;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Usage code - customize based on category
|
// Usage code - customize based on category
|
||||||
const usageCode = document.getElementById('usage-code');
|
const usageCode = document.getElementById('usage-code');
|
||||||
if (this.appData.category === 'Browser Automation') {
|
if (usageCode) {
|
||||||
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
|
if (this.appData.category === 'Browser Automation') {
|
||||||
|
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
|
||||||
from ${this.appData.slug.replace(/-/g, '_')} import ${this.appData.name.replace(/\s+/g, '')}
|
from ${this.appData.slug.replace(/-/g, '_')} import ${this.appData.name.replace(/\s+/g, '')}
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
@@ -178,11 +199,13 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
extraction_strategy=strategy
|
extraction_strategy=strategy
|
||||||
)
|
)
|
||||||
print(result.extracted_content)`;
|
print(result.extracted_content)`;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Integration example
|
// Integration example
|
||||||
const integrationCode = document.getElementById('integration-code');
|
const integrationCode = document.getElementById('integration-code');
|
||||||
integrationCode.textContent = this.appData.integration_guide ||
|
if (integrationCode) {
|
||||||
|
integrationCode.textContent = this.appData.integration_guide ||
|
||||||
`# Complete ${this.appData.name} Integration Example
|
`# Complete ${this.appData.name} Integration Example
|
||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler
|
||||||
@@ -237,6 +260,7 @@ async def crawl_with_${this.appData.slug.replace(/-/g, '_')}():
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import asyncio
|
import asyncio
|
||||||
asyncio.run(crawl_with_${this.appData.slug.replace(/-/g, '_')}())`;
|
asyncio.run(crawl_with_${this.appData.slug.replace(/-/g, '_')}())`;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
formatNumber(num) {
|
formatNumber(num) {
|
||||||
@@ -250,7 +274,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
setupEventListeners() {
|
setupEventListeners() {
|
||||||
// Tab switching
|
// Tab switching
|
||||||
const tabs = document.querySelectorAll('.nav-tab');
|
const tabs = document.querySelectorAll('.tab-btn');
|
||||||
tabs.forEach(tab => {
|
tabs.forEach(tab => {
|
||||||
tab.addEventListener('click', () => {
|
tab.addEventListener('click', () => {
|
||||||
// Update active tab
|
// Update active tab
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from fastapi import FastAPI, HTTPException, Query, Depends, Body, UploadFile, File, Form
|
from fastapi import FastAPI, HTTPException, Query, Depends, Body, UploadFile, File, Form, APIRouter
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
@@ -16,6 +16,7 @@ from datetime import datetime, timedelta
|
|||||||
from config import Config
|
from config import Config
|
||||||
|
|
||||||
app = FastAPI(title="Crawl4AI Marketplace API")
|
app = FastAPI(title="Crawl4AI Marketplace API")
|
||||||
|
router = APIRouter(prefix="/marketplace/api")
|
||||||
|
|
||||||
# Security setup
|
# Security setup
|
||||||
security = HTTPBearer()
|
security = HTTPBearer()
|
||||||
@@ -84,7 +85,7 @@ def to_int(value, default=0):
|
|||||||
|
|
||||||
# ============= PUBLIC ENDPOINTS =============
|
# ============= PUBLIC ENDPOINTS =============
|
||||||
|
|
||||||
@app.get("/api/apps")
|
@router.get("/apps")
|
||||||
async def get_apps(
|
async def get_apps(
|
||||||
category: Optional[str] = None,
|
category: Optional[str] = None,
|
||||||
type: Optional[str] = None,
|
type: Optional[str] = None,
|
||||||
@@ -114,7 +115,7 @@ async def get_apps(
|
|||||||
|
|
||||||
return json_response(apps)
|
return json_response(apps)
|
||||||
|
|
||||||
@app.get("/api/apps/{slug}")
|
@router.get("/apps/{slug}")
|
||||||
async def get_app(slug: str):
|
async def get_app(slug: str):
|
||||||
"""Get single app by slug"""
|
"""Get single app by slug"""
|
||||||
apps = db.get_all('apps', where=f"slug = '{slug}'", limit=1)
|
apps = db.get_all('apps', where=f"slug = '{slug}'", limit=1)
|
||||||
@@ -127,7 +128,7 @@ async def get_app(slug: str):
|
|||||||
|
|
||||||
return json_response(app)
|
return json_response(app)
|
||||||
|
|
||||||
@app.get("/api/articles")
|
@router.get("/articles")
|
||||||
async def get_articles(
|
async def get_articles(
|
||||||
category: Optional[str] = None,
|
category: Optional[str] = None,
|
||||||
limit: int = Query(default=20, le=10000),
|
limit: int = Query(default=20, le=10000),
|
||||||
@@ -146,7 +147,7 @@ async def get_articles(
|
|||||||
|
|
||||||
return json_response(articles)
|
return json_response(articles)
|
||||||
|
|
||||||
@app.get("/api/articles/{slug}")
|
@router.get("/articles/{slug}")
|
||||||
async def get_article(slug: str):
|
async def get_article(slug: str):
|
||||||
"""Get single article by slug"""
|
"""Get single article by slug"""
|
||||||
articles = db.get_all('articles', where=f"slug = '{slug}'", limit=1)
|
articles = db.get_all('articles', where=f"slug = '{slug}'", limit=1)
|
||||||
@@ -161,7 +162,7 @@ async def get_article(slug: str):
|
|||||||
|
|
||||||
return json_response(article)
|
return json_response(article)
|
||||||
|
|
||||||
@app.get("/api/categories")
|
@router.get("/categories")
|
||||||
async def get_categories():
|
async def get_categories():
|
||||||
"""Get all categories ordered by index"""
|
"""Get all categories ordered by index"""
|
||||||
categories = db.get_all('categories', limit=50)
|
categories = db.get_all('categories', limit=50)
|
||||||
@@ -170,7 +171,7 @@ async def get_categories():
|
|||||||
categories.sort(key=lambda x: x.get('order_index', 0))
|
categories.sort(key=lambda x: x.get('order_index', 0))
|
||||||
return json_response(categories, cache_time=7200)
|
return json_response(categories, cache_time=7200)
|
||||||
|
|
||||||
@app.get("/api/sponsors")
|
@router.get("/sponsors")
|
||||||
async def get_sponsors(active: Optional[bool] = True):
|
async def get_sponsors(active: Optional[bool] = True):
|
||||||
"""Get sponsors, default active only"""
|
"""Get sponsors, default active only"""
|
||||||
where = f"active = {1 if active else 0}" if active is not None else None
|
where = f"active = {1 if active else 0}" if active is not None else None
|
||||||
@@ -185,7 +186,7 @@ async def get_sponsors(active: Optional[bool] = True):
|
|||||||
|
|
||||||
return json_response(sponsors)
|
return json_response(sponsors)
|
||||||
|
|
||||||
@app.get("/api/search")
|
@router.get("/search")
|
||||||
async def search(q: str = Query(min_length=2)):
|
async def search(q: str = Query(min_length=2)):
|
||||||
"""Search across apps and articles"""
|
"""Search across apps and articles"""
|
||||||
if len(q) < 2:
|
if len(q) < 2:
|
||||||
@@ -206,7 +207,7 @@ async def search(q: str = Query(min_length=2)):
|
|||||||
|
|
||||||
return json_response(results, cache_time=1800)
|
return json_response(results, cache_time=1800)
|
||||||
|
|
||||||
@app.get("/api/stats")
|
@router.get("/stats")
|
||||||
async def get_stats():
|
async def get_stats():
|
||||||
"""Get marketplace statistics"""
|
"""Get marketplace statistics"""
|
||||||
stats = {
|
stats = {
|
||||||
@@ -227,7 +228,7 @@ def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
|
|||||||
return token
|
return token
|
||||||
|
|
||||||
|
|
||||||
@app.post("/api/admin/upload-image", dependencies=[Depends(verify_token)])
|
@router.post("/admin/upload-image", dependencies=[Depends(verify_token)])
|
||||||
async def upload_image(file: UploadFile = File(...), folder: str = Form("sponsors")):
|
async def upload_image(file: UploadFile = File(...), folder: str = Form("sponsors")):
|
||||||
"""Upload image files for admin assets"""
|
"""Upload image files for admin assets"""
|
||||||
folder = (folder or "").strip().lower()
|
folder = (folder or "").strip().lower()
|
||||||
@@ -251,7 +252,7 @@ async def upload_image(file: UploadFile = File(...), folder: str = Form("sponsor
|
|||||||
|
|
||||||
return {"url": f"/uploads/{folder}/{filename}"}
|
return {"url": f"/uploads/{folder}/{filename}"}
|
||||||
|
|
||||||
@app.post("/api/admin/login")
|
@router.post("/admin/login")
|
||||||
async def admin_login(password: str = Body(..., embed=True)):
|
async def admin_login(password: str = Body(..., embed=True)):
|
||||||
"""Admin login with password"""
|
"""Admin login with password"""
|
||||||
provided_hash = hashlib.sha256(password.encode()).hexdigest()
|
provided_hash = hashlib.sha256(password.encode()).hexdigest()
|
||||||
@@ -272,7 +273,7 @@ async def admin_login(password: str = Body(..., embed=True)):
|
|||||||
|
|
||||||
# ============= ADMIN ENDPOINTS =============
|
# ============= ADMIN ENDPOINTS =============
|
||||||
|
|
||||||
@app.get("/api/admin/stats", dependencies=[Depends(verify_token)])
|
@router.get("/admin/stats", dependencies=[Depends(verify_token)])
|
||||||
async def get_admin_stats():
|
async def get_admin_stats():
|
||||||
"""Get detailed admin statistics"""
|
"""Get detailed admin statistics"""
|
||||||
stats = {
|
stats = {
|
||||||
@@ -292,7 +293,7 @@ async def get_admin_stats():
|
|||||||
return stats
|
return stats
|
||||||
|
|
||||||
# Apps CRUD
|
# Apps CRUD
|
||||||
@app.post("/api/admin/apps", dependencies=[Depends(verify_token)])
|
@router.post("/admin/apps", dependencies=[Depends(verify_token)])
|
||||||
async def create_app(app_data: Dict[str, Any]):
|
async def create_app(app_data: Dict[str, Any]):
|
||||||
"""Create new app"""
|
"""Create new app"""
|
||||||
try:
|
try:
|
||||||
@@ -311,7 +312,7 @@ async def create_app(app_data: Dict[str, Any]):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
@app.put("/api/admin/apps/{app_id}", dependencies=[Depends(verify_token)])
|
@router.put("/admin/apps/{app_id}", dependencies=[Depends(verify_token)])
|
||||||
async def update_app(app_id: int, app_data: Dict[str, Any]):
|
async def update_app(app_id: int, app_data: Dict[str, Any]):
|
||||||
"""Update app"""
|
"""Update app"""
|
||||||
try:
|
try:
|
||||||
@@ -329,7 +330,7 @@ async def update_app(app_id: int, app_data: Dict[str, Any]):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
@app.delete("/api/admin/apps/{app_id}", dependencies=[Depends(verify_token)])
|
@router.delete("/admin/apps/{app_id}", dependencies=[Depends(verify_token)])
|
||||||
async def delete_app(app_id: int):
|
async def delete_app(app_id: int):
|
||||||
"""Delete app"""
|
"""Delete app"""
|
||||||
cursor = db.conn.cursor()
|
cursor = db.conn.cursor()
|
||||||
@@ -338,7 +339,7 @@ async def delete_app(app_id: int):
|
|||||||
return {"message": "App deleted"}
|
return {"message": "App deleted"}
|
||||||
|
|
||||||
# Articles CRUD
|
# Articles CRUD
|
||||||
@app.post("/api/admin/articles", dependencies=[Depends(verify_token)])
|
@router.post("/admin/articles", dependencies=[Depends(verify_token)])
|
||||||
async def create_article(article_data: Dict[str, Any]):
|
async def create_article(article_data: Dict[str, Any]):
|
||||||
"""Create new article"""
|
"""Create new article"""
|
||||||
try:
|
try:
|
||||||
@@ -356,7 +357,7 @@ async def create_article(article_data: Dict[str, Any]):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
@app.put("/api/admin/articles/{article_id}", dependencies=[Depends(verify_token)])
|
@router.put("/admin/articles/{article_id}", dependencies=[Depends(verify_token)])
|
||||||
async def update_article(article_id: int, article_data: Dict[str, Any]):
|
async def update_article(article_id: int, article_data: Dict[str, Any]):
|
||||||
"""Update article"""
|
"""Update article"""
|
||||||
try:
|
try:
|
||||||
@@ -373,7 +374,7 @@ async def update_article(article_id: int, article_data: Dict[str, Any]):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
@app.delete("/api/admin/articles/{article_id}", dependencies=[Depends(verify_token)])
|
@router.delete("/admin/articles/{article_id}", dependencies=[Depends(verify_token)])
|
||||||
async def delete_article(article_id: int):
|
async def delete_article(article_id: int):
|
||||||
"""Delete article"""
|
"""Delete article"""
|
||||||
cursor = db.conn.cursor()
|
cursor = db.conn.cursor()
|
||||||
@@ -382,7 +383,7 @@ async def delete_article(article_id: int):
|
|||||||
return {"message": "Article deleted"}
|
return {"message": "Article deleted"}
|
||||||
|
|
||||||
# Categories CRUD
|
# Categories CRUD
|
||||||
@app.post("/api/admin/categories", dependencies=[Depends(verify_token)])
|
@router.post("/admin/categories", dependencies=[Depends(verify_token)])
|
||||||
async def create_category(category_data: Dict[str, Any]):
|
async def create_category(category_data: Dict[str, Any]):
|
||||||
"""Create new category"""
|
"""Create new category"""
|
||||||
try:
|
try:
|
||||||
@@ -399,7 +400,7 @@ async def create_category(category_data: Dict[str, Any]):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
@app.put("/api/admin/categories/{cat_id}", dependencies=[Depends(verify_token)])
|
@router.put("/admin/categories/{cat_id}", dependencies=[Depends(verify_token)])
|
||||||
async def update_category(cat_id: int, category_data: Dict[str, Any]):
|
async def update_category(cat_id: int, category_data: Dict[str, Any]):
|
||||||
"""Update category"""
|
"""Update category"""
|
||||||
try:
|
try:
|
||||||
@@ -417,7 +418,7 @@ async def update_category(cat_id: int, category_data: Dict[str, Any]):
|
|||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
@app.delete("/api/admin/categories/{cat_id}", dependencies=[Depends(verify_token)])
|
@router.delete("/admin/categories/{cat_id}", dependencies=[Depends(verify_token)])
|
||||||
async def delete_category(cat_id: int):
|
async def delete_category(cat_id: int):
|
||||||
"""Delete category"""
|
"""Delete category"""
|
||||||
try:
|
try:
|
||||||
@@ -429,7 +430,7 @@ async def delete_category(cat_id: int):
|
|||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
# Sponsors CRUD
|
# Sponsors CRUD
|
||||||
@app.post("/api/admin/sponsors", dependencies=[Depends(verify_token)])
|
@router.post("/admin/sponsors", dependencies=[Depends(verify_token)])
|
||||||
async def create_sponsor(sponsor_data: Dict[str, Any]):
|
async def create_sponsor(sponsor_data: Dict[str, Any]):
|
||||||
"""Create new sponsor"""
|
"""Create new sponsor"""
|
||||||
try:
|
try:
|
||||||
@@ -443,7 +444,7 @@ async def create_sponsor(sponsor_data: Dict[str, Any]):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
@app.put("/api/admin/sponsors/{sponsor_id}", dependencies=[Depends(verify_token)])
|
@router.put("/admin/sponsors/{sponsor_id}", dependencies=[Depends(verify_token)])
|
||||||
async def update_sponsor(sponsor_id: int, sponsor_data: Dict[str, Any]):
|
async def update_sponsor(sponsor_id: int, sponsor_data: Dict[str, Any]):
|
||||||
"""Update sponsor"""
|
"""Update sponsor"""
|
||||||
try:
|
try:
|
||||||
@@ -457,7 +458,7 @@ async def update_sponsor(sponsor_id: int, sponsor_data: Dict[str, Any]):
|
|||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
@app.delete("/api/admin/sponsors/{sponsor_id}", dependencies=[Depends(verify_token)])
|
@router.delete("/admin/sponsors/{sponsor_id}", dependencies=[Depends(verify_token)])
|
||||||
async def delete_sponsor(sponsor_id: int):
|
async def delete_sponsor(sponsor_id: int):
|
||||||
"""Delete sponsor"""
|
"""Delete sponsor"""
|
||||||
try:
|
try:
|
||||||
@@ -468,6 +469,9 @@ async def delete_sponsor(sponsor_id: int):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
|
app.include_router(router)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/")
|
@app.get("/")
|
||||||
async def root():
|
async def root():
|
||||||
"""API info"""
|
"""API info"""
|
||||||
@@ -475,12 +479,12 @@ async def root():
|
|||||||
"name": "Crawl4AI Marketplace API",
|
"name": "Crawl4AI Marketplace API",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"endpoints": [
|
"endpoints": [
|
||||||
"/api/apps",
|
"/marketplace/api/apps",
|
||||||
"/api/articles",
|
"/marketplace/api/articles",
|
||||||
"/api/categories",
|
"/marketplace/api/categories",
|
||||||
"/api/sponsors",
|
"/marketplace/api/sponsors",
|
||||||
"/api/search?q=query",
|
"/marketplace/api/search?q=query",
|
||||||
"/api/stats"
|
"/marketplace/api/stats"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,15 @@
|
|||||||
// App Detail Page JavaScript
|
// App Detail Page JavaScript
|
||||||
const API_BASE = '/api';
|
const { API_BASE, API_ORIGIN } = (() => {
|
||||||
|
const { hostname, port, protocol } = window.location;
|
||||||
|
const isLocalHost = ['localhost', '127.0.0.1', '0.0.0.0'].includes(hostname);
|
||||||
|
|
||||||
|
if (isLocalHost && port && port !== '8100') {
|
||||||
|
const origin = `${protocol}//127.0.0.1:8100`;
|
||||||
|
return { API_BASE: `${origin}/marketplace/api`, API_ORIGIN: origin };
|
||||||
|
}
|
||||||
|
|
||||||
|
return { API_BASE: '/marketplace/api', API_ORIGIN: '' };
|
||||||
|
})();
|
||||||
|
|
||||||
class AppDetailPage {
|
class AppDetailPage {
|
||||||
constructor() {
|
constructor() {
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
// Marketplace JS - Magazine Layout
|
// Marketplace JS - Magazine Layout
|
||||||
const API_BASE = '/api';
|
const API_BASE = '/marketplace/api';
|
||||||
const CACHE_TTL = 3600000; // 1 hour in ms
|
const CACHE_TTL = 3600000; // 1 hour in ms
|
||||||
|
|
||||||
class MarketplaceCache {
|
class MarketplaceCache {
|
||||||
|
|||||||
@@ -231,7 +231,9 @@ a:hover {
|
|||||||
|
|
||||||
.hero-image {
|
.hero-image {
|
||||||
width: 100%;
|
width: 100%;
|
||||||
height: 240px;
|
height: 200px;
|
||||||
|
min-height: 200px;
|
||||||
|
max-height: 200px;
|
||||||
background: linear-gradient(135deg, rgba(80, 255, 255, 0.1), rgba(243, 128, 245, 0.05));
|
background: linear-gradient(135deg, rgba(80, 255, 255, 0.1), rgba(243, 128, 245, 0.05));
|
||||||
background-size: cover;
|
background-size: cover;
|
||||||
background-position: center;
|
background-position: center;
|
||||||
@@ -243,6 +245,14 @@ a:hover {
|
|||||||
flex-shrink: 0;
|
flex-shrink: 0;
|
||||||
position: relative;
|
position: relative;
|
||||||
filter: brightness(1.1) contrast(1.1);
|
filter: brightness(1.1) contrast(1.1);
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
.hero-image img {
|
||||||
|
width: 100%;
|
||||||
|
height: 100%;
|
||||||
|
object-fit: cover;
|
||||||
|
object-position: center;
|
||||||
}
|
}
|
||||||
|
|
||||||
.hero-image::after {
|
.hero-image::after {
|
||||||
@@ -257,6 +267,10 @@ a:hover {
|
|||||||
|
|
||||||
.hero-content {
|
.hero-content {
|
||||||
padding: 1.5rem;
|
padding: 1.5rem;
|
||||||
|
flex: 1;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
justify-content: space-between;
|
||||||
}
|
}
|
||||||
|
|
||||||
.hero-badge {
|
.hero-badge {
|
||||||
@@ -301,9 +315,9 @@ a:hover {
|
|||||||
/* Secondary Featured */
|
/* Secondary Featured */
|
||||||
.secondary-featured {
|
.secondary-featured {
|
||||||
grid-column: 1 / -1;
|
grid-column: 1 / -1;
|
||||||
height: 380px;
|
min-height: 380px;
|
||||||
display: flex;
|
display: flex;
|
||||||
align-items: stretch;
|
align-items: flex-start;
|
||||||
}
|
}
|
||||||
|
|
||||||
.featured-secondary-cards {
|
.featured-secondary-cards {
|
||||||
@@ -311,7 +325,7 @@ a:hover {
|
|||||||
display: flex;
|
display: flex;
|
||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
gap: 0.75rem;
|
gap: 0.75rem;
|
||||||
justify-content: space-between;
|
align-items: stretch;
|
||||||
}
|
}
|
||||||
|
|
||||||
.secondary-card {
|
.secondary-card {
|
||||||
@@ -321,8 +335,10 @@ a:hover {
|
|||||||
transition: all 0.3s ease;
|
transition: all 0.3s ease;
|
||||||
display: flex;
|
display: flex;
|
||||||
overflow: hidden;
|
overflow: hidden;
|
||||||
height: calc((380px - 1.5rem) / 3);
|
height: 118px;
|
||||||
flex: 1;
|
min-height: 118px;
|
||||||
|
max-height: 118px;
|
||||||
|
flex-shrink: 0;
|
||||||
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.3);
|
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.3);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -875,10 +891,13 @@ a:hover {
|
|||||||
.secondary-featured {
|
.secondary-featured {
|
||||||
grid-column: 3 / 5;
|
grid-column: 3 / 5;
|
||||||
grid-row: 1;
|
grid-row: 1;
|
||||||
|
min-height: auto;
|
||||||
}
|
}
|
||||||
|
|
||||||
.featured-secondary-cards {
|
.featured-secondary-cards {
|
||||||
|
display: grid;
|
||||||
grid-template-columns: repeat(2, 1fr);
|
grid-template-columns: repeat(2, 1fr);
|
||||||
|
flex-direction: unset;
|
||||||
}
|
}
|
||||||
|
|
||||||
.main-content {
|
.main-content {
|
||||||
@@ -906,10 +925,13 @@ a:hover {
|
|||||||
|
|
||||||
.secondary-featured {
|
.secondary-featured {
|
||||||
grid-column: 3 / 6;
|
grid-column: 3 / 6;
|
||||||
|
min-height: auto;
|
||||||
}
|
}
|
||||||
|
|
||||||
.featured-secondary-cards {
|
.featured-secondary-cards {
|
||||||
|
display: grid;
|
||||||
grid-template-columns: repeat(3, 1fr);
|
grid-template-columns: repeat(3, 1fr);
|
||||||
|
flex-direction: unset;
|
||||||
}
|
}
|
||||||
|
|
||||||
.sponsored-section {
|
.sponsored-section {
|
||||||
|
|||||||
@@ -3,9 +3,9 @@ const { API_BASE, API_ORIGIN } = (() => {
|
|||||||
const { hostname, port } = window.location;
|
const { hostname, port } = window.location;
|
||||||
if ((hostname === 'localhost' || hostname === '127.0.0.1') && port === '8000') {
|
if ((hostname === 'localhost' || hostname === '127.0.0.1') && port === '8000') {
|
||||||
const origin = 'http://127.0.0.1:8100';
|
const origin = 'http://127.0.0.1:8100';
|
||||||
return { API_BASE: `${origin}/api`, API_ORIGIN: origin };
|
return { API_BASE: `${origin}/marketplace/api`, API_ORIGIN: origin };
|
||||||
}
|
}
|
||||||
return { API_BASE: '/api', API_ORIGIN: '' };
|
return { API_BASE: '/marketplace/api', API_ORIGIN: '' };
|
||||||
})();
|
})();
|
||||||
|
|
||||||
const resolveAssetUrl = (path) => {
|
const resolveAssetUrl = (path) => {
|
||||||
|
|||||||
193
tests/docker/test_hooks_utility.py
Normal file
193
tests/docker/test_hooks_utility.py
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
"""
|
||||||
|
Test script demonstrating the hooks_to_string utility and Docker client integration.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import Crawl4aiDockerClient, hooks_to_string
|
||||||
|
|
||||||
|
|
||||||
|
# Define hook functions as regular Python functions
|
||||||
|
async def auth_hook(page, context, **kwargs):
    """Install a fixed test cookie on the browser context and return the page.

    Args:
        page: Page object handed to the hook; returned unchanged.
        context: Browser context; must expose an awaitable ``add_cookies``.
        **kwargs: Any extra hook arguments; accepted and ignored.

    Returns:
        The same ``page`` object that was passed in.
    """
    test_cookie = {
        'name': 'test_cookie',
        'value': 'test_value',
        'domain': '.httpbin.org',
        'path': '/',
    }
    # add_cookies is given a list, so the single cookie is wrapped in one.
    await context.add_cookies([test_cookie])
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def scroll_hook(page, context, **kwargs):
    """Scroll the page to the bottom, pause, and return the page.

    Intended to give lazily loaded content a chance to appear before the
    HTML is read.

    Args:
        page: Page object; must expose awaitable ``evaluate`` and
            ``wait_for_timeout``.
        context: Browser context; unused here.
        **kwargs: Any extra hook arguments; accepted and ignored.

    Returns:
        The same ``page`` object that was passed in.
    """
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight)"
    # NOTE(review): presumably milliseconds (Playwright convention) - confirm.
    settle_delay = 1000

    await page.evaluate(scroll_to_bottom)
    await page.wait_for_timeout(settle_delay)
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def viewport_hook(page, context, **kwargs):
    """Resize the page viewport to 1920x1080 and return the page.

    Args:
        page: Page object; must expose an awaitable ``set_viewport_size``.
        context: Browser context; unused here.
        **kwargs: Any extra hook arguments; accepted and ignored.

    Returns:
        The same ``page`` object that was passed in.
    """
    viewport = {"width": 1920, "height": 1080}
    await page.set_viewport_size(viewport)
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def test_hooks_utility():
    """Convert two hook functions with ``hooks_to_string`` and print a report.

    Prints the names of the converted hooks and the first 200 characters of
    the ``on_page_context_created`` entry.

    Returns:
        The mapping returned by ``hooks_to_string``.
    """
    banner = "=" * 60
    rule = "-" * 60

    print(banner)
    print("Testing hooks_to_string utility")
    print(banner)

    # Hook name -> function object; the utility turns the values into strings.
    hooks_string = hooks_to_string({
        "on_page_context_created": auth_hook,
        "before_retrieve_html": scroll_hook,
    })

    print("\n✓ Successfully converted function objects to strings")
    print(f"\n✓ Converted {len(hooks_string)} hooks:")
    for hook_name in hooks_string:
        print(f" - {hook_name}")

    print("\n✓ Preview of converted hook:")
    print(rule)
    preview = hooks_string["on_page_context_created"][:200]
    print(preview + "...")
    print(rule)

    return hooks_string
|
||||||
|
|
||||||
|
|
||||||
|
async def test_docker_client_with_functions():
    """Crawl one URL through the Docker client, passing hook functions as-is.

    The hooks are given as function objects, not source strings; the client
    is expected to convert them before sending the request.

    NOTE: this opens a connection to a Crawl4AI Docker server at
    http://localhost:11234, so a server must be running there.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Testing Docker Client with Function Objects")
    print(banner)

    target_urls = ["https://httpbin.org/html"]
    hook_functions = {
        "on_page_context_created": auth_hook,
        "before_retrieve_html": scroll_hook,
    }

    docker_client = Crawl4aiDockerClient(
        base_url="http://localhost:11234", verbose=True
    )
    async with docker_client as client:
        # Function objects go straight into ``hooks``.
        result = await client.crawl(
            target_urls,
            hooks=hook_functions,
            hooks_timeout=30,
        )
        print(f"\n✓ Crawl successful: {result.success}")
        print(f"✓ URL: {result.url}")

    print("\n✓ Docker client accepts function objects directly")
    print("✓ Automatic conversion happens internally")
    print("✓ No manual string formatting needed!")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_docker_client_with_strings():
    """Crawl one URL through the Docker client using pre-converted hooks.

    The hook functions are first converted with ``hooks_to_string`` and the
    resulting mapping is passed to ``client.crawl``.

    NOTE: this opens a connection to a Crawl4AI Docker server at
    http://localhost:11234, so a server must be running there.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Testing Docker Client with String Hooks")
    print(banner)

    # Convert up front so the client receives strings, not callables.
    hooks_string = hooks_to_string({
        "on_page_context_created": viewport_hook,
        "before_retrieve_html": scroll_hook,
    })
    target_urls = ["https://httpbin.org/html"]

    docker_client = Crawl4aiDockerClient(
        base_url="http://localhost:11234", verbose=True
    )
    async with docker_client as client:
        result = await client.crawl(
            target_urls,
            hooks=hooks_string,
            hooks_timeout=30,
        )
        print(f"\n✓ Crawl successful: {result.success}")

    print("\n✓ Docker client also accepts pre-converted strings")
    print("✓ Backward compatible with existing code")
|
||||||
|
|
||||||
|
|
||||||
|
async def show_usage_patterns():
    """Print three example snippets showing how hooks can be supplied.

    The patterns are: passing a function object directly, converting with
    ``hooks_to_string`` first, and writing the hook source as a string by
    hand. Each is printed as a heading, a 60-dash rule, and the sample code.
    Nothing is executed; the snippets are display text only.
    """
    # Local import keeps this block self-contained.
    from textwrap import dedent

    # (heading, sample) pairs. Samples are indented here for readability and
    # dedented at print time so the output starts at column 0 with the
    # nested code indentation intact.
    patterns = (
        (
            "\n1. Direct function usage (simplest):",
            """
            async def my_hook(page, context, **kwargs):
                await page.set_viewport_size({"width": 1920, "height": 1080})
                return page

            result = await client.crawl(
                ["https://example.com"],
                hooks={"on_page_context_created": my_hook}
            )
            """,
        ),
        (
            "\n2. Convert then use:",
            """
            hooks_dict = {"on_page_context_created": my_hook}
            hooks_string = hooks_to_string(hooks_dict)

            result = await client.crawl(
                ["https://example.com"],
                hooks=hooks_string
            )
            """,
        ),
        (
            "\n3. Manual string (backward compatible):",
            """
            hooks_string = {
                "on_page_context_created": '''
            async def hook(page, context, **kwargs):
                await page.set_viewport_size({"width": 1920, "height": 1080})
                return page
            '''
            }

            result = await client.crawl(
                ["https://example.com"],
                hooks=hooks_string
            )
            """,
        ),
    )

    print("\n" + "=" * 60)
    print("Usage Patterns")
    print("=" * 60)

    for heading, sample in patterns:
        print(heading)
        print("-" * 60)
        print(dedent(sample))
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Run every demo in this script in order, then print a summary.

    Steps: the ``hooks_to_string`` conversion check, the two Docker-client
    crawls (function hooks, then pre-converted string hooks), and the usage
    pattern printout.

    NOTE: the two Docker-client steps connect to a server at
    http://localhost:11234; without one they raise and the remaining steps
    and the summary are not reached.
    """
    print("\n🚀 Crawl4AI Hooks Utility Test Suite\n")

    # Conversion-only check of the utility function.
    await test_hooks_utility()

    # Docker client, first with function objects, then with strings.
    await test_docker_client_with_functions()
    await test_docker_client_with_strings()

    # Display-only examples.
    await show_usage_patterns()

    print("\n" + "=" * 60)
    print("✓ All tests completed successfully!")
    print("=" * 60)
    print("\nKey Benefits:")
    print(" • Write hooks as regular Python functions")
    print(" • IDE support with autocomplete and type checking")
    print(" • Automatic conversion to API format")
    print(" • Backward compatible with string hooks")
    print(" • Same utility used everywhere")
    print("\n")


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
Reference in New Issue
Block a user