Merge branch 'feature/docker-hooks' into develop

This commit is contained in:
ntohidi
2025-09-25 14:11:46 +08:00
8 changed files with 2561 additions and 26 deletions

View File

@@ -442,13 +442,15 @@ async def handle_crawl_request(
urls: List[str],
browser_config: dict,
crawler_config: dict,
config: dict
config: dict,
hooks_config: Optional[dict] = None
) -> dict:
"""Handle non-streaming crawl requests."""
"""Handle non-streaming crawl requests with optional hooks."""
start_mem_mb = _get_memory_mb() # <--- Get memory before
start_time = time.time()
mem_delta_mb = None
peak_mem_mb = start_mem_mb
hook_manager = None
try:
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
@@ -468,6 +470,19 @@ async def handle_crawl_request(
# crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
# Attach hooks if provided
hooks_status = {}
if hooks_config:
from hook_manager import attach_user_hooks_to_crawler, UserHookManager
hook_manager = UserHookManager(timeout=hooks_config.get('timeout', 30))
hooks_status, hook_manager = await attach_user_hooks_to_crawler(
crawler,
hooks_config.get('code', {}),
timeout=hooks_config.get('timeout', 30),
hook_manager=hook_manager
)
logger.info(f"Hooks attachment status: {hooks_status['status']}")
base_config = config["crawler"]["base_config"]
# Iterate on key-value pairs in global_config then use hasattr to set them
for key, value in base_config.items():
@@ -484,6 +499,10 @@ async def handle_crawl_request(
config=crawler_config,
dispatcher=dispatcher)
results = await partial_func()
# Ensure results is always a list
if not isinstance(results, list):
results = [results]
# await crawler.close()
@@ -498,22 +517,72 @@ async def handle_crawl_request(
# Process results to handle PDF bytes
processed_results = []
for result in results:
result_dict = result.model_dump()
# if fit_html is not a string, set it to None to avoid serialization errors
if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
result_dict["fit_html"] = None
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
processed_results.append(result_dict)
try:
# Check if result has model_dump method (is a proper CrawlResult)
if hasattr(result, 'model_dump'):
result_dict = result.model_dump()
elif isinstance(result, dict):
result_dict = result
else:
# Handle unexpected result type
logger.warning(f"Unexpected result type: {type(result)}")
result_dict = {
"url": str(result) if hasattr(result, '__str__') else "unknown",
"success": False,
"error_message": f"Unexpected result type: {type(result).__name__}"
}
# if fit_html is not a string, set it to None to avoid serialization errors
if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
result_dict["fit_html"] = None
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None and isinstance(result_dict.get('pdf'), bytes):
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
processed_results.append(result_dict)
except Exception as e:
logger.error(f"Error processing result: {e}")
processed_results.append({
"url": "unknown",
"success": False,
"error_message": str(e)
})
return {
response = {
"success": True,
"results": processed_results,
"server_processing_time_s": end_time - start_time,
"server_memory_delta_mb": mem_delta_mb,
"server_peak_memory_mb": peak_mem_mb
}
# Add hooks information if hooks were used
if hooks_config and hook_manager:
from hook_manager import UserHookManager
if isinstance(hook_manager, UserHookManager):
try:
# Ensure all hook data is JSON serializable
import json
hook_data = {
"status": hooks_status,
"execution_log": hook_manager.execution_log,
"errors": hook_manager.errors,
"summary": hook_manager.get_summary()
}
# Test that it's serializable
json.dumps(hook_data)
response["hooks"] = hook_data
except (TypeError, ValueError) as e:
logger.error(f"Hook data not JSON serializable: {e}")
response["hooks"] = {
"status": {"status": "error", "message": "Hook data serialization failed"},
"execution_log": [],
"errors": [{"error": str(e)}],
"summary": {}
}
return response
except Exception as e:
logger.error(f"Crawl error: {str(e)}", exc_info=True)
@@ -542,9 +611,11 @@ async def handle_stream_crawl_request(
urls: List[str],
browser_config: dict,
crawler_config: dict,
config: dict
) -> Tuple[AsyncWebCrawler, AsyncGenerator]:
"""Handle streaming crawl requests."""
config: dict,
hooks_config: Optional[dict] = None
) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
"""Handle streaming crawl requests with optional hooks."""
hooks_info = None
try:
browser_config = BrowserConfig.load(browser_config)
# browser_config.verbose = True # Set to False or remove for production stress testing
@@ -565,6 +636,20 @@ async def handle_stream_crawl_request(
# crawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
# Attach hooks if provided
if hooks_config:
from hook_manager import attach_user_hooks_to_crawler, UserHookManager
hook_manager = UserHookManager(timeout=hooks_config.get('timeout', 30))
hooks_status, hook_manager = await attach_user_hooks_to_crawler(
crawler,
hooks_config.get('code', {}),
timeout=hooks_config.get('timeout', 30),
hook_manager=hook_manager
)
logger.info(f"Hooks attachment status for streaming: {hooks_status['status']}")
# Include hook manager in hooks_info for proper tracking
hooks_info = {'status': hooks_status, 'manager': hook_manager}
results_gen = await crawler.arun_many(
urls=urls,
@@ -572,7 +657,7 @@ async def handle_stream_crawl_request(
dispatcher=dispatcher
)
return crawler, results_gen
return crawler, results_gen, hooks_info
except Exception as e:
# Make sure to close crawler if started during an error here

View File

@@ -0,0 +1,512 @@
"""
Hook Manager for User-Provided Hook Functions
Handles validation, compilation, and safe execution of user-provided hook code
"""
import ast
import asyncio
import traceback
from typing import Dict, Callable, Optional, Tuple, List, Any
import logging
logger = logging.getLogger(__name__)
class UserHookManager:
"""Manages user-provided hook functions with error isolation"""
# Expected signatures for each hook point
HOOK_SIGNATURES = {
"on_browser_created": ["browser"],
"on_page_context_created": ["page", "context"],
"before_goto": ["page", "context", "url"],
"after_goto": ["page", "context", "url", "response"],
"on_user_agent_updated": ["page", "context", "user_agent"],
"on_execution_started": ["page", "context"],
"before_retrieve_html": ["page", "context"],
"before_return_html": ["page", "context", "html"]
}
# Default timeout for hook execution (in seconds)
DEFAULT_TIMEOUT = 30
def __init__(self, timeout: int = DEFAULT_TIMEOUT):
self.timeout = timeout
self.errors: List[Dict[str, Any]] = []
self.compiled_hooks: Dict[str, Callable] = {}
self.execution_log: List[Dict[str, Any]] = []
def validate_hook_structure(self, hook_code: str, hook_point: str) -> Tuple[bool, str]:
"""
Validate the structure of user-provided hook code
Args:
hook_code: The Python code string containing the hook function
hook_point: The hook point name (e.g., 'on_page_context_created')
Returns:
Tuple of (is_valid, error_message)
"""
try:
# Parse the code
tree = ast.parse(hook_code)
# Check if it's empty
if not tree.body:
return False, "Hook code is empty"
# Find the function definition
func_def = None
for node in tree.body:
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
func_def = node
break
if not func_def:
return False, "Hook must contain a function definition (def or async def)"
# Check if it's async (all hooks should be async)
if not isinstance(func_def, ast.AsyncFunctionDef):
return False, f"Hook function must be async (use 'async def' instead of 'def')"
# Get function name for better error messages
func_name = func_def.name
# Validate parameters
expected_params = self.HOOK_SIGNATURES.get(hook_point, [])
if not expected_params:
return False, f"Unknown hook point: {hook_point}"
func_params = [arg.arg for arg in func_def.args.args]
# Check if it has **kwargs for flexibility
has_kwargs = func_def.args.kwarg is not None
# Must have at least the expected parameters
missing_params = []
for expected in expected_params:
if expected not in func_params:
missing_params.append(expected)
if missing_params and not has_kwargs:
return False, f"Hook function '{func_name}' must accept parameters: {', '.join(expected_params)} (missing: {', '.join(missing_params)})"
# Check if it returns something (should return page or browser)
has_return = any(isinstance(node, ast.Return) for node in ast.walk(func_def))
if not has_return:
# Warning, not error - we'll handle this
logger.warning(f"Hook function '{func_name}' should return the {expected_params[0]} object")
return True, "Valid"
except SyntaxError as e:
return False, f"Syntax error at line {e.lineno}: {str(e)}"
except Exception as e:
return False, f"Failed to parse hook code: {str(e)}"
def compile_hook(self, hook_code: str, hook_point: str) -> Optional[Callable]:
"""
Compile user-provided hook code into a callable function
Args:
hook_code: The Python code string
hook_point: The hook point name
Returns:
Compiled function or None if compilation failed
"""
try:
# Create a safe namespace for the hook
# Use a more complete builtins that includes __import__
import builtins
safe_builtins = {}
# Add safe built-in functions
allowed_builtins = [
'print', 'len', 'str', 'int', 'float', 'bool',
'list', 'dict', 'set', 'tuple', 'range', 'enumerate',
'zip', 'map', 'filter', 'any', 'all', 'sum', 'min', 'max',
'sorted', 'reversed', 'abs', 'round', 'isinstance', 'type',
'getattr', 'hasattr', 'setattr', 'callable', 'iter', 'next',
'__import__', '__build_class__' # Required for exec
]
for name in allowed_builtins:
if hasattr(builtins, name):
safe_builtins[name] = getattr(builtins, name)
namespace = {
'__name__': f'user_hook_{hook_point}',
'__builtins__': safe_builtins
}
# Add commonly needed imports
exec("import asyncio", namespace)
exec("import json", namespace)
exec("import re", namespace)
exec("from typing import Dict, List, Optional", namespace)
# Execute the code to define the function
exec(hook_code, namespace)
# Find the async function in the namespace
for name, obj in namespace.items():
if callable(obj) and not name.startswith('_') and asyncio.iscoroutinefunction(obj):
return obj
# If no async function found, look for any function
for name, obj in namespace.items():
if callable(obj) and not name.startswith('_'):
logger.warning(f"Found non-async function '{name}' - wrapping it")
# Wrap sync function in async
async def async_wrapper(*args, **kwargs):
return obj(*args, **kwargs)
return async_wrapper
raise ValueError("No callable function found in hook code")
except Exception as e:
error = {
'hook_point': hook_point,
'error': f"Failed to compile hook: {str(e)}",
'type': 'compilation_error',
'traceback': traceback.format_exc()
}
self.errors.append(error)
logger.error(f"Hook compilation failed for {hook_point}: {str(e)}")
return None
async def execute_hook_safely(
self,
hook_func: Callable,
hook_point: str,
*args,
**kwargs
) -> Tuple[Any, Optional[Dict]]:
"""
Execute a user hook with error isolation and timeout
Args:
hook_func: The compiled hook function
hook_point: The hook point name
*args, **kwargs: Arguments to pass to the hook
Returns:
Tuple of (result, error_dict)
"""
start_time = asyncio.get_event_loop().time()
try:
# Add timeout to prevent infinite loops
result = await asyncio.wait_for(
hook_func(*args, **kwargs),
timeout=self.timeout
)
# Log successful execution
execution_time = asyncio.get_event_loop().time() - start_time
self.execution_log.append({
'hook_point': hook_point,
'status': 'success',
'execution_time': execution_time,
'timestamp': start_time
})
return result, None
except asyncio.TimeoutError:
error = {
'hook_point': hook_point,
'error': f'Hook execution timed out ({self.timeout}s limit)',
'type': 'timeout',
'execution_time': self.timeout
}
self.errors.append(error)
self.execution_log.append({
'hook_point': hook_point,
'status': 'timeout',
'error': error['error'],
'execution_time': self.timeout,
'timestamp': start_time
})
# Return the first argument (usually page/browser) to continue
return args[0] if args else None, error
except Exception as e:
execution_time = asyncio.get_event_loop().time() - start_time
error = {
'hook_point': hook_point,
'error': str(e),
'type': type(e).__name__,
'traceback': traceback.format_exc(),
'execution_time': execution_time
}
self.errors.append(error)
self.execution_log.append({
'hook_point': hook_point,
'status': 'failed',
'error': str(e),
'error_type': type(e).__name__,
'execution_time': execution_time,
'timestamp': start_time
})
# Return the first argument (usually page/browser) to continue
return args[0] if args else None, error
def get_summary(self) -> Dict[str, Any]:
"""Get a summary of hook execution"""
total_hooks = len(self.execution_log)
successful = sum(1 for log in self.execution_log if log['status'] == 'success')
failed = sum(1 for log in self.execution_log if log['status'] == 'failed')
timed_out = sum(1 for log in self.execution_log if log['status'] == 'timeout')
return {
'total_executions': total_hooks,
'successful': successful,
'failed': failed,
'timed_out': timed_out,
'success_rate': (successful / total_hooks * 100) if total_hooks > 0 else 0,
'total_errors': len(self.errors)
}
class IsolatedHookWrapper:
"""Wraps user hooks with error isolation and reporting"""
def __init__(self, hook_manager: UserHookManager):
self.hook_manager = hook_manager
def create_hook_wrapper(self, user_hook: Callable, hook_point: str) -> Callable:
"""
Create a wrapper that isolates hook errors from main process
Args:
user_hook: The compiled user hook function
hook_point: The hook point name
Returns:
Wrapped async function that handles errors gracefully
"""
async def wrapped_hook(*args, **kwargs):
"""Wrapped hook with error isolation"""
# Get the main return object (page/browser)
# This ensures we always have something to return
return_obj = None
if args:
return_obj = args[0]
elif 'page' in kwargs:
return_obj = kwargs['page']
elif 'browser' in kwargs:
return_obj = kwargs['browser']
try:
# Execute user hook with safety
result, error = await self.hook_manager.execute_hook_safely(
user_hook,
hook_point,
*args,
**kwargs
)
if error:
# Hook failed but we continue with original object
logger.warning(f"User hook failed at {hook_point}: {error['error']}")
return return_obj
# Hook succeeded - return its result or the original object
if result is None:
logger.debug(f"Hook at {hook_point} returned None, using original object")
return return_obj
return result
except Exception as e:
# This should rarely happen due to execute_hook_safely
logger.error(f"Unexpected error in hook wrapper for {hook_point}: {e}")
return return_obj
# Set function name for debugging
wrapped_hook.__name__ = f"wrapped_{hook_point}"
return wrapped_hook
async def process_user_hooks(
hooks_input: Dict[str, str],
timeout: int = 30
) -> Tuple[Dict[str, Callable], List[Dict], UserHookManager]:
"""
Process and compile user-provided hook functions
Args:
hooks_input: Dictionary mapping hook points to code strings
timeout: Timeout for each hook execution
Returns:
Tuple of (compiled_hooks, validation_errors, hook_manager)
"""
hook_manager = UserHookManager(timeout=timeout)
wrapper = IsolatedHookWrapper(hook_manager)
compiled_hooks = {}
validation_errors = []
for hook_point, hook_code in hooks_input.items():
# Skip empty hooks
if not hook_code or not hook_code.strip():
continue
# Validate hook point
if hook_point not in UserHookManager.HOOK_SIGNATURES:
validation_errors.append({
'hook_point': hook_point,
'error': f'Unknown hook point. Valid points: {", ".join(UserHookManager.HOOK_SIGNATURES.keys())}',
'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
})
continue
# Validate structure
is_valid, message = hook_manager.validate_hook_structure(hook_code, hook_point)
if not is_valid:
validation_errors.append({
'hook_point': hook_point,
'error': message,
'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
})
continue
# Compile the hook
hook_func = hook_manager.compile_hook(hook_code, hook_point)
if hook_func:
# Wrap with error isolation
wrapped_hook = wrapper.create_hook_wrapper(hook_func, hook_point)
compiled_hooks[hook_point] = wrapped_hook
logger.info(f"Successfully compiled hook for {hook_point}")
else:
validation_errors.append({
'hook_point': hook_point,
'error': 'Failed to compile hook function - check syntax and structure',
'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
})
return compiled_hooks, validation_errors, hook_manager
async def process_user_hooks_with_manager(
hooks_input: Dict[str, str],
hook_manager: UserHookManager
) -> Tuple[Dict[str, Callable], List[Dict]]:
"""
Process and compile user-provided hook functions with existing manager
Args:
hooks_input: Dictionary mapping hook points to code strings
hook_manager: Existing UserHookManager instance
Returns:
Tuple of (compiled_hooks, validation_errors)
"""
wrapper = IsolatedHookWrapper(hook_manager)
compiled_hooks = {}
validation_errors = []
for hook_point, hook_code in hooks_input.items():
# Skip empty hooks
if not hook_code or not hook_code.strip():
continue
# Validate hook point
if hook_point not in UserHookManager.HOOK_SIGNATURES:
validation_errors.append({
'hook_point': hook_point,
'error': f'Unknown hook point. Valid points: {", ".join(UserHookManager.HOOK_SIGNATURES.keys())}',
'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
})
continue
# Validate structure
is_valid, message = hook_manager.validate_hook_structure(hook_code, hook_point)
if not is_valid:
validation_errors.append({
'hook_point': hook_point,
'error': message,
'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
})
continue
# Compile the hook
hook_func = hook_manager.compile_hook(hook_code, hook_point)
if hook_func:
# Wrap with error isolation
wrapped_hook = wrapper.create_hook_wrapper(hook_func, hook_point)
compiled_hooks[hook_point] = wrapped_hook
logger.info(f"Successfully compiled hook for {hook_point}")
else:
validation_errors.append({
'hook_point': hook_point,
'error': 'Failed to compile hook function - check syntax and structure',
'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
})
return compiled_hooks, validation_errors
async def attach_user_hooks_to_crawler(
crawler, # AsyncWebCrawler instance
user_hooks: Dict[str, str],
timeout: int = 30,
hook_manager: Optional[UserHookManager] = None
) -> Tuple[Dict[str, Any], UserHookManager]:
"""
Attach user-provided hooks to crawler with full error reporting
Args:
crawler: AsyncWebCrawler instance
user_hooks: Dictionary mapping hook points to code strings
timeout: Timeout for each hook execution
hook_manager: Optional existing UserHookManager instance
Returns:
Tuple of (status_dict, hook_manager)
"""
# Use provided hook_manager or create a new one
if hook_manager is None:
hook_manager = UserHookManager(timeout=timeout)
# Process hooks with the hook_manager
compiled_hooks, validation_errors = await process_user_hooks_with_manager(
user_hooks, hook_manager
)
# Log validation errors
if validation_errors:
logger.warning(f"Hook validation errors: {validation_errors}")
# Attach successfully compiled hooks
attached_hooks = []
for hook_point, wrapped_hook in compiled_hooks.items():
try:
crawler.crawler_strategy.set_hook(hook_point, wrapped_hook)
attached_hooks.append(hook_point)
logger.info(f"Attached hook to {hook_point}")
except Exception as e:
logger.error(f"Failed to attach hook to {hook_point}: {e}")
validation_errors.append({
'hook_point': hook_point,
'error': f'Failed to attach hook: {str(e)}'
})
status = 'success' if not validation_errors else ('partial' if attached_hooks else 'failed')
status_dict = {
'status': status,
'attached_hooks': attached_hooks,
'validation_errors': validation_errors,
'total_hooks_provided': len(user_hooks),
'successfully_attached': len(attached_hooks),
'failed_validation': len(validation_errors)
}
return status_dict, hook_manager

View File

@@ -9,6 +9,50 @@ class CrawlRequest(BaseModel):
browser_config: Optional[Dict] = Field(default_factory=dict)
crawler_config: Optional[Dict] = Field(default_factory=dict)
class HookConfig(BaseModel):
"""Configuration for user-provided hooks"""
code: Dict[str, str] = Field(
default_factory=dict,
description="Map of hook points to Python code strings"
)
timeout: int = Field(
default=30,
ge=1,
le=120,
description="Timeout in seconds for each hook execution"
)
class Config:
schema_extra = {
"example": {
"code": {
"on_page_context_created": """
async def hook(page, context, **kwargs):
# Block images to speed up crawling
await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort())
return page
""",
"before_retrieve_html": """
async def hook(page, context, **kwargs):
# Scroll to load lazy content
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(2000)
return page
"""
},
"timeout": 30
}
}
class CrawlRequestWithHooks(CrawlRequest):
"""Extended crawl request with hooks support"""
hooks: Optional[HookConfig] = Field(
default=None,
description="Optional user-provided hook functions"
)
class MarkdownRequest(BaseModel):
"""Request body for the /md endpoint."""
url: str = Field(..., description="Absolute http/https URL to fetch")

View File

@@ -23,7 +23,7 @@ from api import (
stream_results
)
from schemas import (
CrawlRequest,
CrawlRequestWithHooks,
MarkdownRequest,
RawCode,
HTMLRequest,
@@ -462,6 +462,72 @@ async def get_schema():
"crawler": CrawlerRunConfig().dump()}
@app.get("/hooks/info")
async def get_hooks_info():
"""Get information about available hook points and their signatures"""
from hook_manager import UserHookManager
hook_info = {}
for hook_point, params in UserHookManager.HOOK_SIGNATURES.items():
hook_info[hook_point] = {
"parameters": params,
"description": get_hook_description(hook_point),
"example": get_hook_example(hook_point)
}
return JSONResponse({
"available_hooks": hook_info,
"timeout_limits": {
"min": 1,
"max": 120,
"default": 30
}
})
def get_hook_description(hook_point: str) -> str:
"""Get description for each hook point"""
descriptions = {
"on_browser_created": "Called after browser instance is created",
"on_page_context_created": "Called after page and context are created - ideal for authentication",
"before_goto": "Called before navigating to the target URL",
"after_goto": "Called after navigation is complete",
"on_user_agent_updated": "Called when user agent is updated",
"on_execution_started": "Called when custom JavaScript execution begins",
"before_retrieve_html": "Called before retrieving the final HTML - ideal for scrolling",
"before_return_html": "Called just before returning the HTML content"
}
return descriptions.get(hook_point, "")
def get_hook_example(hook_point: str) -> str:
"""Get example code for each hook point"""
examples = {
"on_page_context_created": """async def hook(page, context, **kwargs):
# Add authentication cookie
await context.add_cookies([{
'name': 'session',
'value': 'my-session-id',
'domain': '.example.com'
}])
return page""",
"before_retrieve_html": """async def hook(page, context, **kwargs):
# Scroll to load lazy content
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(2000)
return page""",
"before_goto": """async def hook(page, context, url, **kwargs):
# Set custom headers
await page.set_extra_http_headers({
'X-Custom-Header': 'value'
})
return page"""
}
return examples.get(hook_point, "# Implement your hook logic here\nreturn page")
@app.get(config["observability"]["health_check"]["endpoint"])
async def health():
return {"status": "ok", "timestamp": time.time(), "version": __version__}
@@ -477,12 +543,13 @@ async def metrics():
@mcp_tool("crawl")
async def crawl(
request: Request,
crawl_request: CrawlRequest,
crawl_request: CrawlRequestWithHooks,
_td: Dict = Depends(token_dep),
):
"""
Crawl a list of URLs and return the results as JSON.
For streaming responses, use /crawl/stream endpoint.
Supports optional user-provided hook functions for customization.
"""
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
@@ -490,11 +557,21 @@ async def crawl(
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
if crawler_config.stream:
return await stream_process(crawl_request=crawl_request)
# Prepare hooks config if provided
hooks_config = None
if crawl_request.hooks:
hooks_config = {
'code': crawl_request.hooks.code,
'timeout': crawl_request.hooks.timeout
}
results = await handle_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config,
hooks_config=hooks_config
)
# check if all of the results are not successful
if all(not result["success"] for result in results["results"]):
@@ -506,7 +583,7 @@ async def crawl(
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl_stream(
request: Request,
crawl_request: CrawlRequest,
crawl_request: CrawlRequestWithHooks,
_td: Dict = Depends(token_dep),
):
if not crawl_request.urls:
@@ -514,21 +591,38 @@ async def crawl_stream(
return await stream_process(crawl_request=crawl_request)
async def stream_process(crawl_request: CrawlRequest):
crawler, gen = await handle_stream_crawl_request(
async def stream_process(crawl_request: CrawlRequestWithHooks):
# Prepare hooks config if provided# Prepare hooks config if provided
hooks_config = None
if crawl_request.hooks:
hooks_config = {
'code': crawl_request.hooks.code,
'timeout': crawl_request.hooks.timeout
}
crawler, gen, hooks_info = await handle_stream_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config,
)
hooks_config=hooks_config
)
# Add hooks info to response headers if available
headers = {
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Stream-Status": "active",
}
if hooks_info:
import json
headers["X-Hooks-Status"] = json.dumps(hooks_info['status']['status'])
return StreamingResponse(
stream_results(crawler, gen),
media_type="application/x-ndjson",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Stream-Status": "active",
},
headers=headers,
)