Files
crawl4ai/deploy/docker/schemas.py

106 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from enum import Enum
from typing import Dict, List, Optional

from pydantic import BaseModel, ConfigDict, Field, HttpUrl

from utils import FilterType
class CrawlRequest(BaseModel):
    """Request body listing the URLs to crawl plus optional configuration.

    Attributes:
        urls: Required list of URL strings; the field is declared with
            ``min_length=1`` and ``max_length=100``.
        browser_config: Optional free-form dict; a fresh empty dict is
            used when the field is omitted.
        crawler_config: Optional free-form dict; a fresh empty dict is
            used when the field is omitted.
    """

    urls: List[str] = Field(..., min_length=1, max_length=100)
    # default_factory gives every request its own dict instead of a
    # shared mutable default.
    browser_config: Optional[Dict] = Field(
        default_factory=dict,
    )
    crawler_config: Optional[Dict] = Field(
        default_factory=dict,
    )
class HookConfig(BaseModel):
    """Configuration for user-provided hooks.

    Attributes:
        code: Map of hook point names to strings of Python source code.
            Defaults to an empty mapping.
        timeout: Timeout in seconds for each hook execution. Must be
            between 1 and 120 inclusive; defaults to 30.
    """

    # Pydantic v2 takes schema examples from ``model_config`` under the
    # ``json_schema_extra`` key. The v1 spelling (an inner ``class Config``
    # with ``schema_extra``) only triggers a warning in v2 and the example
    # is dropped from the generated schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "code": {
                    "on_page_context_created": """
async def hook(page, context, **kwargs):
    # Block images to speed up crawling
    await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort())
    return page
""",
                    "before_retrieve_html": """
async def hook(page, context, **kwargs):
    # Scroll to load lazy content
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(2000)
    return page
"""
                },
                "timeout": 30
            }
        }
    )

    code: Dict[str, str] = Field(
        default_factory=dict,
        description="Map of hook points to Python code strings"
    )
    timeout: int = Field(
        default=30,
        ge=1,
        le=120,
        description="Timeout in seconds for each hook execution"
    )
class CrawlRequestWithHooks(CrawlRequest):
    """Crawl request that can additionally carry a hook configuration.

    Inherits every field of ``CrawlRequest`` and adds ``hooks``, which is
    ``None`` unless the caller supplies a ``HookConfig``.
    """

    hooks: Optional[HookConfig] = Field(
        None,
        description="Optional user-provided hook functions",
    )
class MarkdownRequest(BaseModel):
    """Request body for the /md endpoint.

    Attributes:
        url: Required; absolute http/https URL to fetch.
        f: Content-filter strategy; defaults to ``FilterType.FIT``.
        q: Optional query string used by the BM25/LLM filters.
        c: Cache-bust / revision counter; defaults to ``"0"``.
        provider: Optional LLM provider override.
        temperature: Optional LLM temperature override; when given it
            must lie in the inclusive range 0.0-2.0.
        base_url: Optional LLM API base URL override.
    """

    url: str = Field(..., description="Absolute http/https URL to fetch")
    f: FilterType = Field(
        FilterType.FIT,
        description="Content-filter strategy: fit, raw, bm25, or llm",
    )
    q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
    c: Optional[str] = Field("0", description="Cache-bust / revision counter")
    provider: Optional[str] = Field(
        None,
        description="LLM provider override (e.g., 'anthropic/claude-3-opus')",
    )
    # Enforce the range the description already promises, so out-of-range
    # values are rejected at validation time. ``None`` (no override) is
    # still accepted.
    temperature: Optional[float] = Field(
        None,
        ge=0.0,
        le=2.0,
        description="LLM temperature override (0.0-2.0)",
    )
    base_url: Optional[str] = Field(None, description="LLM API base URL override")
class RawCode(BaseModel):
    """Model wrapping a single required string of code."""

    code: str = Field(...)
class HTMLRequest(BaseModel):
    """Request body consisting of one required ``url`` string."""

    url: str = Field(...)
class ScreenshotRequest(BaseModel):
    """Request body for taking a screenshot of a URL.

    Attributes:
        url: Required URL string.
        screenshot_wait_for: Optional number, default 2.
        output_path: Optional string, default ``None``.
    """

    url: str = Field(...)
    # NOTE(review): presumably a delay in seconds before the capture;
    # confirm against the handler that consumes this model.
    screenshot_wait_for: Optional[float] = Field(default=2)
    # NOTE(review): presumably where the image is written when set;
    # confirm against the handler.
    output_path: Optional[str] = Field(default=None)
class PDFRequest(BaseModel):
    """Request body for producing a PDF from a URL.

    Attributes:
        url: Required URL string.
        output_path: Optional string, default ``None``.
    """

    url: str = Field(...)
    # NOTE(review): presumably where the PDF is written when set;
    # confirm against the handler.
    output_path: Optional[str] = Field(default=None)
class JSEndpointRequest(BaseModel):
    """Request body pairing a URL with JavaScript snippets to execute.

    Attributes:
        url: Required URL string.
        scripts: Required list of separate JavaScript snippet strings.
    """

    url: str = Field(...)
    # No default is given, so the field stays required.
    scripts: List[str] = Field(
        description="List of separated JavaScript snippets to execute",
    )
class WebhookConfig(BaseModel):
    """Settings for webhook notifications.

    Attributes:
        webhook_url: Required URL, validated as ``HttpUrl``.
        webhook_data_in_payload: Boolean flag, default ``False``.
        webhook_headers: Optional mapping of header names to values,
            default ``None``.
    """

    webhook_url: HttpUrl = Field(...)
    webhook_data_in_payload: bool = Field(default=False)
    webhook_headers: Optional[Dict[str, str]] = Field(default=None)
class WebhookPayload(BaseModel):
    """Body delivered to webhook endpoints.

    Attributes:
        task_id: Required task identifier string.
        task_type: Required string such as "crawl" or "llm_extraction".
        status: Required string, "completed" or "failed".
        timestamp: Required string in ISO 8601 format.
        urls: Required list of URL strings.
        error: Optional error text, default ``None``.
        data: Optional dict, default ``None``; included only if
            ``webhook_data_in_payload=True``.
    """

    task_id: str = Field(...)
    # "crawl", "llm_extraction", etc.
    task_type: str = Field(...)
    # "completed" or "failed"
    status: str = Field(...)
    # ISO 8601 format
    timestamp: str = Field(...)
    urls: List[str] = Field(...)
    error: Optional[str] = Field(default=None)
    # Included only if webhook_data_in_payload=True
    data: Optional[Dict] = Field(default=None)