From 1a8e0236af25c97ca961c61fd132163a20e2ce90 Mon Sep 17 00:00:00 2001 From: AHMET YILMAZ Date: Wed, 1 Oct 2025 15:53:56 +0800 Subject: [PATCH] feat(adaptive-crawling): implement adaptive crawling endpoints and integrate with server --- deploy/docker/routers/__init__.py | 0 .../adaptive.py} | 41 +----- deploy/docker/routers/scripts.py | 135 ++++++++++++++++++ deploy/docker/schemas.py | 93 +++++++++--- deploy/docker/server.py | 5 +- 5 files changed, 215 insertions(+), 59 deletions(-) create mode 100644 deploy/docker/routers/__init__.py rename deploy/docker/{adaptive_routes.py => routers/adaptive.py} (75%) create mode 100644 deploy/docker/routers/scripts.py diff --git a/deploy/docker/routers/__init__.py b/deploy/docker/routers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deploy/docker/adaptive_routes.py b/deploy/docker/routers/adaptive.py similarity index 75% rename from deploy/docker/adaptive_routes.py rename to deploy/docker/routers/adaptive.py index 05686342..274c0b6e 100644 --- a/deploy/docker/adaptive_routes.py +++ b/deploy/docker/routers/adaptive.py @@ -1,9 +1,8 @@ -import asyncio import uuid -from typing import Any, Dict, List, Optional +from typing import Any, Dict -from fastapi import APIRouter, BackgroundTasks, Body, HTTPException -from pydantic import BaseModel, Field +from fastapi import APIRouter, BackgroundTasks, HTTPException +from schemas import AdaptiveConfigPayload, AdaptiveCrawlRequest, AdaptiveJobStatus from crawl4ai import AsyncWebCrawler from crawl4ai.adaptive_crawler import AdaptiveConfig, AdaptiveCrawler @@ -12,40 +11,6 @@ from crawl4ai.utils import get_error_context # --- In-memory storage for job statuses. For production, use Redis or a database. 
--- ADAPTIVE_JOBS: Dict[str, Dict[str, Any]] = {} -# --- Pydantic Models for API Validation --- - - -class AdaptiveConfigPayload(BaseModel): - """Pydantic model for receiving AdaptiveConfig parameters.""" - - confidence_threshold: float = 0.7 - max_pages: int = 20 - top_k_links: int = 3 - strategy: str = "statistical" # "statistical" or "embedding" - embedding_model: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2" - # Add any other AdaptiveConfig fields you want to expose - - -class AdaptiveCrawlRequest(BaseModel): - """Input model for the adaptive digest job.""" - - start_url: str = Field(..., description="The starting URL for the adaptive crawl.") - query: str = Field(..., description="The user query to guide the crawl.") - config: Optional[AdaptiveConfigPayload] = Field( - None, description="Optional adaptive crawler configuration." - ) - - -class AdaptiveJobStatus(BaseModel): - """Output model for the job status.""" - - task_id: str - status: str - metrics: Optional[Dict[str, Any]] = None - result: Optional[Dict[str, Any]] = None - error: Optional[str] = None - - # --- APIRouter for Adaptive Crawling Endpoints --- router = APIRouter( prefix="/adaptive/digest", diff --git a/deploy/docker/routers/scripts.py b/deploy/docker/routers/scripts.py new file mode 100644 index 00000000..4190ad6b --- /dev/null +++ b/deploy/docker/routers/scripts.py @@ -0,0 +1,135 @@ +from typing import Optional + +from fastapi import APIRouter, File, Form, HTTPException, UploadFile +from schemas import C4AScriptPayload + +from crawl4ai.script import ( + CompilationResult, + ValidationResult, + # ErrorDetail +) + +# Import all necessary components from the crawl4ai library +# C4A Script Language Support +from crawl4ai.script import ( + compile as c4a_compile, +) +from crawl4ai.script import ( + validate as c4a_validate, +) + +# --- APIRouter for c4a Scripts Endpoints --- +router = APIRouter( + prefix="/c4a", + tags=["c4a Scripts"], +) + +# --- C4A Script Endpoints --- + + 
+@router.post( + "/validate", response_model=ValidationResult, summary="Validate a C4A-Script" +) +async def validate_c4a_script_endpoint(payload: C4AScriptPayload): + """ + Validates the syntax of a C4A-Script without compiling it. + + Returns a `ValidationResult` object indicating whether the script is + valid and providing detailed error information if it's not. + """ + # The validate function is designed not to raise exceptions + validation_result = c4a_validate(payload.script) + return validation_result + + +@router.post( + "/compile", response_model=CompilationResult, summary="Compile a C4A-Script" +) +async def compile_c4a_script_endpoint(payload: C4AScriptPayload): + """ + Compiles a C4A-Script into executable JavaScript. + + If successful, returns the compiled JavaScript code. If there are syntax + errors, it returns a detailed error report. + """ + # The compile function also returns a result object instead of raising + compilation_result = c4a_compile(payload.script) + + if not compilation_result.success: + # You can optionally raise an HTTP exception for failed compilations + # This makes it clearer on the client-side that it was a bad request + raise HTTPException( + status_code=400, + detail=compilation_result.to_dict(), # FastAPI will serialize this + ) + + return compilation_result + + +@router.post( + "/compile-file", + response_model=CompilationResult, + summary="Compile a C4A-Script from file or string", +) +async def compile_c4a_script_file_endpoint( + file: Optional[UploadFile] = File(None), script: Optional[str] = Form(None) +): + """ + Compiles a C4A-Script into executable JavaScript from either an uploaded file or string content. + + Accepts either: + - A file upload containing the C4A-Script + - A string containing the C4A-Script content + + Exactly one of the parameters must be provided. + + If successful, returns the compiled JavaScript code. If there are syntax + errors, it returns a detailed error report. 
+ """ + script_content = None + + # Validate that at least one input is provided + if not file and not script: + raise HTTPException( + status_code=400, + detail={"error": "Either 'file' or 'script' parameter must be provided"}, + ) + + # If both are provided, reject the request as ambiguous + if file and script: + raise HTTPException( + status_code=400, + detail={"error": "Please provide either 'file' or 'script', not both"}, + ) + + # Handle file upload + if file: + try: + file_content = await file.read() + script_content = file_content.decode("utf-8") + except UnicodeDecodeError as exc: + raise HTTPException( + status_code=400, + detail={"error": "File must be a valid UTF-8 text file"}, + ) from exc + except Exception as e: + raise HTTPException( + status_code=400, detail={"error": f"Error reading file: {str(e)}"} + ) from e + + # Handle string content + elif script: + script_content = script + + # Compile the script content + compilation_result = c4a_compile(script_content) + + if not compilation_result.success: + # You can optionally raise an HTTP exception for failed compilations + # This makes it clearer on the client-side that it was a bad request + raise HTTPException( + status_code=400, + detail=compilation_result.to_dict(), # FastAPI will serialize this + ) + + return compilation_result diff --git a/deploy/docker/schemas.py b/deploy/docker/schemas.py index aa479693..5263bfeb 100644 --- a/deploy/docker/schemas.py +++ b/deploy/docker/schemas.py @@ -1,5 +1,6 @@ -from typing import List, Optional, Dict, Any from enum import Enum +from typing import Any, Dict, List, Optional + from pydantic import BaseModel, Field from utils import FilterType @@ -12,17 +13,17 @@ class CrawlRequest(BaseModel): class HookConfig(BaseModel): """Configuration for user-provided hooks""" + code: Dict[str, str] = Field( - default_factory=dict, - description="Map of hook points to Python code strings" + default_factory=dict, description="Map of hook points to Python code strings" ) timeout: int = 
Field( default=30, ge=1, le=120, - description="Timeout in seconds for each hook execution" + description="Timeout in seconds for each hook execution", ) - + class Config: schema_extra = { "example": { @@ -39,42 +40,53 @@ async def hook(page, context, **kwargs): await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.wait_for_timeout(2000) return page -""" +""", }, - "timeout": 30 + "timeout": 30, } } class CrawlRequestWithHooks(CrawlRequest): """Extended crawl request with hooks support""" + hooks: Optional[HookConfig] = Field( - default=None, - description="Optional user-provided hook functions" + default=None, description="Optional user-provided hook functions" ) + class MarkdownRequest(BaseModel): """Request body for the /md endpoint.""" - url: str = Field(..., description="Absolute http/https URL to fetch") - f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm") - q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters") - c: Optional[str] = Field("0", description="Cache‑bust / revision counter") - provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')") - temperature: Optional[float] = Field(None, description="LLM temperature override (0.0-2.0)") + + url: str = Field(..., description="Absolute http/https URL to fetch") + f: FilterType = Field( + FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm" + ) + q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters") + c: Optional[str] = Field("0", description="Cache‑bust / revision counter") + provider: Optional[str] = Field( + None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')" + ) + temperature: Optional[float] = Field( + None, description="LLM temperature override (0.0-2.0)" + ) base_url: Optional[str] = Field(None, description="LLM API base URL override") class RawCode(BaseModel): code: str 
+ class HTMLRequest(BaseModel): url: str - + + class ScreenshotRequest(BaseModel): url: str screenshot_wait_for: Optional[float] = 2 output_path: Optional[str] = None + class PDFRequest(BaseModel): url: str output_path: Optional[str] = None @@ -83,12 +95,55 @@ class PDFRequest(BaseModel): class JSEndpointRequest(BaseModel): url: str scripts: List[str] = Field( - ..., - description="List of separated JavaScript snippets to execute" + ..., description="List of separated JavaScript snippets to execute" ) class SeedRequest(BaseModel): """Request model for URL seeding endpoint.""" + url: str = Field(..., example="https://docs.crawl4ai.com") - config: Dict[str, Any] = Field(default_factory=dict) \ No newline at end of file + config: Dict[str, Any] = Field(default_factory=dict) + + +# --- C4A Script Schemas --- + + +class C4AScriptPayload(BaseModel): + """Input model for receiving a C4A-Script.""" + + script: str = Field(..., description="The C4A-Script content to process.") + + +# --- Adaptive Crawling Schemas --- + + +class AdaptiveConfigPayload(BaseModel): + """Pydantic model for receiving AdaptiveConfig parameters.""" + + confidence_threshold: float = 0.7 + max_pages: int = 20 + top_k_links: int = 3 + strategy: str = "statistical" # "statistical" or "embedding" + embedding_model: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2" + # Add any other AdaptiveConfig fields you want to expose + + +class AdaptiveCrawlRequest(BaseModel): + """Input model for the adaptive digest job.""" + + start_url: str = Field(..., description="The starting URL for the adaptive crawl.") + query: str = Field(..., description="The user query to guide the crawl.") + config: Optional[AdaptiveConfigPayload] = Field( + None, description="Optional adaptive crawler configuration." 
+ ) + + +class AdaptiveJobStatus(BaseModel): + """Output model for the job status.""" + + task_id: str + status: str + metrics: Optional[Dict[str, Any]] = None + result: Optional[Dict[str, Any]] = None + error: Optional[str] = None diff --git a/deploy/docker/server.py b/deploy/docker/server.py index b5288646..1fe2783f 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -20,7 +20,6 @@ from pathlib import Path from typing import Any, Dict, List, Optional from urllib.parse import urlparse -import adaptive_routes from api import ( handle_crawl_request, handle_llm_qa, @@ -48,6 +47,7 @@ from prometheus_fastapi_instrumentator import Instrumentator from pydantic import BaseModel, Field from rank_bm25 import BM25Okapi from redis import asyncio as aioredis +from routers import adaptive, scripts from schemas import ( CrawlRequestWithHooks, HTMLRequest, @@ -124,7 +124,6 @@ app = FastAPI( lifespan=lifespan, ) -app.include_router(adaptive_routes.router) # ── static playground ────────────────────────────────────── STATIC_DIR = pathlib.Path(__file__).parent / "static" / "playground" if not STATIC_DIR.exists(): @@ -219,6 +218,8 @@ def _safe_eval_config(expr: str) -> dict: # ── job router ────────────────────────────────────────────── app.include_router(init_job_router(redis, config, token_dep)) +app.include_router(adaptive.router) +app.include_router(scripts.router) # ──────────────────────── Endpoints ──────────────────────────