feat(adaptive-crawling): implement adaptive crawling endpoints and integrate with server
This commit is contained in:
0
deploy/docker/routers/__init__.py
Normal file
0
deploy/docker/routers/__init__.py
Normal file
@@ -1,9 +1,8 @@
|
||||
import asyncio
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional
|
||||
from typing import Any, Dict
|
||||
|
||||
from fastapi import APIRouter, BackgroundTasks, Body, HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
from fastapi import APIRouter, BackgroundTasks, HTTPException
|
||||
from schemas import AdaptiveConfigPayload, AdaptiveCrawlRequest, AdaptiveJobStatus
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import AdaptiveConfig, AdaptiveCrawler
|
||||
@@ -12,40 +11,6 @@ from crawl4ai.utils import get_error_context
|
||||
# --- In-memory storage for job statuses. For production, use Redis or a database. ---
|
||||
ADAPTIVE_JOBS: Dict[str, Dict[str, Any]] = {}
|
||||
|
||||
# --- Pydantic Models for API Validation ---
|
||||
|
||||
|
||||
class AdaptiveConfigPayload(BaseModel):
|
||||
"""Pydantic model for receiving AdaptiveConfig parameters."""
|
||||
|
||||
confidence_threshold: float = 0.7
|
||||
max_pages: int = 20
|
||||
top_k_links: int = 3
|
||||
strategy: str = "statistical" # "statistical" or "embedding"
|
||||
embedding_model: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2"
|
||||
# Add any other AdaptiveConfig fields you want to expose
|
||||
|
||||
|
||||
class AdaptiveCrawlRequest(BaseModel):
|
||||
"""Input model for the adaptive digest job."""
|
||||
|
||||
start_url: str = Field(..., description="The starting URL for the adaptive crawl.")
|
||||
query: str = Field(..., description="The user query to guide the crawl.")
|
||||
config: Optional[AdaptiveConfigPayload] = Field(
|
||||
None, description="Optional adaptive crawler configuration."
|
||||
)
|
||||
|
||||
|
||||
class AdaptiveJobStatus(BaseModel):
|
||||
"""Output model for the job status."""
|
||||
|
||||
task_id: str
|
||||
status: str
|
||||
metrics: Optional[Dict[str, Any]] = None
|
||||
result: Optional[Dict[str, Any]] = None
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
# --- APIRouter for Adaptive Crawling Endpoints ---
|
||||
router = APIRouter(
|
||||
prefix="/adaptive/digest",
|
||||
135
deploy/docker/routers/scripts.py
Normal file
135
deploy/docker/routers/scripts.py
Normal file
@@ -0,0 +1,135 @@
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
||||
from schemas import C4AScriptPayload
|
||||
|
||||
from crawl4ai.script import (
|
||||
CompilationResult,
|
||||
ValidationResult,
|
||||
# ErrorDetail
|
||||
)
|
||||
|
||||
# Import all necessary components from the crawl4ai library
|
||||
# C4A Script Language Support
|
||||
from crawl4ai.script import (
|
||||
compile as c4a_compile,
|
||||
)
|
||||
from crawl4ai.script import (
|
||||
validate as c4a_validate,
|
||||
)
|
||||
|
||||
# --- APIRouter for c4a Scripts Endpoints ---
|
||||
# Router collecting the C4A-Script endpoints; every route registered on it is
# served under the "/c4a" prefix and tagged "c4a Scripts".
router = APIRouter(tags=["c4a Scripts"], prefix="/c4a")
|
||||
|
||||
# --- Endpoints ---
|
||||
|
||||
|
||||
@router.post(
    "/validate",
    summary="Validate a C4A-Script",
    response_model=ValidationResult,
)
async def validate_c4a_script_endpoint(payload: C4AScriptPayload):
    """
    Check the syntax of a C4A-Script without compiling it.

    The script text is taken from `payload.script` and passed to the
    validator; the resulting `ValidationResult` is sent back as-is, whether
    or not the script turned out to be valid.
    """
    # No try/except: the validator is designed to report problems through its
    # result object rather than by raising.
    return c4a_validate(payload.script)
|
||||
|
||||
|
||||
@router.post(
    "/compile",
    summary="Compile a C4A-Script",
    response_model=CompilationResult,
)
async def compile_c4a_script_endpoint(payload: C4AScriptPayload):
    """
    Compile the C4A-Script in `payload.script` into executable JavaScript.

    Returns:
        The `CompilationResult` produced by the compiler when its `success`
        flag is set.

    Raises:
        HTTPException: status 400, with the result's `to_dict()` output as
            the detail, when the compilation did not succeed.
    """
    # The compiler reports failure through the result object, not by raising.
    outcome = c4a_compile(payload.script)
    if outcome.success:
        return outcome

    # Surface a failed compilation as a client error so callers can tell it
    # apart from a successful response; FastAPI serializes the dict detail.
    raise HTTPException(status_code=400, detail=outcome.to_dict())
|
||||
|
||||
|
||||
@router.post(
    "/compile-file",
    response_model=CompilationResult,
    summary="Compile a C4A-Script from file or string",
)
async def compile_c4a_script_file_endpoint(
    file: Optional[UploadFile] = File(None), script: Optional[str] = Form(None)
):
    """
    Compiles a C4A-Script into executable JavaScript from either an uploaded
    file or string content.

    Accepts exactly one of:
    - `file`: a file upload containing the C4A-Script (decoded as UTF-8)
    - `script`: a form field containing the C4A-Script content

    Supplying neither input, or both, is rejected with HTTP 400. An empty
    `script` string is treated the same as a missing one.

    Returns:
        The `CompilationResult` produced by the compiler when its `success`
        flag is set.

    Raises:
        HTTPException: status 400 when no input or both inputs are given,
            when the uploaded file cannot be read or is not valid UTF-8, or
            when compilation fails (the detail is then the result's
            `to_dict()` output).
    """
    # Reject requests that carry no usable input at all.
    if not file and not script:
        raise HTTPException(
            status_code=400,
            detail={"error": "Either 'file' or 'script' parameter must be provided"},
        )

    # The two inputs are mutually exclusive: rather than silently preferring
    # one of them, ask the client to send only one.
    if file and script:
        raise HTTPException(
            status_code=400,
            detail={"error": "Please provide either 'file' or 'script', not both"},
        )

    if file:
        # Uploaded file: read the raw bytes and decode them as UTF-8 text.
        try:
            file_content = await file.read()
            script_content = file_content.decode("utf-8")
        except UnicodeDecodeError as exc:
            raise HTTPException(
                status_code=400,
                detail={"error": "File must be a valid UTF-8 text file"},
            ) from exc
        except Exception as e:
            raise HTTPException(
                status_code=400, detail={"error": f"Error reading file: {str(e)}"}
            ) from e
    else:
        # The guards above guarantee `script` is non-empty on this branch.
        script_content = script

    # The compiler reports failure through the result object, not by raising.
    compilation_result = c4a_compile(script_content)

    if not compilation_result.success:
        # Surface a failed compilation as a client error; FastAPI serializes
        # the dict detail.
        raise HTTPException(
            status_code=400,
            detail=compilation_result.to_dict(),
        )

    return compilation_result
|
||||
@@ -1,5 +1,6 @@
|
||||
from typing import List, Optional, Dict, Any
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from utils import FilterType
|
||||
|
||||
@@ -12,17 +13,17 @@ class CrawlRequest(BaseModel):
|
||||
|
||||
class HookConfig(BaseModel):
|
||||
"""Configuration for user-provided hooks"""
|
||||
|
||||
code: Dict[str, str] = Field(
|
||||
default_factory=dict,
|
||||
description="Map of hook points to Python code strings"
|
||||
default_factory=dict, description="Map of hook points to Python code strings"
|
||||
)
|
||||
timeout: int = Field(
|
||||
default=30,
|
||||
ge=1,
|
||||
le=120,
|
||||
description="Timeout in seconds for each hook execution"
|
||||
description="Timeout in seconds for each hook execution",
|
||||
)
|
||||
|
||||
|
||||
class Config:
|
||||
schema_extra = {
|
||||
"example": {
|
||||
@@ -39,42 +40,53 @@ async def hook(page, context, **kwargs):
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
await page.wait_for_timeout(2000)
|
||||
return page
|
||||
"""
|
||||
""",
|
||||
},
|
||||
"timeout": 30
|
||||
"timeout": 30,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class CrawlRequestWithHooks(CrawlRequest):
|
||||
"""Extended crawl request with hooks support"""
|
||||
|
||||
hooks: Optional[HookConfig] = Field(
|
||||
default=None,
|
||||
description="Optional user-provided hook functions"
|
||||
default=None, description="Optional user-provided hook functions"
|
||||
)
|
||||
|
||||
|
||||
class MarkdownRequest(BaseModel):
|
||||
"""Request body for the /md endpoint."""
|
||||
url: str = Field(..., description="Absolute http/https URL to fetch")
|
||||
f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
|
||||
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
||||
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
||||
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
|
||||
temperature: Optional[float] = Field(None, description="LLM temperature override (0.0-2.0)")
|
||||
|
||||
url: str = Field(..., description="Absolute http/https URL to fetch")
|
||||
f: FilterType = Field(
|
||||
FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm"
|
||||
)
|
||||
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
||||
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
||||
provider: Optional[str] = Field(
|
||||
None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')"
|
||||
)
|
||||
temperature: Optional[float] = Field(
|
||||
None, description="LLM temperature override (0.0-2.0)"
|
||||
)
|
||||
base_url: Optional[str] = Field(None, description="LLM API base URL override")
|
||||
|
||||
|
||||
class RawCode(BaseModel):
    # Body model with a single required string field.
    code: str = Field(default=...)
|
||||
|
||||
|
||||
class HTMLRequest(BaseModel):
    # Body model with a single required string field holding the target URL.
    url: str = Field(default=...)
|
||||
|
||||
|
||||
|
||||
class ScreenshotRequest(BaseModel):
    # Required target URL.
    url: str = Field(default=...)
    # Optional wait before the screenshot; defaults to 2.
    # NOTE(review): unit is presumably seconds - confirm against the handler.
    screenshot_wait_for: Optional[float] = Field(default=2)
    # Optional output location; None when the caller does not supply one.
    output_path: Optional[str] = Field(default=None)
|
||||
|
||||
|
||||
class PDFRequest(BaseModel):
|
||||
url: str
|
||||
output_path: Optional[str] = None
|
||||
@@ -83,12 +95,55 @@ class PDFRequest(BaseModel):
|
||||
class JSEndpointRequest(BaseModel):
|
||||
url: str
|
||||
scripts: List[str] = Field(
|
||||
...,
|
||||
description="List of separated JavaScript snippets to execute"
|
||||
..., description="List of separated JavaScript snippets to execute"
|
||||
)
|
||||
|
||||
|
||||
class SeedRequest(BaseModel):
|
||||
"""Request model for URL seeding endpoint."""
|
||||
|
||||
url: str = Field(..., example="https://docs.crawl4ai.com")
|
||||
config: Dict[str, Any] = Field(default_factory=dict)
|
||||
config: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
# --- C4A Script Schemas ---
|
||||
|
||||
|
||||
class C4AScriptPayload(BaseModel):
    """Request body carrying a single C4A-Script."""

    # Required field: the script text itself.
    script: str = Field(
        default=...,
        description="The C4A-Script content to process.",
    )
|
||||
|
||||
|
||||
# --- Adaptive Crawling Schemas ---
|
||||
|
||||
|
||||
class AdaptiveConfigPayload(BaseModel):
    """AdaptiveConfig parameters that a client may supply in a request."""

    confidence_threshold: float = Field(default=0.7)
    max_pages: int = Field(default=20)
    top_k_links: int = Field(default=3)
    # Expected values are "statistical" or "embedding"; the model itself
    # accepts any string.
    strategy: str = Field(default="statistical")
    embedding_model: Optional[str] = Field(
        default="sentence-transformers/all-MiniLM-L6-v2"
    )
    # Add any other AdaptiveConfig fields you want to expose
|
||||
|
||||
|
||||
class AdaptiveCrawlRequest(BaseModel):
    """Request body for starting an adaptive digest job."""

    # Both the start URL and the query are required.
    start_url: str = Field(
        default=...,
        description="The starting URL for the adaptive crawl.",
    )
    query: str = Field(
        default=...,
        description="The user query to guide the crawl.",
    )
    # Left as None when the client sends no configuration.
    config: Optional[AdaptiveConfigPayload] = Field(
        default=None,
        description="Optional adaptive crawler configuration.",
    )
|
||||
|
||||
|
||||
class AdaptiveJobStatus(BaseModel):
    """Response body reporting the status of an adaptive job."""

    # Always present.
    task_id: str
    status: str
    # Optional details; each defaults to None when not set.
    metrics: Optional[Dict[str, Any]] = Field(default=None)
    result: Optional[Dict[str, Any]] = Field(default=None)
    error: Optional[str] = Field(default=None)
|
||||
|
||||
@@ -20,7 +20,6 @@ from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import adaptive_routes
|
||||
from api import (
|
||||
handle_crawl_request,
|
||||
handle_llm_qa,
|
||||
@@ -48,6 +47,7 @@ from prometheus_fastapi_instrumentator import Instrumentator
|
||||
from pydantic import BaseModel, Field
|
||||
from rank_bm25 import BM25Okapi
|
||||
from redis import asyncio as aioredis
|
||||
from routers import adaptive, scripts
|
||||
from schemas import (
|
||||
CrawlRequestWithHooks,
|
||||
HTMLRequest,
|
||||
@@ -124,7 +124,6 @@ app = FastAPI(
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
app.include_router(adaptive_routes.router)
|
||||
# ── static playground ──────────────────────────────────────
|
||||
STATIC_DIR = pathlib.Path(__file__).parent / "static" / "playground"
|
||||
if not STATIC_DIR.exists():
|
||||
@@ -219,6 +218,8 @@ def _safe_eval_config(expr: str) -> dict:
|
||||
|
||||
# ── job router ──────────────────────────────────────────────
|
||||
app.include_router(init_job_router(redis, config, token_dep))
|
||||
app.include_router(adaptive.router)
|
||||
app.include_router(scripts.router)
|
||||
|
||||
|
||||
# ──────────────────────── Endpoints ──────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user