feat(adaptive-crawling): implement adaptive crawling endpoints and integrate with server
This commit is contained in:
120
deploy/docker/routers/adaptive.py
Normal file
120
deploy/docker/routers/adaptive.py
Normal file
@@ -0,0 +1,120 @@
|
||||
import uuid
|
||||
from typing import Any, Dict
|
||||
|
||||
from fastapi import APIRouter, BackgroundTasks, HTTPException
|
||||
from schemas import AdaptiveConfigPayload, AdaptiveCrawlRequest, AdaptiveJobStatus
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import AdaptiveConfig, AdaptiveCrawler
|
||||
from crawl4ai.utils import get_error_context
|
||||
|
||||
# --- In-memory storage for job statuses. For production, use Redis or a database. ---
|
||||
# Maps task_id -> job record. The submit endpoint creates each record with the
# keys "task_id", "status", "metrics", "result" and "error"; the background
# worker then mutates "status" and fills in the remaining fields.
# NOTE(review): no eviction of finished jobs is visible in this section, so the
# dict appears to grow for the lifetime of the process — confirm.
ADAPTIVE_JOBS: Dict[str, Dict[str, Any]] = {}
|
||||
|
||||
# --- APIRouter for Adaptive Crawling Endpoints ---
|
||||
# Every route declared on this router is served under /adaptive/digest.
router = APIRouter(prefix="/adaptive/digest", tags=["Adaptive Crawling"])
|
||||
|
||||
# --- Background Worker Function ---
|
||||
|
||||
|
||||
async def run_adaptive_digest(task_id: str, request: AdaptiveCrawlRequest):
    """Background worker: run one adaptive crawl and record its outcome.

    The job record stored under ``task_id`` in ``ADAPTIVE_JOBS`` is first
    marked ``RUNNING``. On success it is updated to ``COMPLETED`` together
    with ``result`` and ``metrics``; if anything raises, it is updated to
    ``FAILED`` with an ``error`` message instead of propagating the exception.

    Args:
        task_id: Key of the job record in ``ADAPTIVE_JOBS``.
        request: Crawl request; ``start_url``, ``query`` and the optional
            ``config`` are read from it.
    """
    try:
        ADAPTIVE_JOBS[task_id]["status"] = "RUNNING"

        # Use the caller-supplied settings when present, library defaults otherwise.
        crawl_config = (
            AdaptiveConfig(**request.config.model_dump())
            if request.config
            else AdaptiveConfig()
        )

        # The adaptive crawler drives an open AsyncWebCrawler session.
        async with AsyncWebCrawler() as web_crawler:
            digester = AdaptiveCrawler(web_crawler, config=crawl_config)

            # Long-running step: everything below waits on this call.
            end_state = await digester.digest(
                start_url=request.start_url,
                query=request.query,
            )

            # Condensed view of the crawl outcome exposed as the job's "result".
            summary = {
                "confidence": end_state.metrics.get("confidence", 0.0),
                "is_sufficient": digester.is_sufficient,
                "coverage_stats": digester.coverage_stats,
                "relevant_content": digester.get_relevant_content(top_k=5),
            }

            ADAPTIVE_JOBS[task_id].update(
                {
                    "status": "COMPLETED",
                    "result": summary,
                    "metrics": end_state.metrics,
                }
            )

    except Exception as exc:
        # Record the failure on the job so pollers see FAILED plus a message.
        import sys

        context = get_error_context(sys.exc_info())
        failure = f"Adaptive crawl failed: {str(exc)}\nContext: {context}"

        ADAPTIVE_JOBS[task_id].update({"status": "FAILED", "error": failure})
|
||||
|
||||
|
||||
# --- API Endpoints ---
|
||||
|
||||
|
||||
@router.post("/job", response_model=AdaptiveJobStatus, status_code=202)
async def submit_adaptive_digest_job(
    request: AdaptiveCrawlRequest,
    background_tasks: BackgroundTasks,
):
    """
    Submit a new adaptive crawling job.

    This endpoint starts a long-running adaptive crawl in the background and
    immediately returns a task ID for polling the job's status.
    """
    # The docstring above is kept verbatim: FastAPI publishes it as the
    # endpoint description in the generated API docs.
    print("Received adaptive crawl request:", request)

    new_task_id = str(uuid.uuid4())

    # Fresh record in the PENDING state; the worker fills in the rest later.
    job_record = {
        "task_id": new_task_id,
        "status": "PENDING",
        "metrics": None,
        "result": None,
        "error": None,
    }
    ADAPTIVE_JOBS[new_task_id] = job_record

    # Hand the crawl to the background worker and answer the client right away.
    background_tasks.add_task(run_adaptive_digest, new_task_id, request)

    return job_record
|
||||
|
||||
|
||||
@router.get("/job/{task_id}", response_model=AdaptiveJobStatus)
async def get_adaptive_digest_status(task_id: str):
    """
    Get the status and result of an adaptive crawling job.

    Poll this endpoint with the `task_id` returned from the submission
    endpoint until the status is 'COMPLETED' or 'FAILED'.
    """
    # The docstring above is kept verbatim: FastAPI publishes it as the
    # endpoint description in the generated API docs.
    record = ADAPTIVE_JOBS.get(task_id)
    if not record:
        raise HTTPException(status_code=404, detail="Job not found")

    # While a job is running, refresh its metrics from the live crawl state.
    # NOTE(review): nothing in this section ever stores "live_state" on a job,
    # so this branch looks unreachable unless another writer sets it — confirm.
    if record["status"] == "RUNNING":
        live_state = record.get("live_state")
        if live_state:
            record["metrics"] = live_state.metrics

    return record
|
||||
Reference in New Issue
Block a user