import logging
import uuid
from typing import Any, Dict

from fastapi import APIRouter, BackgroundTasks, HTTPException

from crawl4ai import AsyncWebCrawler
from crawl4ai.adaptive_crawler import AdaptiveConfig, AdaptiveCrawler
from crawl4ai.utils import get_error_context

from schemas import AdaptiveConfigPayload, AdaptiveCrawlRequest, AdaptiveJobStatus
|
|
|
|
# --- In-memory storage for job statuses. For production, use Redis or a database. ---
# Keyed by task_id (a uuid4 string). Each record holds the keys written by
# submit_adaptive_digest_job and mutated by run_adaptive_digest:
#   task_id, status ("PENDING" | "RUNNING" | "COMPLETED" | "FAILED"),
#   metrics, result, error.
# NOTE: being process-local, this store is lost on restart and is not shared
# across workers.
ADAPTIVE_JOBS: Dict[str, Dict[str, Any]] = {}

# --- APIRouter for Adaptive Crawling Endpoints ---
# All routes below are mounted under /adaptive/digest.
router = APIRouter(
    prefix="/adaptive/digest",
    tags=["Adaptive Crawling"],
)
|
|
|
|
# --- Background Worker Function ---
|
|
|
|
|
|
async def run_adaptive_digest(task_id: str, request: AdaptiveCrawlRequest):
    """Background worker that performs the adaptive crawl for one job.

    Mutates ``ADAPTIVE_JOBS[task_id]`` in place: status moves from PENDING to
    RUNNING, then to COMPLETED (filling ``result`` and ``metrics``) or FAILED
    (filling ``error``). Never raises; all failures are captured in the job
    record so pollers can observe them.
    """
    job = ADAPTIVE_JOBS[task_id]
    try:
        # Mark the job as in progress before doing any work.
        job["status"] = "RUNNING"

        # Build the crawl configuration from the optional payload.
        payload = request.config
        adaptive_config = (
            AdaptiveConfig(**payload.model_dump()) if payload else AdaptiveConfig()
        )

        # AdaptiveCrawler drives an ordinary AsyncWebCrawler instance.
        async with AsyncWebCrawler() as crawler:
            adaptive_crawler = AdaptiveCrawler(crawler, config=adaptive_config)

            # Long-running operation: crawl adaptively from the start URL.
            final_state = await adaptive_crawler.digest(
                start_url=request.start_url, query=request.query
            )

            # Condense the final state into a JSON-friendly summary while the
            # crawler is still in scope.
            summary = {
                "confidence": final_state.metrics.get("confidence", 0.0),
                "is_sufficient": adaptive_crawler.is_sufficient,
                "coverage_stats": adaptive_crawler.coverage_stats,
                "relevant_content": adaptive_crawler.get_relevant_content(top_k=5),
            }

        # Publish the outcome for the status endpoint.
        job.update(
            {
                "status": "COMPLETED",
                "result": summary,
                "metrics": final_state.metrics,
            }
        )

    except Exception as e:
        # Record the failure (with traceback context) instead of letting the
        # job appear stuck in RUNNING forever.
        import sys

        error_context = get_error_context(sys.exc_info())
        error_message = f"Adaptive crawl failed: {str(e)}\nContext: {error_context}"

        job.update({"status": "FAILED", "error": error_message})
|
|
|
|
|
|
# --- API Endpoints ---
|
|
|
|
|
|
@router.post("/job", response_model=AdaptiveJobStatus, status_code=202)
async def submit_adaptive_digest_job(
    request: AdaptiveCrawlRequest,
    background_tasks: BackgroundTasks,
):
    """
    Submit a new adaptive crawling job.

    This endpoint starts a long-running adaptive crawl in the background and
    immediately returns a task ID for polling the job's status.
    """
    # Structured logging (lazy %-args) instead of a leftover debug print(),
    # so request tracing follows the application's log configuration.
    logging.getLogger(__name__).info("Received adaptive crawl request: %s", request)

    task_id = str(uuid.uuid4())

    # Initialize the job record before scheduling the worker so an immediate
    # poll of /job/{task_id} never sees a missing entry.
    ADAPTIVE_JOBS[task_id] = {
        "task_id": task_id,
        "status": "PENDING",
        "metrics": None,
        "result": None,
        "error": None,
    }

    # The crawl runs after this response has been sent.
    background_tasks.add_task(run_adaptive_digest, task_id, request)

    return ADAPTIVE_JOBS[task_id]
|
|
|
|
|
|
@router.get("/job/{task_id}", response_model=AdaptiveJobStatus)
async def get_adaptive_digest_status(task_id: str):
    """
    Get the status and result of an adaptive crawling job.

    Poll this endpoint with the `task_id` returned from the submission
    endpoint until the status is 'COMPLETED' or 'FAILED'.

    Raises:
        HTTPException: 404 when the task_id is unknown.
    """
    # Guard clause: unknown (or falsy) job record -> 404.
    if not (job := ADAPTIVE_JOBS.get(task_id)):
        raise HTTPException(status_code=404, detail="Job not found")

    # Refresh metrics from the crawler's live state while the job runs.
    # NOTE(review): run_adaptive_digest in this module never stores a
    # "live_state" key, so this branch appears inert — confirm whether live
    # progress reporting was ever wired up elsewhere.
    if job["status"] == "RUNNING":
        live_state = job.get("live_state")
        if live_state:
            job["metrics"] = live_state.metrics

    return job
|