feat(adaptive-crawling): implement adaptive crawling endpoints and integrate with server

2025-10-01 15:53:56 +08:00
parent a62cfeebd9
commit 1a8e0236af
5 changed files with 215 additions and 59 deletions
--- a/deploy/docker/routers/adaptive.py
+++ b/deploy/docker/routers/adaptive.py
@@ -0,0 +1,120 @@
+import uuid
+from typing import Any, Dict
+
+from fastapi import APIRouter, BackgroundTasks, HTTPException
+from schemas import AdaptiveConfigPayload, AdaptiveCrawlRequest, AdaptiveJobStatus
+
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.adaptive_crawler import AdaptiveConfig, AdaptiveCrawler
+from crawl4ai.utils import get_error_context
+
+# --- In-process job registry, keyed by task id. ---
+# Each value is the status record the endpoints below hand back to clients.
+# For production, use Redis or a database.
+ADAPTIVE_JOBS: Dict[str, Dict[str, Any]] = {}
+
+# --- Router exposing the adaptive crawling endpoints ---
+# Routes declared on it are registered under the "/adaptive/digest" prefix.
+router = APIRouter(tags=["Adaptive Crawling"], prefix="/adaptive/digest")
+
+# --- Background Worker Function ---
+
+
+async def run_adaptive_digest(task_id: str, request: AdaptiveCrawlRequest):
+    """Run one adaptive crawl in the background and record its outcome.
+
+    The job stored under ``task_id`` in ``ADAPTIVE_JOBS`` is marked RUNNING,
+    then either COMPLETED (with ``result`` and ``metrics``) or FAILED (with
+    ``error``). Exceptions caught here are recorded on the job, not re-raised.
+    """
+    import sys  # used only by the failure handler below
+
+    try:
+        ADAPTIVE_JOBS[task_id]["status"] = "RUNNING"
+
+        # Payload overrides become constructor kwargs; no payload means defaults.
+        config_kwargs = request.config.model_dump() if request.config else {}
+        adaptive_config = AdaptiveConfig(**config_kwargs)
+
+        # AdaptiveCrawler is built around an open AsyncWebCrawler instance.
+        async with AsyncWebCrawler() as web_crawler:
+            digester = AdaptiveCrawler(web_crawler, config=adaptive_config)
+
+            # This await is the long-running part of the job.
+            end_state = await digester.digest(
+                start_url=request.start_url,
+                query=request.query,
+            )
+
+            # Condense the crawl outcome into the summary stored as `result`.
+            summary = dict(
+                confidence=end_state.metrics.get("confidence", 0.0),
+                is_sufficient=digester.is_sufficient,
+                coverage_stats=digester.coverage_stats,
+                relevant_content=digester.get_relevant_content(top_k=5),
+            )
+
+            ADAPTIVE_JOBS[task_id].update(
+                status="COMPLETED",
+                result=summary,
+                metrics=end_state.metrics,
+            )
+
+    except Exception as exc:
+        # Keep the failure on the job record so pollers can read it.
+        failure_context = get_error_context(sys.exc_info())
+        failure_text = f"Adaptive crawl failed: {str(exc)}\nContext: {failure_context}"
+
+        ADAPTIVE_JOBS[task_id].update(status="FAILED", error=failure_text)
+
+
+# --- API Endpoints ---
+
+
+@router.post("/job", response_model=AdaptiveJobStatus, status_code=202)
+async def submit_adaptive_digest_job(
+    request: AdaptiveCrawlRequest,
+    background_tasks: BackgroundTasks,
+):
+    """
+    Queue an adaptive crawl and hand back its tracking record.
+
+    The crawl itself is scheduled as a background task; the response is the
+    freshly created PENDING job, whose `task_id` is used to poll for status.
+    """
+
+    print("Received adaptive crawl request:", request)
+
+    # Register the job before scheduling so the worker finds its record.
+    new_task_id = str(uuid.uuid4())
+    pending_job = dict(
+        task_id=new_task_id,
+        status="PENDING",
+        metrics=None,
+        result=None,
+        error=None,
+    )
+    ADAPTIVE_JOBS[new_task_id] = pending_job
+
+    # Hand the long-running crawl to the background task queue.
+    background_tasks.add_task(run_adaptive_digest, new_task_id, request)
+    return pending_job
+
+
+@router.get("/job/{task_id}", response_model=AdaptiveJobStatus)
+async def get_adaptive_digest_status(task_id: str):
+    """
+    Report the current state of an adaptive crawling job.
+
+    Clients poll with the `task_id` they received at submission and stop
+    once the status reads 'COMPLETED' or 'FAILED'. Unknown ids yield a 404.
+    """
+    record = ADAPTIVE_JOBS.get(task_id)
+    if not record:
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    # While a job is running, mirror metrics from its live state if one is attached.
+    live_state = record.get("live_state") if record["status"] == "RUNNING" else None
+    if live_state:
+        record["metrics"] = live_state.metrics
+    return record