feat(adaptive-crawling): implement adaptive crawling endpoints and integrate with server

This commit is contained in:
AHMET YILMAZ
2025-10-01 15:53:56 +08:00
parent a62cfeebd9
commit 1a8e0236af
5 changed files with 215 additions and 59 deletions

View File

View File

@@ -1,9 +1,8 @@
import asyncio
import uuid import uuid
from typing import Any, Dict, List, Optional from typing import Any, Dict
from fastapi import APIRouter, BackgroundTasks, Body, HTTPException from fastapi import APIRouter, BackgroundTasks, HTTPException
from pydantic import BaseModel, Field from schemas import AdaptiveConfigPayload, AdaptiveCrawlRequest, AdaptiveJobStatus
from crawl4ai import AsyncWebCrawler from crawl4ai import AsyncWebCrawler
from crawl4ai.adaptive_crawler import AdaptiveConfig, AdaptiveCrawler from crawl4ai.adaptive_crawler import AdaptiveConfig, AdaptiveCrawler
@@ -12,40 +11,6 @@ from crawl4ai.utils import get_error_context
# --- In-memory storage for job statuses. For production, use Redis or a database. --- # --- In-memory storage for job statuses. For production, use Redis or a database. ---
ADAPTIVE_JOBS: Dict[str, Dict[str, Any]] = {} ADAPTIVE_JOBS: Dict[str, Dict[str, Any]] = {}
# --- Pydantic Models for API Validation ---
class AdaptiveConfigPayload(BaseModel):
    """Pydantic model for receiving AdaptiveConfig parameters."""
    # Every field below carries a default, so a client may send any subset.
    # NOTE(review): field names presumably mirror crawl4ai's AdaptiveConfig —
    # confirm against the handler that consumes this payload.
    confidence_threshold: float = 0.7
    max_pages: int = 20
    top_k_links: int = 3
    # Typed as a plain `str`: the two values named in the trailing comment are
    # not enforced by this model.
    strategy: str = "statistical" # "statistical" or "embedding"
    # NOTE(review): presumably only relevant when strategy is "embedding" — confirm.
    embedding_model: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2"
    # Add any other AdaptiveConfig fields you want to expose
class AdaptiveCrawlRequest(BaseModel):
    """Input model for the adaptive digest job."""
    # `start_url` and `query` are required: both are declared with `...`
    # (no default value).
    start_url: str = Field(..., description="The starting URL for the adaptive crawl.")
    query: str = Field(..., description="The user query to guide the crawl.")
    # Optional; stays None when the client sends no configuration.
    config: Optional[AdaptiveConfigPayload] = Field(
        None, description="Optional adaptive crawler configuration."
    )
class AdaptiveJobStatus(BaseModel):
    """Output model for the job status."""
    # `task_id` and `status` are required; the remaining fields default to None.
    task_id: str
    status: str
    # NOTE(review): presumably populated as the job progresses, completes or
    # fails — the code that fills these in is not visible here; confirm.
    metrics: Optional[Dict[str, Any]] = None
    result: Optional[Dict[str, Any]] = None
    error: Optional[str] = None
# --- APIRouter for Adaptive Crawling Endpoints --- # --- APIRouter for Adaptive Crawling Endpoints ---
router = APIRouter( router = APIRouter(
prefix="/adaptive/digest", prefix="/adaptive/digest",

View File

@@ -0,0 +1,135 @@
from typing import Optional
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
from schemas import C4AScriptPayload
from crawl4ai.script import (
CompilationResult,
ValidationResult,
# ErrorDetail
)
# Import all necessary components from the crawl4ai library
# C4A Script Language Support
from crawl4ai.script import (
compile as c4a_compile,
)
from crawl4ai.script import (
validate as c4a_validate,
)
# --- APIRouter for c4a Scripts Endpoints ---
# Every route registered on this router is served under the "/c4a" prefix and
# tagged "c4a Scripts".
router = APIRouter(prefix="/c4a", tags=["c4a Scripts"])
# --- Validation and Compilation Endpoints ---
@router.post(
    "/validate",
    response_model=ValidationResult,
    summary="Validate a C4A-Script",
)
async def validate_c4a_script_endpoint(payload: C4AScriptPayload):
    """
    Validates the syntax of a C4A-Script without compiling it.
    Returns a `ValidationResult` object indicating whether the script is
    valid and providing detailed error information if it's not.
    """
    # No try/except here: the validator is designed to report problems through
    # its result object rather than by raising.
    return c4a_validate(payload.script)
@router.post(
    "/compile",
    response_model=CompilationResult,
    summary="Compile a C4A-Script",
)
async def compile_c4a_script_endpoint(payload: C4AScriptPayload):
    """
    Compiles a C4A-Script into executable JavaScript.
    If successful, returns the compiled JavaScript code. If there are syntax
    errors, it returns a detailed error report.
    """
    # The compiler reports failure through the result object, not by raising.
    outcome = c4a_compile(payload.script)
    if outcome.success:
        return outcome

    # A failed compilation is surfaced as a 400 so the client can tell it sent
    # a bad script; the full report travels in the error detail.
    error_report = outcome.to_dict()  # FastAPI will serialize this
    raise HTTPException(status_code=400, detail=error_report)
@router.post(
    "/compile-file",
    response_model=CompilationResult,
    summary="Compile a C4A-Script from file or string",
)
async def compile_c4a_script_file_endpoint(
    file: Optional[UploadFile] = File(None), script: Optional[str] = Form(None)
):
    """
    Compiles a C4A-Script into executable JavaScript from either an uploaded file or string content.

    Accepts exactly one of:
    - A file upload containing the C4A-Script (must decode as UTF-8)
    - A string containing the C4A-Script content

    Supplying neither input, or both at once, is rejected with HTTP 400.

    If successful, returns the compiled JavaScript code. If compilation
    fails, responds with HTTP 400 whose detail is the compilation error report.
    """
    # Reject requests that carry no script at all. The check is truthiness
    # based, so an empty `script` string counts as missing.
    if not file and not script:
        raise HTTPException(
            status_code=400,
            detail={"error": "Either 'file' or 'script' parameter must be provided"},
        )

    # The two inputs are mutually exclusive: an ambiguous request is rejected
    # rather than silently preferring one source over the other.
    if file and script:
        raise HTTPException(
            status_code=400,
            detail={"error": "Please provide either 'file' or 'script', not both"},
        )

    if file:
        # Uploaded file: read the raw bytes and decode them as UTF-8 text.
        try:
            file_content = await file.read()
            script_content = file_content.decode("utf-8")
        except UnicodeDecodeError as exc:
            raise HTTPException(
                status_code=400,
                detail={"error": "File must be a valid UTF-8 text file"},
            ) from exc
        except Exception as e:
            # Any other failure while reading the upload is also reported
            # to the client as a 400.
            raise HTTPException(
                status_code=400, detail={"error": f"Error reading file: {str(e)}"}
            ) from e
    else:
        # The guards above guarantee that `script` is non-empty here.
        script_content = script

    # The compiler reports failure through the result object, not by raising.
    compilation_result = c4a_compile(script_content)
    if not compilation_result.success:
        # Surface failed compilations as a bad request so the client can
        # distinguish them from a successful response.
        raise HTTPException(
            status_code=400,
            detail=compilation_result.to_dict(),  # FastAPI will serialize this
        )
    return compilation_result

View File

@@ -1,5 +1,6 @@
from typing import List, Optional, Dict, Any
from enum import Enum from enum import Enum
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from utils import FilterType from utils import FilterType
@@ -12,15 +13,15 @@ class CrawlRequest(BaseModel):
class HookConfig(BaseModel): class HookConfig(BaseModel):
"""Configuration for user-provided hooks""" """Configuration for user-provided hooks"""
code: Dict[str, str] = Field( code: Dict[str, str] = Field(
default_factory=dict, default_factory=dict, description="Map of hook points to Python code strings"
description="Map of hook points to Python code strings"
) )
timeout: int = Field( timeout: int = Field(
default=30, default=30,
ge=1, ge=1,
le=120, le=120,
description="Timeout in seconds for each hook execution" description="Timeout in seconds for each hook execution",
) )
class Config: class Config:
@@ -39,42 +40,53 @@ async def hook(page, context, **kwargs):
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(2000) await page.wait_for_timeout(2000)
return page return page
""" """,
}, },
"timeout": 30 "timeout": 30,
} }
} }
class CrawlRequestWithHooks(CrawlRequest): class CrawlRequestWithHooks(CrawlRequest):
"""Extended crawl request with hooks support""" """Extended crawl request with hooks support"""
hooks: Optional[HookConfig] = Field( hooks: Optional[HookConfig] = Field(
default=None, default=None, description="Optional user-provided hook functions"
description="Optional user-provided hook functions"
) )
class MarkdownRequest(BaseModel): class MarkdownRequest(BaseModel):
"""Request body for the /md endpoint.""" """Request body for the /md endpoint."""
url: str = Field(..., description="Absolute http/https URL to fetch")
f: FilterType = Field(FilterType.FIT, description="Contentfilter strategy: fit, raw, bm25, or llm") url: str = Field(..., description="Absolute http/https URL to fetch")
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters") f: FilterType = Field(
c: Optional[str] = Field("0", description="Cachebust / revision counter") FilterType.FIT, description="Contentfilter strategy: fit, raw, bm25, or llm"
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')") )
temperature: Optional[float] = Field(None, description="LLM temperature override (0.0-2.0)") q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
c: Optional[str] = Field("0", description="Cachebust / revision counter")
provider: Optional[str] = Field(
None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')"
)
temperature: Optional[float] = Field(
None, description="LLM temperature override (0.0-2.0)"
)
base_url: Optional[str] = Field(None, description="LLM API base URL override") base_url: Optional[str] = Field(None, description="LLM API base URL override")
class RawCode(BaseModel): class RawCode(BaseModel):
code: str code: str
class HTMLRequest(BaseModel): class HTMLRequest(BaseModel):
url: str url: str
class ScreenshotRequest(BaseModel): class ScreenshotRequest(BaseModel):
url: str url: str
screenshot_wait_for: Optional[float] = 2 screenshot_wait_for: Optional[float] = 2
output_path: Optional[str] = None output_path: Optional[str] = None
class PDFRequest(BaseModel): class PDFRequest(BaseModel):
url: str url: str
output_path: Optional[str] = None output_path: Optional[str] = None
@@ -83,12 +95,55 @@ class PDFRequest(BaseModel):
class JSEndpointRequest(BaseModel): class JSEndpointRequest(BaseModel):
url: str url: str
scripts: List[str] = Field( scripts: List[str] = Field(
..., ..., description="List of separated JavaScript snippets to execute"
description="List of separated JavaScript snippets to execute"
) )
class SeedRequest(BaseModel): class SeedRequest(BaseModel):
"""Request model for URL seeding endpoint.""" """Request model for URL seeding endpoint."""
url: str = Field(..., example="https://docs.crawl4ai.com") url: str = Field(..., example="https://docs.crawl4ai.com")
config: Dict[str, Any] = Field(default_factory=dict) config: Dict[str, Any] = Field(default_factory=dict)
# --- C4A Script Schemas ---
class C4AScriptPayload(BaseModel):
    """Input model for receiving a C4A-Script."""
    # Required field (declared with `...`, so it has no default): the
    # C4A-Script text to process.
    script: str = Field(..., description="The C4A-Script content to process.")
# --- Adaptive Crawling Schemas ---
class AdaptiveConfigPayload(BaseModel):
    """Pydantic model for receiving AdaptiveConfig parameters."""
    # Every field below carries a default, so a client may send any subset.
    # NOTE(review): field names presumably mirror crawl4ai's AdaptiveConfig —
    # confirm against the handler that consumes this payload.
    confidence_threshold: float = 0.7
    max_pages: int = 20
    top_k_links: int = 3
    # Typed as a plain `str`: the two values named in the trailing comment are
    # not enforced by this model.
    strategy: str = "statistical" # "statistical" or "embedding"
    # NOTE(review): presumably only relevant when strategy is "embedding" — confirm.
    embedding_model: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2"
    # Add any other AdaptiveConfig fields you want to expose
class AdaptiveCrawlRequest(BaseModel):
    """Input model for the adaptive digest job."""
    # `start_url` and `query` are required: both are declared with `...`
    # (no default value).
    start_url: str = Field(..., description="The starting URL for the adaptive crawl.")
    query: str = Field(..., description="The user query to guide the crawl.")
    # Optional; stays None when the client sends no configuration.
    config: Optional[AdaptiveConfigPayload] = Field(
        None, description="Optional adaptive crawler configuration."
    )
class AdaptiveJobStatus(BaseModel):
    """Output model for the job status."""
    # `task_id` and `status` are required; the remaining fields default to None.
    task_id: str
    status: str
    # NOTE(review): presumably populated as the job progresses, completes or
    # fails — the code that fills these in is not visible here; confirm.
    metrics: Optional[Dict[str, Any]] = None
    result: Optional[Dict[str, Any]] = None
    error: Optional[str] = None

View File

@@ -20,7 +20,6 @@ from pathlib import Path
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from urllib.parse import urlparse from urllib.parse import urlparse
import adaptive_routes
from api import ( from api import (
handle_crawl_request, handle_crawl_request,
handle_llm_qa, handle_llm_qa,
@@ -48,6 +47,7 @@ from prometheus_fastapi_instrumentator import Instrumentator
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from rank_bm25 import BM25Okapi from rank_bm25 import BM25Okapi
from redis import asyncio as aioredis from redis import asyncio as aioredis
from routers import adaptive, scripts
from schemas import ( from schemas import (
CrawlRequestWithHooks, CrawlRequestWithHooks,
HTMLRequest, HTMLRequest,
@@ -124,7 +124,6 @@ app = FastAPI(
lifespan=lifespan, lifespan=lifespan,
) )
app.include_router(adaptive_routes.router)
# ── static playground ────────────────────────────────────── # ── static playground ──────────────────────────────────────
STATIC_DIR = pathlib.Path(__file__).parent / "static" / "playground" STATIC_DIR = pathlib.Path(__file__).parent / "static" / "playground"
if not STATIC_DIR.exists(): if not STATIC_DIR.exists():
@@ -219,6 +218,8 @@ def _safe_eval_config(expr: str) -> dict:
# ── job router ────────────────────────────────────────────── # ── job router ──────────────────────────────────────────────
app.include_router(init_job_router(redis, config, token_dep)) app.include_router(init_job_router(redis, config, token_dep))
app.include_router(adaptive.router)
app.include_router(scripts.router)
# ──────────────────────── Endpoints ────────────────────────── # ──────────────────────── Endpoints ──────────────────────────