feat(adaptive-crawling): implement adaptive crawling endpoints and integrate with server
This commit is contained in:
0
deploy/docker/routers/__init__.py
Normal file
0
deploy/docker/routers/__init__.py
Normal file
@@ -1,9 +1,8 @@
|
|||||||
import asyncio
|
|
||||||
import uuid
|
import uuid
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict
|
||||||
|
|
||||||
from fastapi import APIRouter, BackgroundTasks, Body, HTTPException
|
from fastapi import APIRouter, BackgroundTasks, HTTPException
|
||||||
from pydantic import BaseModel, Field
|
from schemas import AdaptiveConfigPayload, AdaptiveCrawlRequest, AdaptiveJobStatus
|
||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler
|
||||||
from crawl4ai.adaptive_crawler import AdaptiveConfig, AdaptiveCrawler
|
from crawl4ai.adaptive_crawler import AdaptiveConfig, AdaptiveCrawler
|
||||||
@@ -12,40 +11,6 @@ from crawl4ai.utils import get_error_context
|
|||||||
# --- In-memory storage for job statuses. For production, use Redis or a database. ---
|
# --- In-memory storage for job statuses. For production, use Redis or a database. ---
|
||||||
ADAPTIVE_JOBS: Dict[str, Dict[str, Any]] = {}
|
ADAPTIVE_JOBS: Dict[str, Dict[str, Any]] = {}
|
||||||
|
|
||||||
# --- Pydantic Models for API Validation ---
|
|
||||||
|
|
||||||
|
|
||||||
class AdaptiveConfigPayload(BaseModel):
|
|
||||||
"""Pydantic model for receiving AdaptiveConfig parameters."""
|
|
||||||
|
|
||||||
confidence_threshold: float = 0.7
|
|
||||||
max_pages: int = 20
|
|
||||||
top_k_links: int = 3
|
|
||||||
strategy: str = "statistical" # "statistical" or "embedding"
|
|
||||||
embedding_model: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2"
|
|
||||||
# Add any other AdaptiveConfig fields you want to expose
|
|
||||||
|
|
||||||
|
|
||||||
class AdaptiveCrawlRequest(BaseModel):
|
|
||||||
"""Input model for the adaptive digest job."""
|
|
||||||
|
|
||||||
start_url: str = Field(..., description="The starting URL for the adaptive crawl.")
|
|
||||||
query: str = Field(..., description="The user query to guide the crawl.")
|
|
||||||
config: Optional[AdaptiveConfigPayload] = Field(
|
|
||||||
None, description="Optional adaptive crawler configuration."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class AdaptiveJobStatus(BaseModel):
|
|
||||||
"""Output model for the job status."""
|
|
||||||
|
|
||||||
task_id: str
|
|
||||||
status: str
|
|
||||||
metrics: Optional[Dict[str, Any]] = None
|
|
||||||
result: Optional[Dict[str, Any]] = None
|
|
||||||
error: Optional[str] = None
|
|
||||||
|
|
||||||
|
|
||||||
# --- APIRouter for Adaptive Crawling Endpoints ---
|
# --- APIRouter for Adaptive Crawling Endpoints ---
|
||||||
router = APIRouter(
|
router = APIRouter(
|
||||||
prefix="/adaptive/digest",
|
prefix="/adaptive/digest",
|
||||||
135
deploy/docker/routers/scripts.py
Normal file
135
deploy/docker/routers/scripts.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
||||||
|
from schemas import C4AScriptPayload
|
||||||
|
|
||||||
|
from crawl4ai.script import (
|
||||||
|
CompilationResult,
|
||||||
|
ValidationResult,
|
||||||
|
# ErrorDetail
|
||||||
|
)
|
||||||
|
|
||||||
|
# Import all necessary components from the crawl4ai library
|
||||||
|
# C4A Script Language Support
|
||||||
|
from crawl4ai.script import (
|
||||||
|
compile as c4a_compile,
|
||||||
|
)
|
||||||
|
from crawl4ai.script import (
|
||||||
|
validate as c4a_validate,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- APIRouter for c4a Scripts Endpoints ---
|
||||||
|
# Every route declared in this module is registered on this router, which
# carries the "/c4a" path prefix and the "c4a Scripts" tag.
router = APIRouter(prefix="/c4a", tags=["c4a Scripts"])
|
||||||
|
|
||||||
|
# --- Endpoints ---
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/validate",
    response_model=ValidationResult,
    summary="Validate a C4A-Script",
)
async def validate_c4a_script_endpoint(payload: C4AScriptPayload):
    """
    Checks the syntax of a C4A-Script without compiling it.

    The response is a `ValidationResult` object that states whether the
    script is valid and, when it is not, carries detailed error information.
    """
    # No try/except on purpose: the validator is designed to report problems
    # through its result object rather than by raising.
    return c4a_validate(payload.script)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/compile",
    response_model=CompilationResult,
    summary="Compile a C4A-Script",
)
async def compile_c4a_script_endpoint(payload: C4AScriptPayload):
    """
    Compiles a C4A-Script into executable JavaScript.

    On success the compilation result (including the generated JavaScript)
    is returned. When the script fails to compile, the request is rejected
    with HTTP 400 and the detailed error report is sent as the error detail.
    """
    # The compiler signals failure through the result object (`success`)
    # instead of raising, so the outcome has to be inspected explicitly.
    outcome = c4a_compile(payload.script)

    if outcome.success:
        return outcome

    # A failed compilation is reported as a bad request so that clients can
    # distinguish it from a successful response by status code alone.
    raise HTTPException(
        status_code=400,
        detail=outcome.to_dict(),  # FastAPI will serialize this
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/compile-file",
    response_model=CompilationResult,
    summary="Compile a C4A-Script from file or string",
)
async def compile_c4a_script_file_endpoint(
    file: Optional[UploadFile] = File(None), script: Optional[str] = Form(None)
):
    """
    Compiles a C4A-Script into executable JavaScript from either an uploaded
    file or string content.

    Exactly one of the following must be provided:
    - `file`: a file upload containing the C4A-Script (UTF-8 encoded text)
    - `script`: a form field containing the C4A-Script content

    If successful, returns the compiled JavaScript code.

    Responds with HTTP 400 when:
    - neither `file` nor `script` is provided (an empty `script` string
      counts as not provided),
    - both `file` and `script` are provided,
    - the uploaded file cannot be read or is not valid UTF-8,
    - the script fails to compile (the detail is the detailed error report).
    """
    # Reject requests that carry no script at all.
    if not file and not script:
        raise HTTPException(
            status_code=400,
            detail={"error": "Either 'file' or 'script' parameter must be provided"},
        )

    # The two inputs are mutually exclusive: supplying both is rejected
    # rather than silently letting one of them win.
    if file and script:
        raise HTTPException(
            status_code=400,
            detail={"error": "Please provide either 'file' or 'script', not both"},
        )

    if file:
        # Uploaded content arrives as bytes and must decode as UTF-8 text.
        try:
            file_content = await file.read()
            script_content = file_content.decode("utf-8")
        except UnicodeDecodeError as exc:
            raise HTTPException(
                status_code=400,
                detail={"error": "File must be a valid UTF-8 text file"},
            ) from exc
        except Exception as e:
            # Any other failure while reading the upload is reported to the
            # client as a bad request, with the underlying message attached.
            raise HTTPException(
                status_code=400, detail={"error": f"Error reading file: {str(e)}"}
            ) from e
    else:
        # The guards above leave exactly one input set, so this is the
        # string case.
        script_content = script

    # The compiler returns a result object instead of raising on bad input.
    compilation_result = c4a_compile(script_content)

    if not compilation_result.success:
        # Report a failed compilation as a bad request so the client can
        # tell it apart from a successful response by status code.
        raise HTTPException(
            status_code=400,
            detail=compilation_result.to_dict(),  # FastAPI will serialize this
        )

    return compilation_result
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
from typing import List, Optional, Dict, Any
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from utils import FilterType
|
from utils import FilterType
|
||||||
|
|
||||||
@@ -12,15 +13,15 @@ class CrawlRequest(BaseModel):
|
|||||||
|
|
||||||
class HookConfig(BaseModel):
|
class HookConfig(BaseModel):
|
||||||
"""Configuration for user-provided hooks"""
|
"""Configuration for user-provided hooks"""
|
||||||
|
|
||||||
code: Dict[str, str] = Field(
|
code: Dict[str, str] = Field(
|
||||||
default_factory=dict,
|
default_factory=dict, description="Map of hook points to Python code strings"
|
||||||
description="Map of hook points to Python code strings"
|
|
||||||
)
|
)
|
||||||
timeout: int = Field(
|
timeout: int = Field(
|
||||||
default=30,
|
default=30,
|
||||||
ge=1,
|
ge=1,
|
||||||
le=120,
|
le=120,
|
||||||
description="Timeout in seconds for each hook execution"
|
description="Timeout in seconds for each hook execution",
|
||||||
)
|
)
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
@@ -39,42 +40,53 @@ async def hook(page, context, **kwargs):
|
|||||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
await page.wait_for_timeout(2000)
|
await page.wait_for_timeout(2000)
|
||||||
return page
|
return page
|
||||||
"""
|
""",
|
||||||
},
|
},
|
||||||
"timeout": 30
|
"timeout": 30,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class CrawlRequestWithHooks(CrawlRequest):
|
class CrawlRequestWithHooks(CrawlRequest):
|
||||||
"""Extended crawl request with hooks support"""
|
"""Extended crawl request with hooks support"""
|
||||||
|
|
||||||
hooks: Optional[HookConfig] = Field(
|
hooks: Optional[HookConfig] = Field(
|
||||||
default=None,
|
default=None, description="Optional user-provided hook functions"
|
||||||
description="Optional user-provided hook functions"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class MarkdownRequest(BaseModel):
|
class MarkdownRequest(BaseModel):
|
||||||
"""Request body for the /md endpoint."""
|
"""Request body for the /md endpoint."""
|
||||||
url: str = Field(..., description="Absolute http/https URL to fetch")
|
|
||||||
f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
|
url: str = Field(..., description="Absolute http/https URL to fetch")
|
||||||
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
f: FilterType = Field(
|
||||||
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm"
|
||||||
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
|
)
|
||||||
temperature: Optional[float] = Field(None, description="LLM temperature override (0.0-2.0)")
|
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
||||||
|
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
||||||
|
provider: Optional[str] = Field(
|
||||||
|
None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')"
|
||||||
|
)
|
||||||
|
temperature: Optional[float] = Field(
|
||||||
|
None, description="LLM temperature override (0.0-2.0)"
|
||||||
|
)
|
||||||
base_url: Optional[str] = Field(None, description="LLM API base URL override")
|
base_url: Optional[str] = Field(None, description="LLM API base URL override")
|
||||||
|
|
||||||
|
|
||||||
class RawCode(BaseModel):
|
class RawCode(BaseModel):
|
||||||
code: str
|
code: str
|
||||||
|
|
||||||
|
|
||||||
class HTMLRequest(BaseModel):
|
class HTMLRequest(BaseModel):
|
||||||
url: str
|
url: str
|
||||||
|
|
||||||
|
|
||||||
class ScreenshotRequest(BaseModel):
|
class ScreenshotRequest(BaseModel):
|
||||||
url: str
|
url: str
|
||||||
screenshot_wait_for: Optional[float] = 2
|
screenshot_wait_for: Optional[float] = 2
|
||||||
output_path: Optional[str] = None
|
output_path: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
class PDFRequest(BaseModel):
|
class PDFRequest(BaseModel):
|
||||||
url: str
|
url: str
|
||||||
output_path: Optional[str] = None
|
output_path: Optional[str] = None
|
||||||
@@ -83,12 +95,55 @@ class PDFRequest(BaseModel):
|
|||||||
class JSEndpointRequest(BaseModel):
|
class JSEndpointRequest(BaseModel):
|
||||||
url: str
|
url: str
|
||||||
scripts: List[str] = Field(
|
scripts: List[str] = Field(
|
||||||
...,
|
..., description="List of separated JavaScript snippets to execute"
|
||||||
description="List of separated JavaScript snippets to execute"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class SeedRequest(BaseModel):
|
class SeedRequest(BaseModel):
|
||||||
"""Request model for URL seeding endpoint."""
|
"""Request model for URL seeding endpoint."""
|
||||||
|
|
||||||
url: str = Field(..., example="https://docs.crawl4ai.com")
|
url: str = Field(..., example="https://docs.crawl4ai.com")
|
||||||
config: Dict[str, Any] = Field(default_factory=dict)
|
config: Dict[str, Any] = Field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
# --- C4A Script Schemas ---
|
||||||
|
|
||||||
|
|
||||||
|
class C4AScriptPayload(BaseModel):
    """Request body that carries the text of a C4A-Script."""

    # Required: declared with `...`, so there is no default.
    script: str = Field(
        ...,
        description="The C4A-Script content to process.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# --- Adaptive Crawling Schemas ---
|
||||||
|
|
||||||
|
|
||||||
|
class AdaptiveConfigPayload(BaseModel):
    """Request-side model for the AdaptiveConfig parameters a client may set."""

    # Every field carries a default, so an empty object is a valid payload.
    confidence_threshold: float = 0.7
    max_pages: int = 20
    top_k_links: int = 3

    # Intended values: "statistical" or "embedding".
    # NOTE(review): the field is a plain str, so this model does not itself
    # restrict the value to that set.
    strategy: str = "statistical"
    embedding_model: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2"

    # Further AdaptiveConfig fields can be added here as they need exposing.
|
||||||
|
|
||||||
|
|
||||||
|
class AdaptiveCrawlRequest(BaseModel):
    """Request body used to submit an adaptive digest job."""

    # Both of the following are required (declared with `...`).
    start_url: str = Field(
        ...,
        description="The starting URL for the adaptive crawl.",
    )
    query: str = Field(
        ...,
        description="The user query to guide the crawl.",
    )

    # May be omitted, in which case the value is None.
    config: Optional[AdaptiveConfigPayload] = Field(
        default=None,
        description="Optional adaptive crawler configuration.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class AdaptiveJobStatus(BaseModel):
    """Response body reporting the status of an adaptive crawl job."""

    # Always present.
    task_id: str
    status: str

    # Optional parts of the report; each defaults to None when not supplied.
    metrics: Optional[Dict[str, Any]] = None
    result: Optional[Dict[str, Any]] = None
    error: Optional[str] = None
|
||||||
|
|||||||
@@ -20,7 +20,6 @@ from pathlib import Path
|
|||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import adaptive_routes
|
|
||||||
from api import (
|
from api import (
|
||||||
handle_crawl_request,
|
handle_crawl_request,
|
||||||
handle_llm_qa,
|
handle_llm_qa,
|
||||||
@@ -48,6 +47,7 @@ from prometheus_fastapi_instrumentator import Instrumentator
|
|||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from rank_bm25 import BM25Okapi
|
from rank_bm25 import BM25Okapi
|
||||||
from redis import asyncio as aioredis
|
from redis import asyncio as aioredis
|
||||||
|
from routers import adaptive, scripts
|
||||||
from schemas import (
|
from schemas import (
|
||||||
CrawlRequestWithHooks,
|
CrawlRequestWithHooks,
|
||||||
HTMLRequest,
|
HTMLRequest,
|
||||||
@@ -124,7 +124,6 @@ app = FastAPI(
|
|||||||
lifespan=lifespan,
|
lifespan=lifespan,
|
||||||
)
|
)
|
||||||
|
|
||||||
app.include_router(adaptive_routes.router)
|
|
||||||
# ── static playground ──────────────────────────────────────
|
# ── static playground ──────────────────────────────────────
|
||||||
STATIC_DIR = pathlib.Path(__file__).parent / "static" / "playground"
|
STATIC_DIR = pathlib.Path(__file__).parent / "static" / "playground"
|
||||||
if not STATIC_DIR.exists():
|
if not STATIC_DIR.exists():
|
||||||
@@ -219,6 +218,8 @@ def _safe_eval_config(expr: str) -> dict:
|
|||||||
|
|
||||||
# ── job router ──────────────────────────────────────────────
|
# ── job router ──────────────────────────────────────────────
|
||||||
app.include_router(init_job_router(redis, config, token_dep))
|
app.include_router(init_job_router(redis, config, token_dep))
|
||||||
|
app.include_router(adaptive.router)
|
||||||
|
app.include_router(scripts.router)
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────── Endpoints ──────────────────────────
|
# ──────────────────────── Endpoints ──────────────────────────
|
||||||
|
|||||||
Reference in New Issue
Block a user