feat(adaptive-crawling): implement adaptive crawling endpoints and job management

This commit is contained in:
AHMET YILMAZ
2025-09-30 18:17:40 +08:00
parent bb3b29042f
commit a62cfeebd9
3 changed files with 314 additions and 156 deletions

View File

@@ -7,70 +7,63 @@ Crawl4AI FastAPI entrypoint
"""
# ── stdlib & 3rdparty imports ───────────────────────────────
from crawler_pool import get_crawler, close_all, janitor
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from auth import create_access_token, get_token_dependency, TokenRequest
from pydantic import BaseModel
from typing import Optional, List, Dict, Any
from fastapi import Request, Depends
from fastapi.responses import FileResponse
import base64
import re
import os
import sys
import ast
import asyncio
from contextlib import asynccontextmanager
from pathlib import Path
from urllib.parse import urlparse
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from api import (
handle_markdown_request, handle_llm_qa,
handle_stream_crawl_request, handle_crawl_request,
stream_results, handle_seed
)
from schemas import (
CrawlRequestWithHooks,
MarkdownRequest,
RawCode,
HTMLRequest,
ScreenshotRequest,
PDFRequest,
JSEndpointRequest,
SeedRequest,
)
from utils import (
FilterType, load_config, setup_logging, verify_email_domain
)
import base64
import os
import pathlib
import re
import sys
import time
import asyncio
from typing import List
from contextlib import asynccontextmanager
import pathlib
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
from fastapi import (
FastAPI, HTTPException, Request, Path, Query, Depends
)
from rank_bm25 import BM25Okapi
from fastapi.responses import (
StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
import adaptive_routes
from api import (
handle_crawl_request,
handle_llm_qa,
handle_markdown_request,
handle_seed,
handle_stream_crawl_request,
stream_results,
)
from auth import TokenRequest, create_access_token, get_token_dependency
from crawler_pool import close_all, get_crawler, janitor
from fastapi import Depends, FastAPI, HTTPException, Path, Query, Request
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from fastapi.responses import (
FileResponse,
JSONResponse,
PlainTextResponse,
RedirectResponse,
StreamingResponse,
)
from fastapi.staticfiles import StaticFiles
from job import init_job_router
from mcp_bridge import attach_mcp, mcp_resource, mcp_template, mcp_tool
import ast
import crawl4ai as _c4
from prometheus_fastapi_instrumentator import Instrumentator
from pydantic import BaseModel, Field
from rank_bm25 import BM25Okapi
from redis import asyncio as aioredis
from schemas import (
CrawlRequestWithHooks,
HTMLRequest,
JSEndpointRequest,
MarkdownRequest,
PDFRequest,
RawCode,
ScreenshotRequest,
SeedRequest,
)
from slowapi import Limiter
from slowapi.util import get_remote_address
from prometheus_fastapi_instrumentator import Instrumentator
from redis import asyncio as aioredis
from utils import FilterType, load_config, setup_logging, verify_email_domain
import crawl4ai as _c4
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
# ── internal imports (after sys.path append) ─────────────────
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
@@ -103,6 +96,8 @@ orig_arun = AsyncWebCrawler.arun
async def capped_arun(self, *a, **kw):
async with GLOBAL_SEM:
return await orig_arun(self, *a, **kw)
AsyncWebCrawler.arun = capped_arun
# ───────────────────── FastAPI lifespan ──────────────────────
@@ -110,15 +105,18 @@ AsyncWebCrawler.arun = capped_arun
@asynccontextmanager
async def lifespan(_: FastAPI):
await get_crawler(BrowserConfig(
extra_args=config["crawler"]["browser"].get("extra_args", []),
**config["crawler"]["browser"].get("kwargs", {}),
)) # warmup
app.state.janitor = asyncio.create_task(janitor()) # idle GC
await get_crawler(
BrowserConfig(
extra_args=config["crawler"]["browser"].get("extra_args", []),
**config["crawler"]["browser"].get("kwargs", {}),
)
) # warmup
app.state.janitor = asyncio.create_task(janitor()) # idle GC
yield
app.state.janitor.cancel()
await close_all()
# ───────────────────── FastAPI instance ──────────────────────
app = FastAPI(
title=config["app"]["title"],
@@ -126,6 +124,7 @@ app = FastAPI(
lifespan=lifespan,
)
app.include_router(adaptive_routes.router)
# ── static playground ──────────────────────────────────────
STATIC_DIR = pathlib.Path(__file__).parent / "static" / "playground"
if not STATIC_DIR.exists():
@@ -141,6 +140,7 @@ app.mount(
async def root():
return RedirectResponse("/playground")
# ─────────────────── infra / middleware ─────────────────────
redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost"))
@@ -158,9 +158,7 @@ def _setup_security(app_: FastAPI):
if sec.get("https_redirect"):
app_.add_middleware(HTTPSRedirectMiddleware)
if sec.get("trusted_hosts", []) != ["*"]:
app_.add_middleware(
TrustedHostMiddleware, allowed_hosts=sec["trusted_hosts"]
)
app_.add_middleware(TrustedHostMiddleware, allowed_hosts=sec["trusted_hosts"])
_setup_security(app)
@@ -178,6 +176,7 @@ async def add_security_headers(request: Request, call_next):
resp.headers.update(config["security"]["headers"])
return resp
# ───────────────── safe configdump helper ─────────────────
ALLOWED_TYPES = {
"CrawlerRunConfig": CrawlerRunConfig,
@@ -199,9 +198,11 @@ def _safe_eval_config(expr: str) -> dict:
raise ValueError("Expression must be a single constructor call")
call = tree.body
if not (isinstance(call.func, ast.Name) and call.func.id in {"CrawlerRunConfig", "BrowserConfig"}):
raise ValueError(
"Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")
if not (
isinstance(call.func, ast.Name)
and call.func.id in {"CrawlerRunConfig", "BrowserConfig"}
):
raise ValueError("Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")
# forbid nested calls to keep the surface tiny
for node in ast.walk(call):
@@ -209,16 +210,17 @@ def _safe_eval_config(expr: str) -> dict:
raise ValueError("Nested function calls are not permitted")
# expose everything that crawl4ai exports, nothing else
safe_env = {name: getattr(_c4, name)
for name in dir(_c4) if not name.startswith("_")}
obj = eval(compile(tree, "<config>", "eval"),
{"__builtins__": {}}, safe_env)
safe_env = {
name: getattr(_c4, name) for name in dir(_c4) if not name.startswith("_")
}
obj = eval(compile(tree, "<config>", "eval"), {"__builtins__": {}}, safe_env)
return obj.dump()
# ── job router ──────────────────────────────────────────────
app.include_router(init_job_router(redis, config, token_dep))
# ──────────────────────── Endpoints ──────────────────────────
@app.post("/token")
async def get_token(req: TokenRequest):
@@ -252,9 +254,9 @@ async def seed_url(request: SeedRequest):
status_code=400,
detail="Invalid URL provided. Could not extract domain.",
)
res = await handle_seed(request.url , request.config)
return JSONResponse({"seed_url":res , "count":len(res)})
res = await handle_seed(request.url, request.config)
return JSONResponse({"seed_url": res, "count": len(res)})
except Exception as e:
print(f"❌ Error in seed_url: {e}")
raise HTTPException(status_code=500, detail=str(e))
@@ -268,21 +270,33 @@ async def get_markdown(
body: MarkdownRequest,
_td: Dict = Depends(token_dep),
):
if not body.url.startswith(("http://", "https://")) and not body.url.startswith(("raw:", "raw://")):
if not body.url.startswith(("http://", "https://")) and not body.url.startswith(
("raw:", "raw://")
):
raise HTTPException(
400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
400,
"Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)",
)
markdown = await handle_markdown_request(
body.url, body.f, body.q, body.c, config, body.provider,
body.temperature, body.base_url
body.url,
body.f,
body.q,
body.c,
config,
body.provider,
body.temperature,
body.base_url,
)
return JSONResponse(
{
"url": body.url,
"filter": body.f,
"query": body.q,
"cache": body.c,
"markdown": markdown,
"success": True,
}
)
return JSONResponse({
"url": body.url,
"filter": body.f,
"query": body.q,
"cache": body.c,
"markdown": markdown,
"success": True
})
@app.post("/html")
@@ -304,20 +318,18 @@ async def generate_html(
# Check if the crawl was successful
if not results[0].success:
raise HTTPException(
status_code=500,
detail=results[0].error_message or "Crawl failed"
status_code=500, detail=results[0].error_message or "Crawl failed"
)
raw_html = results[0].html
from crawl4ai.utils import preprocess_html_for_schema
processed_html = preprocess_html_for_schema(raw_html)
return JSONResponse({"html": processed_html, "url": body.url, "success": True})
except Exception as e:
# Log and raise as HTTP 500 for other exceptions
raise HTTPException(
status_code=500,
detail=str(e)
)
raise HTTPException(status_code=500, detail=str(e))
# Screenshot endpoint
@@ -337,13 +349,13 @@ async def generate_screenshot(
"""
try:
cfg = CrawlerRunConfig(
screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
screenshot=True, screenshot_wait_for=body.screenshot_wait_for
)
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
raise HTTPException(
status_code=500,
detail=results[0].error_message or "Crawl failed"
status_code=500, detail=results[0].error_message or "Crawl failed"
)
screenshot_data = results[0].screenshot
if body.output_path:
@@ -354,10 +366,8 @@ async def generate_screenshot(
return {"success": True, "path": abs_path}
return {"success": True, "screenshot": screenshot_data}
except Exception as e:
raise HTTPException(
status_code=500,
detail=str(e)
)
raise HTTPException(status_code=500, detail=str(e))
# PDF endpoint
@@ -381,8 +391,7 @@ async def generate_pdf(
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
raise HTTPException(
status_code=500,
detail=results[0].error_message or "Crawl failed"
status_code=500, detail=results[0].error_message or "Crawl failed"
)
pdf_data = results[0].pdf
if body.output_path:
@@ -393,10 +402,7 @@ async def generate_pdf(
return {"success": True, "path": abs_path}
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
except Exception as e:
raise HTTPException(
status_code=500,
detail=str(e)
)
raise HTTPException(status_code=500, detail=str(e))
@app.post("/execute_js")
@@ -458,17 +464,13 @@ async def execute_js(
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
raise HTTPException(
status_code=500,
detail=results[0].error_message or "Crawl failed"
status_code=500, detail=results[0].error_message or "Crawl failed"
)
# Return JSON-serializable dict of the first CrawlResult
data = results[0].model_dump()
return JSONResponse(data)
except Exception as e:
raise HTTPException(
status_code=500,
detail=str(e)
)
raise HTTPException(status_code=500, detail=str(e))
@app.get("/llm/{url:path}")
@@ -480,7 +482,9 @@ async def llm_endpoint(
):
if not q:
raise HTTPException(400, "Query parameter 'q' is required")
if not url.startswith(("http://", "https://")) and not url.startswith(("raw:", "raw://")):
if not url.startswith(("http://", "https://")) and not url.startswith(
("raw:", "raw://")
):
url = "https://" + url
answer = await handle_llm_qa(url, q, config)
return JSONResponse({"answer": answer})
@@ -489,31 +493,29 @@ async def llm_endpoint(
@app.get("/schema")
async def get_schema():
from crawl4ai import BrowserConfig, CrawlerRunConfig
return {"browser": BrowserConfig().dump(),
"crawler": CrawlerRunConfig().dump()}
return {"browser": BrowserConfig().dump(), "crawler": CrawlerRunConfig().dump()}
@app.get("/hooks/info")
async def get_hooks_info():
"""Get information about available hook points and their signatures"""
from hook_manager import UserHookManager
hook_info = {}
for hook_point, params in UserHookManager.HOOK_SIGNATURES.items():
hook_info[hook_point] = {
"parameters": params,
"description": get_hook_description(hook_point),
"example": get_hook_example(hook_point)
"example": get_hook_example(hook_point),
}
return JSONResponse({
"available_hooks": hook_info,
"timeout_limits": {
"min": 1,
"max": 120,
"default": 30
return JSONResponse(
{
"available_hooks": hook_info,
"timeout_limits": {"min": 1, "max": 120, "default": 30},
}
})
)
def get_hook_description(hook_point: str) -> str:
@@ -526,7 +528,7 @@ def get_hook_description(hook_point: str) -> str:
"on_user_agent_updated": "Called when user agent is updated",
"on_execution_started": "Called when custom JavaScript execution begins",
"before_retrieve_html": "Called before retrieving the final HTML - ideal for scrolling",
"before_return_html": "Called just before returning the HTML content"
"before_return_html": "Called just before returning the HTML content",
}
return descriptions.get(hook_point, "")
@@ -542,19 +544,17 @@ def get_hook_example(hook_point: str) -> str:
'domain': '.example.com'
}])
return page""",
"before_retrieve_html": """async def hook(page, context, **kwargs):
# Scroll to load lazy content
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(2000)
return page""",
"before_goto": """async def hook(page, context, url, **kwargs):
# Set custom headers
await page.set_extra_http_headers({
'X-Custom-Header': 'value'
})
return page"""
return page""",
}
return examples.get(hook_point, "# Implement your hook logic here\nreturn page")
@@ -588,25 +588,27 @@ async def crawl(
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
if crawler_config.stream:
return await stream_process(crawl_request=crawl_request)
# Prepare hooks config if provided
hooks_config = None
if crawl_request.hooks:
hooks_config = {
'code': crawl_request.hooks.code,
'timeout': crawl_request.hooks.timeout
"code": crawl_request.hooks.code,
"timeout": crawl_request.hooks.timeout,
}
results = await handle_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config,
hooks_config=hooks_config
hooks_config=hooks_config,
)
# check if all of the results are not successful
if all(not result["success"] for result in results["results"]):
raise HTTPException(500, f"Crawl request failed: {results['results'][0]['error_message']}")
raise HTTPException(
500, f"Crawl request failed: {results['results'][0]['error_message']}"
)
return JSONResponse(results)
@@ -622,24 +624,24 @@ async def crawl_stream(
return await stream_process(crawl_request=crawl_request)
async def stream_process(crawl_request: CrawlRequestWithHooks):
# Prepare hooks config if provided# Prepare hooks config if provided
hooks_config = None
if crawl_request.hooks:
hooks_config = {
'code': crawl_request.hooks.code,
'timeout': crawl_request.hooks.timeout
"code": crawl_request.hooks.code,
"timeout": crawl_request.hooks.timeout,
}
crawler, gen, hooks_info = await handle_stream_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config,
hooks_config=hooks_config
hooks_config=hooks_config,
)
# Add hooks info to response headers if available
headers = {
"Cache-Control": "no-cache",
@@ -648,8 +650,9 @@ async def stream_process(crawl_request: CrawlRequestWithHooks):
}
if hooks_info:
import json
headers["X-Hooks-Status"] = json.dumps(hooks_info['status']['status'])
headers["X-Hooks-Status"] = json.dumps(hooks_info["status"]["status"])
return StreamingResponse(
stream_results(crawler, gen),
media_type="application/x-ndjson",
@@ -661,10 +664,10 @@ def chunk_code_functions(code_md: str) -> List[str]:
"""Extract each function/class from markdown code blocks per file."""
pattern = re.compile(
# match "## File: <path>" then a ```py fence, then capture until the closing ```
r'##\s*File:\s*(?P<path>.+?)\s*?\r?\n' # file header
r'```py\s*?\r?\n' # opening fence
r'(?P<code>.*?)(?=\r?\n```)', # code block
re.DOTALL
r"##\s*File:\s*(?P<path>.+?)\s*?\r?\n" # file header
r"```py\s*?\r?\n" # opening fence
r"(?P<code>.*?)(?=\r?\n```)", # code block
re.DOTALL,
)
chunks: List[str] = []
for m in pattern.finditer(code_md):
@@ -704,15 +707,14 @@ async def get_context(
request: Request,
_td: Dict = Depends(token_dep),
context_type: str = Query("all", regex="^(code|doc|all)$"),
query: Optional[str] = Query(
None, description="search query to filter chunks"),
query: Optional[str] = Query(None, description="search query to filter chunks"),
score_ratio: float = Query(
0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"),
max_results: int = Query(
20, ge=1, description="absolute cap on returned chunks"),
0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"
),
max_results: int = Query(20, ge=1, description="absolute cap on returned chunks"),
):
"""
This end point is design for any questions about Crawl4ai library. It returns a plain text markdown with extensive information about Crawl4ai.
This end point is design for any questions about Crawl4ai library. It returns a plain text markdown with extensive information about Crawl4ai.
You can use this as a context for any AI assistant. Use this endpoint for AI assistants to retrieve library context for decision making or code generation tasks.
Alway is BEST practice you provide a query to filter the context. Otherwise the lenght of the response will be very long.
@@ -746,10 +748,12 @@ async def get_context(
return JSONResponse({"code_context": code_content})
if context_type == "doc":
return JSONResponse({"doc_context": doc_content})
return JSONResponse({
"code_context": code_content,
"doc_context": doc_content,
})
return JSONResponse(
{
"code_context": code_content,
"doc_context": doc_content,
}
)
tokens = query.split()
results: Dict[str, List[Dict[str, float]]] = {}
@@ -773,7 +777,7 @@ async def get_context(
max_sd = float(scores_d.max()) if scores_d.size > 0 else 0.0
cutoff_d = max_sd * score_ratio
idxs = [i for i, s in enumerate(scores_d) if s >= cutoff_d]
neighbors = set(i for idx in idxs for i in (idx-1, idx, idx+1))
neighbors = set(i for idx in idxs for i in (idx - 1, idx, idx + 1))
valid = [i for i in sorted(neighbors) if 0 <= i < len(sections)]
valid = valid[:max_results]
results["doc_results"] = [
@@ -785,14 +789,12 @@ async def get_context(
# attach MCP layer (adds /mcp/ws, /mcp/sse, /mcp/schema)
print(f"MCP server running on {config['app']['host']}:{config['app']['port']}")
attach_mcp(
app,
base_url=f"http://{config['app']['host']}:{config['app']['port']}"
)
attach_mcp(app, base_url=f"http://{config['app']['host']}:{config['app']['port']}")
# ────────────────────────── cli ──────────────────────────────
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"server:app",
host=config["app"]["host"],