feat(adaptive-crawling): implement adaptive crawling endpoints and job management

2025-09-30 18:17:40 +08:00
parent bb3b29042f
commit a62cfeebd9
3 changed files with 314 additions and 156 deletions
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -7,70 +7,63 @@ Crawl4AI FastAPI entry‑point
 """

 # ── stdlib & 3rd‑party imports ───────────────────────────────
-from crawler_pool import get_crawler, close_all, janitor
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-from auth import create_access_token, get_token_dependency, TokenRequest
-from pydantic import BaseModel
-from typing import Optional, List, Dict, Any
-from fastapi import Request, Depends
-from fastapi.responses import FileResponse
-import base64
-import re
-import os
-import sys
+import ast
 import asyncio
-from contextlib import asynccontextmanager
-from pathlib import Path
-from urllib.parse import urlparse
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-from api import (
-    handle_markdown_request, handle_llm_qa,
-    handle_stream_crawl_request, handle_crawl_request,
-    stream_results, handle_seed
-)
-from schemas import (
-    CrawlRequestWithHooks,
-    MarkdownRequest,
-    RawCode,
-    HTMLRequest,
-    ScreenshotRequest,
-    PDFRequest,
-    JSEndpointRequest,
-    SeedRequest,
-)
-
-from utils import (
-    FilterType, load_config, setup_logging, verify_email_domain
-)
+import base64
 import os
+import pathlib
+import re
 import sys
 import time
-import asyncio
-from typing import List
 from contextlib import asynccontextmanager
-import pathlib
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse

-from fastapi import (
-    FastAPI, HTTPException, Request, Path, Query, Depends
-)
-from rank_bm25 import BM25Okapi
-from fastapi.responses import (
-    StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
+import adaptive_routes
+from api import (
+    handle_crawl_request,
+    handle_llm_qa,
+    handle_markdown_request,
+    handle_seed,
+    handle_stream_crawl_request,
+    stream_results,
 )
+from auth import TokenRequest, create_access_token, get_token_dependency
+from crawler_pool import close_all, get_crawler, janitor
+from fastapi import Depends, FastAPI, HTTPException, Path, Query, Request
 from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
 from fastapi.middleware.trustedhost import TrustedHostMiddleware
+from fastapi.responses import (
+    FileResponse,
+    JSONResponse,
+    PlainTextResponse,
+    RedirectResponse,
+    StreamingResponse,
+)
 from fastapi.staticfiles import StaticFiles
 from job import init_job_router
-
 from mcp_bridge import attach_mcp, mcp_resource, mcp_template, mcp_tool
-
-import ast
-import crawl4ai as _c4
+from prometheus_fastapi_instrumentator import Instrumentator
 from pydantic import BaseModel, Field
+from rank_bm25 import BM25Okapi
+from redis import asyncio as aioredis
+from schemas import (
+    CrawlRequestWithHooks,
+    HTMLRequest,
+    JSEndpointRequest,
+    MarkdownRequest,
+    PDFRequest,
+    RawCode,
+    ScreenshotRequest,
+    SeedRequest,
+)
 from slowapi import Limiter
 from slowapi.util import get_remote_address
-from prometheus_fastapi_instrumentator import Instrumentator
-from redis import asyncio as aioredis
+from utils import FilterType, load_config, setup_logging, verify_email_domain
+
+import crawl4ai as _c4
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

 # ── internal imports (after sys.path append) ─────────────────
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
@@ -103,6 +96,8 @@ orig_arun = AsyncWebCrawler.arun
 async def capped_arun(self, *a, **kw):
    async with GLOBAL_SEM:
        return await orig_arun(self, *a, **kw)
+
+
 AsyncWebCrawler.arun = capped_arun

 # ───────────────────── FastAPI lifespan ──────────────────────
@@ -110,15 +105,18 @@ AsyncWebCrawler.arun = capped_arun

@asynccontextmanager
 async def lifespan(_: FastAPI):
-    await get_crawler(BrowserConfig(
-        extra_args=config["crawler"]["browser"].get("extra_args", []),
-        **config["crawler"]["browser"].get("kwargs", {}),
-    ))           # warm‑up
-    app.state.janitor = asyncio.create_task(janitor())        # idle GC
+    await get_crawler(
+        BrowserConfig(
+            extra_args=config["crawler"]["browser"].get("extra_args", []),
+            **config["crawler"]["browser"].get("kwargs", {}),
+        )
+    )  # warm‑up
+    app.state.janitor = asyncio.create_task(janitor())  # idle GC
    yield
    app.state.janitor.cancel()
    await close_all()

+
 # ───────────────────── FastAPI instance ──────────────────────
 app = FastAPI(
    title=config["app"]["title"],
@@ -126,6 +124,7 @@ app = FastAPI(
    lifespan=lifespan,
 )

+app.include_router(adaptive_routes.router)
 # ── static playground ──────────────────────────────────────
 STATIC_DIR = pathlib.Path(__file__).parent / "static" / "playground"
 if not STATIC_DIR.exists():
@@ -141,6 +140,7 @@ app.mount(
 async def root():
    return RedirectResponse("/playground")

+
 # ─────────────────── infra / middleware  ─────────────────────
 redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost"))

@@ -158,9 +158,7 @@ def _setup_security(app_: FastAPI):
    if sec.get("https_redirect"):
        app_.add_middleware(HTTPSRedirectMiddleware)
    if sec.get("trusted_hosts", []) != ["*"]:
-        app_.add_middleware(
-            TrustedHostMiddleware, allowed_hosts=sec["trusted_hosts"]
-        )
+        app_.add_middleware(TrustedHostMiddleware, allowed_hosts=sec["trusted_hosts"])


 _setup_security(app)
@@ -178,6 +176,7 @@ async def add_security_headers(request: Request, call_next):
        resp.headers.update(config["security"]["headers"])
    return resp

+
 # ───────────────── safe config‑dump helper ─────────────────
 ALLOWED_TYPES = {
    "CrawlerRunConfig": CrawlerRunConfig,
@@ -199,9 +198,11 @@ def _safe_eval_config(expr: str) -> dict:
        raise ValueError("Expression must be a single constructor call")

    call = tree.body
-    if not (isinstance(call.func, ast.Name) and call.func.id in {"CrawlerRunConfig", "BrowserConfig"}):
-        raise ValueError(
-            "Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")
+    if not (
+        isinstance(call.func, ast.Name)
+        and call.func.id in {"CrawlerRunConfig", "BrowserConfig"}
+    ):
+        raise ValueError("Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")

    # forbid nested calls to keep the surface tiny
    for node in ast.walk(call):
@@ -209,16 +210,17 @@ def _safe_eval_config(expr: str) -> dict:
            raise ValueError("Nested function calls are not permitted")

    # expose everything that crawl4ai exports, nothing else
-    safe_env = {name: getattr(_c4, name)
-                for name in dir(_c4) if not name.startswith("_")}
-    obj = eval(compile(tree, "<config>", "eval"),
-               {"__builtins__": {}}, safe_env)
+    safe_env = {
+        name: getattr(_c4, name) for name in dir(_c4) if not name.startswith("_")
+    }
+    obj = eval(compile(tree, "<config>", "eval"), {"__builtins__": {}}, safe_env)
    return obj.dump()


 # ── job router ──────────────────────────────────────────────
 app.include_router(init_job_router(redis, config, token_dep))

+
 # ──────────────────────── Endpoints ──────────────────────────
@app.post("/token")
 async def get_token(req: TokenRequest):
@@ -252,9 +254,9 @@ async def seed_url(request: SeedRequest):
                status_code=400,
                detail="Invalid URL provided. Could not extract domain.",
            )
-        res = await handle_seed(request.url , request.config)
-        return JSONResponse({"seed_url":res , "count":len(res)})
-        
+        res = await handle_seed(request.url, request.config)
+        return JSONResponse({"seed_url": res, "count": len(res)})
+
    except Exception as e:
        print(f"❌ Error in seed_url: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@@ -268,21 +270,33 @@ async def get_markdown(
    body: MarkdownRequest,
    _td: Dict = Depends(token_dep),
 ):
-    if not body.url.startswith(("http://", "https://")) and not body.url.startswith(("raw:", "raw://")):
+    if not body.url.startswith(("http://", "https://")) and not body.url.startswith(
+        ("raw:", "raw://")
+    ):
        raise HTTPException(
-            400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
+            400,
+            "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)",
+        )
    markdown = await handle_markdown_request(
-        body.url, body.f, body.q, body.c, config, body.provider,
-        body.temperature, body.base_url
+        body.url,
+        body.f,
+        body.q,
+        body.c,
+        config,
+        body.provider,
+        body.temperature,
+        body.base_url,
+    )
+    return JSONResponse(
+        {
+            "url": body.url,
+            "filter": body.f,
+            "query": body.q,
+            "cache": body.c,
+            "markdown": markdown,
+            "success": True,
+        }
    )
-    return JSONResponse({
-        "url": body.url,
-        "filter": body.f,
-        "query": body.q,
-        "cache": body.c,
-        "markdown": markdown,
-        "success": True
-    })


@app.post("/html")
@@ -304,20 +318,18 @@ async def generate_html(
        # Check if the crawl was successful
        if not results[0].success:
            raise HTTPException(
-                status_code=500,
-                detail=results[0].error_message or "Crawl failed"
+                status_code=500, detail=results[0].error_message or "Crawl failed"
            )
-        
+
        raw_html = results[0].html
        from crawl4ai.utils import preprocess_html_for_schema
+
        processed_html = preprocess_html_for_schema(raw_html)
        return JSONResponse({"html": processed_html, "url": body.url, "success": True})
    except Exception as e:
        # Log and raise as HTTP 500 for other exceptions
-        raise HTTPException(
-            status_code=500,
-            detail=str(e)
-        )
+        raise HTTPException(status_code=500, detail=str(e))
+

 # Screenshot endpoint

@@ -337,13 +349,13 @@ async def generate_screenshot(
    """
    try:
        cfg = CrawlerRunConfig(
-            screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
+            screenshot=True, screenshot_wait_for=body.screenshot_wait_for
+        )
        async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
            results = await crawler.arun(url=body.url, config=cfg)
        if not results[0].success:
            raise HTTPException(
-                status_code=500,
-                detail=results[0].error_message or "Crawl failed"
+                status_code=500, detail=results[0].error_message or "Crawl failed"
            )
        screenshot_data = results[0].screenshot
        if body.output_path:
@@ -354,10 +366,8 @@ async def generate_screenshot(
            return {"success": True, "path": abs_path}
        return {"success": True, "screenshot": screenshot_data}
    except Exception as e:
-        raise HTTPException(
-            status_code=500,
-            detail=str(e)
-        )
+        raise HTTPException(status_code=500, detail=str(e))
+

 # PDF endpoint

@@ -381,8 +391,7 @@ async def generate_pdf(
            results = await crawler.arun(url=body.url, config=cfg)
        if not results[0].success:
            raise HTTPException(
-                status_code=500,
-                detail=results[0].error_message or "Crawl failed"
+                status_code=500, detail=results[0].error_message or "Crawl failed"
            )
        pdf_data = results[0].pdf
        if body.output_path:
@@ -393,10 +402,7 @@ async def generate_pdf(
            return {"success": True, "path": abs_path}
        return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
    except Exception as e:
-        raise HTTPException(
-            status_code=500,
-            detail=str(e)
-        )
+        raise HTTPException(status_code=500, detail=str(e))


@app.post("/execute_js")
@@ -458,17 +464,13 @@ async def execute_js(
            results = await crawler.arun(url=body.url, config=cfg)
        if not results[0].success:
            raise HTTPException(
-                status_code=500,
-                detail=results[0].error_message or "Crawl failed"
+                status_code=500, detail=results[0].error_message or "Crawl failed"
            )
        # Return JSON-serializable dict of the first CrawlResult
        data = results[0].model_dump()
        return JSONResponse(data)
    except Exception as e:
-        raise HTTPException(
-            status_code=500,
-            detail=str(e)
-        )
+        raise HTTPException(status_code=500, detail=str(e))


@app.get("/llm/{url:path}")
@@ -480,7 +482,9 @@ async def llm_endpoint(
 ):
    if not q:
        raise HTTPException(400, "Query parameter 'q' is required")
-    if not url.startswith(("http://", "https://")) and not url.startswith(("raw:", "raw://")):
+    if not url.startswith(("http://", "https://")) and not url.startswith(
+        ("raw:", "raw://")
+    ):
        url = "https://" + url
    answer = await handle_llm_qa(url, q, config)
    return JSONResponse({"answer": answer})
@@ -489,31 +493,29 @@ async def llm_endpoint(
@app.get("/schema")
 async def get_schema():
    from crawl4ai import BrowserConfig, CrawlerRunConfig
-    return {"browser": BrowserConfig().dump(),
-            "crawler": CrawlerRunConfig().dump()}
+
+    return {"browser": BrowserConfig().dump(), "crawler": CrawlerRunConfig().dump()}


@app.get("/hooks/info")
 async def get_hooks_info():
    """Get information about available hook points and their signatures"""
    from hook_manager import UserHookManager
-    
+
    hook_info = {}
    for hook_point, params in UserHookManager.HOOK_SIGNATURES.items():
        hook_info[hook_point] = {
            "parameters": params,
            "description": get_hook_description(hook_point),
-            "example": get_hook_example(hook_point)
+            "example": get_hook_example(hook_point),
        }
-    
-    return JSONResponse({
-        "available_hooks": hook_info,
-        "timeout_limits": {
-            "min": 1,
-            "max": 120,
-            "default": 30
+
+    return JSONResponse(
+        {
+            "available_hooks": hook_info,
+            "timeout_limits": {"min": 1, "max": 120, "default": 30},
        }
-    })
+    )


 def get_hook_description(hook_point: str) -> str:
@@ -526,7 +528,7 @@ def get_hook_description(hook_point: str) -> str:
        "on_user_agent_updated": "Called when user agent is updated",
        "on_execution_started": "Called when custom JavaScript execution begins",
        "before_retrieve_html": "Called before retrieving the final HTML - ideal for scrolling",
-        "before_return_html": "Called just before returning the HTML content"
+        "before_return_html": "Called just before returning the HTML content",
    }
    return descriptions.get(hook_point, "")

@@ -542,19 +544,17 @@ def get_hook_example(hook_point: str) -> str:
        'domain': '.example.com'
    }])
    return page""",
-        
        "before_retrieve_html": """async def hook(page, context, **kwargs):
    # Scroll to load lazy content
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(2000)
    return page""",
-        
        "before_goto": """async def hook(page, context, url, **kwargs):
    # Set custom headers
    await page.set_extra_http_headers({
        'X-Custom-Header': 'value'
    })
-    return page"""
+    return page""",
    }
    return examples.get(hook_point, "# Implement your hook logic here\nreturn page")

@@ -588,25 +588,27 @@ async def crawl(
    crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
    if crawler_config.stream:
        return await stream_process(crawl_request=crawl_request)
-    
+
    # Prepare hooks config if provided
    hooks_config = None
    if crawl_request.hooks:
        hooks_config = {
-            'code': crawl_request.hooks.code,
-            'timeout': crawl_request.hooks.timeout
+            "code": crawl_request.hooks.code,
+            "timeout": crawl_request.hooks.timeout,
        }
-    
+
    results = await handle_crawl_request(
        urls=crawl_request.urls,
        browser_config=crawl_request.browser_config,
        crawler_config=crawl_request.crawler_config,
        config=config,
-        hooks_config=hooks_config
+        hooks_config=hooks_config,
    )
    # check if all of the results are not successful
    if all(not result["success"] for result in results["results"]):
-        raise HTTPException(500, f"Crawl request failed: {results['results'][0]['error_message']}")
+        raise HTTPException(
+            500, f"Crawl request failed: {results['results'][0]['error_message']}"
+        )
    return JSONResponse(results)


@@ -622,24 +624,24 @@ async def crawl_stream(

    return await stream_process(crawl_request=crawl_request)

+
 async def stream_process(crawl_request: CrawlRequestWithHooks):
-    
    # Prepare hooks config if provided# Prepare hooks config if provided
    hooks_config = None
    if crawl_request.hooks:
        hooks_config = {
-            'code': crawl_request.hooks.code,
-            'timeout': crawl_request.hooks.timeout
+            "code": crawl_request.hooks.code,
+            "timeout": crawl_request.hooks.timeout,
        }
-    
+
    crawler, gen, hooks_info = await handle_stream_crawl_request(
        urls=crawl_request.urls,
        browser_config=crawl_request.browser_config,
        crawler_config=crawl_request.crawler_config,
        config=config,
-        hooks_config=hooks_config
+        hooks_config=hooks_config,
    )
-    
+
    # Add hooks info to response headers if available
    headers = {
        "Cache-Control": "no-cache",
@@ -648,8 +650,9 @@ async def stream_process(crawl_request: CrawlRequestWithHooks):
    }
    if hooks_info:
        import json
-        headers["X-Hooks-Status"] = json.dumps(hooks_info['status']['status'])
-    
+
+        headers["X-Hooks-Status"] = json.dumps(hooks_info["status"]["status"])
+
    return StreamingResponse(
        stream_results(crawler, gen),
        media_type="application/x-ndjson",
@@ -661,10 +664,10 @@ def chunk_code_functions(code_md: str) -> List[str]:
    """Extract each function/class from markdown code blocks per file."""
    pattern = re.compile(
        # match "## File: <path>" then a ```py fence, then capture until the closing ```
-        r'##\s*File:\s*(?P<path>.+?)\s*?\r?\n'      # file header
-        r'```py\s*?\r?\n'                         # opening fence
-        r'(?P<code>.*?)(?=\r?\n```)',             # code block
-        re.DOTALL
+        r"##\s*File:\s*(?P<path>.+?)\s*?\r?\n"  # file header
+        r"```py\s*?\r?\n"  # opening fence
+        r"(?P<code>.*?)(?=\r?\n```)",  # code block
+        re.DOTALL,
    )
    chunks: List[str] = []
    for m in pattern.finditer(code_md):
@@ -704,15 +707,14 @@ async def get_context(
    request: Request,
    _td: Dict = Depends(token_dep),
    context_type: str = Query("all", regex="^(code|doc|all)$"),
-    query: Optional[str] = Query(
-        None, description="search query to filter chunks"),
+    query: Optional[str] = Query(None, description="search query to filter chunks"),
    score_ratio: float = Query(
-        0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"),
-    max_results: int = Query(
-        20, ge=1, description="absolute cap on returned chunks"),
+        0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"
+    ),
+    max_results: int = Query(20, ge=1, description="absolute cap on returned chunks"),
 ):
    """
-    This end point is design for any questions about Crawl4ai library. It returns a plain text markdown with extensive information about Crawl4ai. 
+    This end point is design for any questions about Crawl4ai library. It returns a plain text markdown with extensive information about Crawl4ai.
    You can use this as a context for any AI assistant. Use this endpoint for AI assistants to retrieve library context for decision making or code generation tasks.
    Alway is BEST practice you provide a query to filter the context. Otherwise the lenght of the response will be very long.

@@ -746,10 +748,12 @@ async def get_context(
            return JSONResponse({"code_context": code_content})
        if context_type == "doc":
            return JSONResponse({"doc_context": doc_content})
-        return JSONResponse({
-            "code_context": code_content,
-            "doc_context": doc_content,
-        })
+        return JSONResponse(
+            {
+                "code_context": code_content,
+                "doc_context": doc_content,
+            }
+        )

    tokens = query.split()
    results: Dict[str, List[Dict[str, float]]] = {}
@@ -773,7 +777,7 @@ async def get_context(
        max_sd = float(scores_d.max()) if scores_d.size > 0 else 0.0
        cutoff_d = max_sd * score_ratio
        idxs = [i for i, s in enumerate(scores_d) if s >= cutoff_d]
-        neighbors = set(i for idx in idxs for i in (idx-1, idx, idx+1))
+        neighbors = set(i for idx in idxs for i in (idx - 1, idx, idx + 1))
        valid = [i for i in sorted(neighbors) if 0 <= i < len(sections)]
        valid = valid[:max_results]
        results["doc_results"] = [
@@ -785,14 +789,12 @@ async def get_context(

 # attach MCP layer (adds /mcp/ws, /mcp/sse, /mcp/schema)
 print(f"MCP server running on {config['app']['host']}:{config['app']['port']}")
-attach_mcp(
-    app,
-    base_url=f"http://{config['app']['host']}:{config['app']['port']}"
-)
+attach_mcp(app, base_url=f"http://{config['app']['host']}:{config['app']['port']}")

 # ────────────────────────── cli ──────────────────────────────
 if __name__ == "__main__":
    import uvicorn
+
    uvicorn.run(
        "server:app",
        host=config["app"]["host"],