feat(mcp): Implement MCP protocol and enhance server capabilities

This commit introduces several significant enhancements to the Crawl4AI Docker deployment:

  1. Add MCP Protocol Support:
     - Implement WebSocket and SSE transport layers for MCP server communication
     - Create mcp_bridge.py to expose existing API endpoints via MCP protocol
     - Add comprehensive tests for both socket and SSE transport methods

  2. Enhance Docker Server Capabilities:
     - Add PDF generation endpoint with file saving functionality
     - Add screenshot capture endpoint with configurable wait time
     - Implement JavaScript execution endpoint for dynamic page interaction
     - Add intelligent file path handling for saving generated assets

  3. Improve Search and Context Functionality:
     - Implement syntax-aware code function chunking using AST parsing
     - Add BM25-based intelligent document search with relevance scoring
     - Create separate code and documentation context endpoints
     - Enhance response format with structured results and scores

  4. Rename and Fix File Organization:
     - Fix typo in test_docker_config_gen.py filename
     - Update import statements and dependencies
     - Add FileResponse for context endpoints

  This enhancement significantly improves the machine-to-machine communication
  capabilities of Crawl4AI, making it more suitable for integration with LLM agents
  and other automated systems.

  The CHANGELOG update has been applied successfully, highlighting the key features and improvements made in this release. The commit message provides a detailed explanation of all the
  changes, which will be helpful for tracking the project's evolution.
This commit is contained in:
UncleCode
2025-04-21 22:22:02 +08:00
parent a58c8000aa
commit 5297e362f3
9 changed files with 21327 additions and 30 deletions

View File

@@ -7,14 +7,47 @@ Crawl4AI FastAPI entrypoint
"""
# ── stdlib & 3rd-party imports ───────────────────────────────
import os, sys, time, asyncio
from typing import List, Optional, Dict
from crawler_pool import get_crawler, close_all, janitor
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from auth import create_access_token, get_token_dependency, TokenRequest
from pydantic import BaseModel
from typing import Optional, List, Dict
from fastapi import Request, Depends
from fastapi.responses import FileResponse
import base64
import re
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from api import (
handle_markdown_request, handle_llm_qa,
handle_stream_crawl_request, handle_crawl_request,
stream_results
)
from utils import (
FilterType, load_config, setup_logging, verify_email_domain
)
import os
import sys
import time
import asyncio
from typing import List
from contextlib import asynccontextmanager
import pathlib
from fastapi import (
FastAPI, HTTPException, Request, Path, Query, Depends
)
from rank_bm25 import BM25Okapi
def chunk_code_functions(code: str) -> List[str]:
    """Split Python source into one text chunk per top-level function or class.

    Only direct children of the module body are considered; nested
    definitions stay inside their parent's chunk. Decorator lines are kept
    with the definition they decorate.

    Args:
        code: Python source text.

    Returns:
        Source snippets, in file order, one per top-level ``def``,
        ``async def`` or ``class``.

    Raises:
        SyntaxError: if ``code`` is not valid Python (propagated from
            ``ast.parse``).
    """
    tree = ast.parse(code)
    lines = code.splitlines()
    chunks: List[str] = []
    for node in tree.body:
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            continue
        # On Python 3.8+ ``node.lineno`` is the ``def``/``class`` line, not the
        # first decorator, so take the earliest line among the decorators too.
        first_line = min([node.lineno] + [dec.lineno for dec in node.decorator_list])
        start = first_line - 1
        # Fall back to a single-line chunk when end positions are unavailable.
        end = getattr(node, "end_lineno", node.lineno)
        chunks.append("\n".join(lines[start:end]))
    return chunks
from fastapi.responses import (
StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
)
@@ -22,7 +55,10 @@ from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from fastapi.staticfiles import StaticFiles
import ast, crawl4ai as _c4
from mcp_bridge import attach_mcp, mcp_resource, mcp_template, mcp_tool
import ast
import crawl4ai as _c4
from pydantic import BaseModel, Field
from slowapi import Limiter
from slowapi.util import get_remote_address
@@ -31,17 +67,6 @@ from redis import asyncio as aioredis
# ── internal imports (after sys.path append) ─────────────────
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
from utils import (
FilterType, load_config, setup_logging, verify_email_domain
)
from api import (
handle_markdown_request, handle_llm_qa,
handle_stream_crawl_request, handle_crawl_request,
stream_results
)
from auth import create_access_token, get_token_dependency, TokenRequest
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawler_pool import get_crawler, close_all, janitor
# ────────────────── configuration / logging ──────────────────
config = load_config()
@@ -66,12 +91,16 @@ GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)
# GLOBAL_SEM.release() # ← free slot
orig_arun = AsyncWebCrawler.arun
async def capped_arun(self, *args, **kwargs):
    # Replacement for AsyncWebCrawler.arun: holds a GLOBAL_SEM slot for the
    # whole duration of the original call, then hands back its result.
    async with GLOBAL_SEM:
        outcome = await orig_arun(self, *args, **kwargs)
    return outcome
AsyncWebCrawler.arun = capped_arun
# ───────────────────── FastAPI lifespan ──────────────────────
@asynccontextmanager
async def lifespan(_: FastAPI):
await get_crawler(BrowserConfig(
@@ -101,6 +130,8 @@ app.mount(
)
# Optional nice-to-have: opening the root shows the playground
@app.get("/")
async def root():
    # Send visitors of the bare root URL to the playground UI.
    playground_path = "/playground"
    return RedirectResponse(playground_path)
@@ -114,6 +145,7 @@ limiter = Limiter(
storage_uri=config["rate_limiting"]["storage_uri"],
)
def _setup_security(app_: FastAPI):
sec = config["security"]
if not sec["enabled"]:
@@ -124,6 +156,8 @@ def _setup_security(app_: FastAPI):
app_.add_middleware(
TrustedHostMiddleware, allowed_hosts=sec["trusted_hosts"]
)
_setup_security(app)
if config["observability"]["prometheus"]["enabled"]:
@@ -131,6 +165,7 @@ if config["observability"]["prometheus"]["enabled"]:
token_dep = get_token_dependency(config)
@app.middleware("http")
async def add_security_headers(request: Request, call_next):
resp = await call_next(request)
@@ -144,6 +179,7 @@ ALLOWED_TYPES = {
"BrowserConfig": BrowserConfig,
}
def _safe_eval_config(expr: str) -> dict:
"""
Accept exactly one toplevel call to CrawlerRunConfig(...) or BrowserConfig(...).
@@ -159,7 +195,8 @@ def _safe_eval_config(expr: str) -> dict:
call = tree.body
if not (isinstance(call.func, ast.Name) and call.func.id in {"CrawlerRunConfig", "BrowserConfig"}):
raise ValueError("Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")
raise ValueError(
"Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")
# forbid nested calls to keep the surface tiny
for node in ast.walk(call):
@@ -167,8 +204,10 @@ def _safe_eval_config(expr: str) -> dict:
raise ValueError("Nested function calls are not permitted")
# expose everything that crawl4ai exports, nothing else
safe_env = {name: getattr(_c4, name) for name in dir(_c4) if not name.startswith("_")}
obj = eval(compile(tree, "<config>", "eval"), {"__builtins__": {}}, safe_env)
safe_env = {name: getattr(_c4, name)
for name in dir(_c4) if not name.startswith("_")}
obj = eval(compile(tree, "<config>", "eval"),
{"__builtins__": {}}, safe_env)
return obj.dump()
@@ -178,10 +217,42 @@ class CrawlRequest(BaseModel):
browser_config: Optional[Dict] = Field(default_factory=dict)
crawler_config: Optional[Dict] = Field(default_factory=dict)
# ────────────── Schemas ──────────────
class MarkdownRequest(BaseModel):
    """Request body for the /md endpoint."""
    # Target page; the /md handler rejects values that do not start with
    # http:// or https://.
    url: str = Field(..., description="Absolute http/https URL to fetch")
    # Filter strategy forwarded to handle_markdown_request; defaults to FIT.
    f: FilterType = Field(FilterType.FIT,
                          description="Contentfilter strategy: FIT, RAW, BM25, or LLM")
    # Optional query, forwarded unchanged to handle_markdown_request.
    q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
    # Cache/revision marker, forwarded unchanged; note it is a string, not an int.
    c: Optional[str] = Field("0", description="Cachebust / revision counter")
# Request body for /config/dump: a single string of source text.
class RawCode(BaseModel):
    code: str
# Request body for /html: the page to crawl.
class HTMLRequest(BaseModel):
    url: str
# Request body for /screenshot.
class ScreenshotRequest(BaseModel):
    # Page to capture.
    url: str
    # Passed straight to CrawlerRunConfig(screenshot_wait_for=...); defaults to 2.
    # NOTE(review): unit is presumably seconds — confirm against crawl4ai docs.
    screenshot_wait_for: Optional[float] = 2
    # When set, the handler writes the decoded image to this path on the server
    # and returns the absolute path instead of the base64 payload.
    output_path: Optional[str] = None
# Request body for /pdf.
class PDFRequest(BaseModel):
    # Page to render.
    url: str
    # When set, the handler writes the PDF bytes to this path on the server
    # and returns the absolute path instead of the base64 payload.
    output_path: Optional[str] = None
# Request body for /execute_js.
class JSEndpointRequest(BaseModel):
    # Page on which the scripts run.
    url: str
    # Required; handed to CrawlerRunConfig(js_code=...) as-is.
    scripts: List[str] = Field(
        ...,
        description="List of separated JavaScript snippets to execute"
    )
# ──────────────────────── Endpoints ──────────────────────────
@app.post("/token")
async def get_token(req: TokenRequest):
if not verify_email_domain(req.email):
@@ -189,6 +260,7 @@ async def get_token(req: TokenRequest):
token = create_access_token({"sub": req.email})
return {"email": req.email, "access_token": token, "token_type": "bearer"}
@app.post("/config/dump")
async def config_dump(raw: RawCode):
try:
@@ -197,18 +269,164 @@ async def config_dump(raw: RawCode):
raise HTTPException(400, str(e))
@app.get("/md/{url:path}")
@app.post("/md")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("md")
async def get_markdown(
request: Request,
url: str,
f: FilterType = FilterType.FIT,
q: Optional[str] = None,
c: str = "0",
body: MarkdownRequest,
_td: Dict = Depends(token_dep),
):
md = await handle_markdown_request(url, f, q, c, config)
return PlainTextResponse(md)
if not body.url.startswith(("http://", "https://")):
raise HTTPException(400, "URL must be absolute and start with http/https")
markdown = await handle_markdown_request(
body.url, body.f, body.q, body.c, config
)
return JSONResponse({
"url": body.url,
"filter": body.f,
"query": body.q,
"cache": body.c,
"markdown": markdown,
"success": True
})
@app.post("/html")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("html")
async def generate_html(
    request: Request,
    body: HTMLRequest,
    _td: Dict = Depends(token_dep),
):
    """
    Crawls the URL, preprocesses the raw HTML for schema extraction, and returns the processed HTML.
    Use when you need sanitized HTML structures for building schemas or further processing.
    """
    run_cfg = CrawlerRunConfig()
    target_url = body.url
    # A fresh crawler with default browser settings is opened per request.
    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
        crawl_results = await crawler.arun(url=target_url, config=run_cfg)
    page_html = crawl_results[0].html
    # Imported lazily, only once a crawl result is in hand.
    from crawl4ai.utils import preprocess_html_for_schema
    payload = {
        "html": preprocess_html_for_schema(page_html),
        "url": target_url,
        "success": True,
    }
    return JSONResponse(payload)
# Screenshot endpoint
@app.post("/screenshot")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("screenshot")
async def generate_screenshot(
    request: Request,
    body: ScreenshotRequest,
    _td: Dict = Depends(token_dep),
):
    """
    Capture a full-page PNG screenshot of the specified URL, waiting an optional delay before capture.
    Use when you need an image snapshot of the rendered page. It is recommended to provide an output path to save the screenshot.
    Then in the result, instead of the screenshot, you will get a path to the saved file.
    """
    cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
        results = await crawler.arun(url=body.url, config=cfg)
    screenshot_data = results[0].screenshot
    # CrawlResult.screenshot is Optional: without this guard a missing capture
    # either crashes b64decode with a TypeError or is reported as success.
    if not screenshot_data:
        raise HTTPException(500, "Screenshot capture failed: no image data returned")
    if body.output_path:
        # NOTE(review): output_path is client-controlled and is written on the
        # server's filesystem as-is; consider confining it to a dedicated
        # output directory.
        abs_path = os.path.abspath(body.output_path)
        os.makedirs(os.path.dirname(abs_path), exist_ok=True)
        # The crawler hands the image back base64-encoded; store raw bytes.
        with open(abs_path, "wb") as f:
            f.write(base64.b64decode(screenshot_data))
        return {"success": True, "path": abs_path}
    return {"success": True, "screenshot": screenshot_data}
# PDF endpoint
@app.post("/pdf")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("pdf")
async def generate_pdf(
    request: Request,
    body: PDFRequest,
    _td: Dict = Depends(token_dep),
):
    """
    Generate a PDF document of the specified URL.
    Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF.
    Then in the result, instead of the PDF, you will get a path to the saved file.
    """
    cfg = CrawlerRunConfig(pdf=True)
    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
        results = await crawler.arun(url=body.url, config=cfg)
    pdf_data = results[0].pdf
    # CrawlResult.pdf is Optional: without this guard a missing document
    # crashes f.write / b64encode with a TypeError (an opaque 500).
    if not pdf_data:
        raise HTTPException(500, "PDF generation failed: no PDF data returned")
    if body.output_path:
        # NOTE(review): output_path is client-controlled and is written on the
        # server's filesystem as-is; consider confining it to a dedicated
        # output directory.
        abs_path = os.path.abspath(body.output_path)
        os.makedirs(os.path.dirname(abs_path), exist_ok=True)
        with open(abs_path, "wb") as f:
            f.write(pdf_data)
        return {"success": True, "path": abs_path}
    # Raw bytes are not JSON-serialisable, so inline responses carry base64 text.
    return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
@app.post("/execute_js")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("execute_js")
async def execute_js(
    request: Request,
    body: JSEndpointRequest,
    _td: Dict = Depends(token_dep),
):
    """
    Execute a sequence of JavaScript snippets on the specified URL.
    Return the full CrawlResult JSON (first result).
    Use this when you need to interact with dynamic pages using JS.
    REMEMBER: Scripts accept a list of separated JS snippets to execute and execute them in order.
    IMPORTANT: Each script should be an expression that returns a value. It can be an IIFE or an async function. You can think of it as such.
    Your script will replace '{script}' and execute in the browser context. So provide either an IIFE or a sync/async function that returns a value.
    Return Format:
    - The return result is an instance of CrawlResult, so you have access to markdown, links, and other stuff. If this is enough, you don't need to call again for other endpoints.
    ```python
    class CrawlResult(BaseModel):
    url: str
    html: str
    success: bool
    cleaned_html: Optional[str] = None
    media: Dict[str, List[Dict]] = {}
    links: Dict[str, List[Dict]] = {}
    downloaded_files: Optional[List[str]] = None
    js_execution_result: Optional[Dict[str, Any]] = None
    screenshot: Optional[str] = None
    pdf: Optional[bytes] = None
    mhtml: Optional[str] = None
    _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
    extracted_content: Optional[str] = None
    metadata: Optional[dict] = None
    error_message: Optional[str] = None
    session_id: Optional[str] = None
    response_headers: Optional[dict] = None
    status_code: Optional[int] = None
    ssl_certificate: Optional[SSLCertificate] = None
    dispatch_result: Optional[DispatchResult] = None
    redirected_url: Optional[str] = None
    network_requests: Optional[List[Dict[str, Any]]] = None
    console_messages: Optional[List[Dict[str, Any]]] = None
    class MarkdownGenerationResult(BaseModel):
    raw_markdown: str
    markdown_with_citations: str
    references_markdown: str
    fit_markdown: Optional[str] = None
    fit_html: Optional[str] = None
    ```
    """
    run_cfg = CrawlerRunConfig(js_code=body.scripts)
    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
        crawl_results = await crawler.arun(url=body.url, config=run_cfg)
    # Only the first CrawlResult is returned, dumped to a plain dict.
    first_result = crawl_results[0]
    return JSONResponse(first_result.model_dump())
@app.get("/llm/{url:path}")
async def llm_endpoint(
@@ -224,27 +442,35 @@ async def llm_endpoint(
answer = await handle_llm_qa(url, q, config)
return JSONResponse({"answer": answer})
@app.get("/schema")
async def get_schema():
    # Returns the dumps of default-constructed browser and crawler configs.
    from crawl4ai import BrowserConfig, CrawlerRunConfig
    browser_defaults = BrowserConfig().dump()
    crawler_defaults = CrawlerRunConfig().dump()
    return {"browser": browser_defaults, "crawler": crawler_defaults}
@app.get(config["observability"]["health_check"]["endpoint"])
async def health():
    # Health payload: fixed status, current epoch time, module __version__.
    payload = {
        "status": "ok",
        "timestamp": time.time(),
        "version": __version__,
    }
    return payload
@app.get(config["observability"]["prometheus"]["endpoint"])
async def metrics():
    # Redirects to the configured Prometheus endpoint path — the same config
    # value this route itself is registered under.
    prometheus_path = config["observability"]["prometheus"]["endpoint"]
    return RedirectResponse(prometheus_path)
@app.post("/crawl")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("crawl")
async def crawl(
request: Request,
crawl_request: CrawlRequest,
_td: Dict = Depends(token_dep),
):
"""
Crawl a list of URLs and return the results as JSON.
"""
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
res = await handle_crawl_request(
@@ -255,6 +481,7 @@ async def crawl(
)
return JSONResponse(res)
@app.post("/crawl/stream")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl_stream(
@@ -280,6 +507,133 @@ async def crawl_stream(
},
)
def chunk_code_functions(code_md: str) -> List[str]:
    """Extract each top-level function/class from markdown code blocks per file.

    The input is expected to contain sections of the form::

        ## File: <path>
        ```py
        <python source>
        ```

    Each top-level ``def`` / ``async def`` / ``class`` (with its decorators)
    becomes one chunk, prefixed with a ``# File: <path>`` line.

    Code blocks that are not valid Python are skipped rather than aborting
    the whole extraction, so a single bad block cannot break the search.

    Args:
        code_md: Markdown text holding the per-file code blocks.

    Returns:
        Chunks in document order; empty when no block matches.
    """
    pattern = re.compile(
        # match "## File: <path>" then a ```py fence, then capture until the closing ```
        r'##\s*File:\s*(?P<path>.+?)\s*?\r?\n'   # file header
        r'```py\s*?\r?\n'                        # opening fence
        r'(?P<code>.*?)(?=\r?\n```)',            # code block
        re.DOTALL
    )
    chunks: List[str] = []
    for m in pattern.finditer(code_md):
        file_path = m.group("path").strip()
        code_blk = m.group("code")
        try:
            tree = ast.parse(code_blk)
        except (SyntaxError, ValueError):
            # Unparsable block (syntax error, or null bytes on older Pythons):
            # skip it and keep going with the remaining files.
            continue
        lines = code_blk.splitlines()
        for node in tree.body:
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                continue
            # On Python 3.8+ ``node.lineno`` is the ``def``/``class`` line, so
            # include any decorator lines that sit above it.
            first_line = min([node.lineno] + [dec.lineno for dec in node.decorator_list])
            start = first_line - 1
            end = getattr(node, "end_lineno", node.lineno)
            snippet = "\n".join(lines[start:end])
            chunks.append(f"# File: {file_path}\n{snippet}")
    return chunks
def chunk_doc_sections(doc: str) -> List[str]:
    """Split a markdown document into sections, one per ATX heading.

    A new section starts at every line beginning with one to six ``#``
    followed by whitespace. Text before the first heading forms its own
    section. Line endings are preserved, so joining the result reproduces
    the input.

    Lines inside triple-backtick fenced code blocks are never treated as
    headings, so ``# comment`` lines in embedded code do not split a section.
    An unclosed fence therefore keeps the rest of the document in one section.

    Args:
        doc: Markdown text.

    Returns:
        Sections in document order; empty for empty input.
    """
    heading = re.compile(r"^#{1,6}\s")
    sections: List[str] = []
    current: List[str] = []
    in_fence = False
    for line in doc.splitlines(keepends=True):
        if line.lstrip().startswith("```"):
            # Opening or closing fence: flip state, keep the line in place.
            in_fence = not in_fence
        elif not in_fence and heading.match(line):
            if current:
                sections.append("".join(current))
            current = [line]
            continue
        current.append(line)
    if current:
        sections.append("".join(current))
    return sections
@app.get("/ask")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("ask")
async def get_context(
    request: Request,
    _td: Dict = Depends(token_dep),
    context_type: str = Query("all", regex="^(code|doc|all)$"),
    query: Optional[str] = Query(None, description="search query to filter chunks"),
    score_ratio: float = Query(0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"),
    max_results: int = Query(20, ge=1, description="absolute cap on returned chunks"),
):
    """
    This endpoint is designed for any questions about the Crawl4ai library. It returns extensive markdown context about Crawl4ai as JSON.
    You can use this as a context for any AI assistant. Use this endpoint for AI assistants to retrieve library context for decision making or code generation tasks.
    It is always BEST practice to provide a query to filter the context. Otherwise the length of the response will be very long.
    Parameters:
    - context_type: Specify "code" for code context, "doc" for documentation context, or "all" for both.
    - query: RECOMMENDED search query to filter paragraphs using BM25. You can leave this empty to get all the context.
    - score_ratio: Minimum score as a fraction of the maximum score for filtering results.
    - max_results: Maximum number of results to return. Default is 20.
    Returns:
    - JSON response with the requested context.
    - If "code" is specified, returns the code context.
    - If "doc" is specified, returns the documentation context.
    - If "all" is specified, returns both code and documentation contexts.
    """
    # load contexts
    base = os.path.dirname(__file__)
    code_path = os.path.join(base, "c4ai-code-context.md")
    doc_path = os.path.join(base, "c4ai-doc-context.md")
    if not os.path.exists(code_path) or not os.path.exists(doc_path):
        raise HTTPException(404, "Context files not found")
    # Explicit encoding: the locale default may not be UTF-8 inside a
    # container, which would fail on non-ASCII characters in the context files.
    with open(code_path, "r", encoding="utf-8") as f:
        code_content = f.read()
    with open(doc_path, "r", encoding="utf-8") as f:
        doc_content = f.read()

    # if no query, just return raw contexts
    if not query:
        if context_type == "code":
            return JSONResponse({"code_context": code_content})
        if context_type == "doc":
            return JSONResponse({"doc_context": doc_content})
        return JSONResponse({
            "code_context": code_content,
            "doc_context": doc_content,
        })

    tokens = query.split()
    results: Dict[str, List[Dict[str, object]]] = {}

    # code BM25 over functions/classes
    if context_type in ("code", "all"):
        code_chunks = chunk_code_functions(code_content)
        picked = []
        # BM25Okapi divides by the corpus size, so never build it on an
        # empty chunk list.
        if code_chunks:
            bm25 = BM25Okapi([c.split() for c in code_chunks])
            scores = bm25.get_scores(tokens)
            cutoff = float(scores.max()) * score_ratio
            # float() turns numpy scalars into plain floats for the JSON payload.
            picked = [(c, float(s)) for c, s in zip(code_chunks, scores) if s >= cutoff]
            picked = sorted(picked, key=lambda x: x[1], reverse=True)[:max_results]
        results["code_results"] = [{"text": c, "score": s} for c, s in picked]

    # doc BM25 over markdown sections
    if context_type in ("doc", "all"):
        sections = chunk_doc_sections(doc_content)
        doc_hits: List[Dict[str, object]] = []
        if sections:
            bm25d = BM25Okapi([sec.split() for sec in sections])
            scores_d = bm25d.get_scores(tokens)
            cutoff_d = float(scores_d.max()) * score_ratio
            idxs = [i for i, s in enumerate(scores_d) if s >= cutoff_d]
            # Include the sections right before and after each hit for context;
            # results stay in document order and are capped at max_results.
            neighbors = {i for idx in idxs for i in (idx - 1, idx, idx + 1)}
            valid = [i for i in sorted(neighbors) if 0 <= i < len(sections)]
            valid = valid[:max_results]
            doc_hits = [
                {"text": sections[i], "score": float(scores_d[i])} for i in valid
            ]
        results["doc_results"] = doc_hits

    return JSONResponse(results)
# attach MCP layer (adds /mcp/ws, /mcp/sse, /mcp/schema)
attach_mcp(
app,
base_url=f"http://{config['app']['host']}:{config['app']['port']}"
)
# ────────────────────────── cli ──────────────────────────────
if __name__ == "__main__":
import uvicorn