crawl4ai/deploy/docker/server.py
AHMET YILMAZ 3877335d89 Profiling/monitoring: Add interactive monitoring dashboard and integration tests for monitoring endpoints
- Implemented an interactive monitoring dashboard in `demo_monitoring_dashboard.py` for real-time statistics, profiling session management, and system resource monitoring.
- Created a quick test script `test_monitoring_quick.py` to verify the functionality of monitoring endpoints.
- Developed comprehensive integration tests in `test_monitoring_endpoints.py` covering health checks, statistics, profiling sessions, and real-time streaming.
- Added error handling and user-friendly output for better usability in the dashboard.
2025-10-16 16:48:13 +08:00

# ───────────────────────── server.py ─────────────────────────
"""
Crawl4AI FastAPI entrypoint
• Browser pool + global page cap
• Rate limiting, security, metrics
• /crawl, /crawl/stream, /md, /llm endpoints
"""
# ── stdlib & 3rd-party imports ──────────────────────────────
import ast
import asyncio
import base64
import os
import pathlib
import re
import sys
import time
from contextlib import asynccontextmanager
from typing import Any, AsyncGenerator, Dict, List, Optional
from urllib.parse import urlparse

from fastapi import Depends, FastAPI, HTTPException, Path, Query, Request
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from fastapi.responses import (
    FileResponse,
    JSONResponse,
    PlainTextResponse,
    RedirectResponse,
    StreamingResponse,
)
from fastapi.staticfiles import StaticFiles
from prometheus_fastapi_instrumentator import Instrumentator
from pydantic import BaseModel, Field
from rank_bm25 import BM25Okapi
from redis import asyncio as aioredis
from slowapi import Limiter
from slowapi.util import get_remote_address

import crawl4ai as _c4
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LinkPreviewConfig
from crawl4ai.async_dispatcher import BaseDispatcher

# ── local application imports ───────────────────────────────
from api import (
    handle_crawl_request,
    handle_http_crawl_request,
    handle_http_stream_crawl_request,
    handle_llm_qa,
    handle_markdown_request,
    handle_seed,
    handle_stream_crawl_request,
    handle_url_discovery,
    stream_results,
)
from auth import TokenRequest, create_access_token, get_token_dependency
from crawler_pool import close_all, get_crawler, janitor
from job import init_job_router
from mcp_bridge import attach_mcp, mcp_resource, mcp_template, mcp_tool
from routers import adaptive, dispatchers, scripts, monitoring
from schemas import (
    CrawlRequest,
    CrawlRequestWithHooks,
    HTMLRequest,
    HTTPCrawlRequest,
    HTTPCrawlRequestWithHooks,
    JSEndpointRequest,
    LinkAnalysisRequest,
    MarkdownRequest,
    PDFRequest,
    RawCode,
    ScreenshotRequest,
    SeedRequest,
    URLDiscoveryRequest,
)
from utils import (
    DEFAULT_DISPATCHER_TYPE,
    FilterType,
    create_dispatcher,
    load_config,
    setup_logging,
    verify_email_domain,
)
# ── sys.path tweak so local modules resolve when run directly ─
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
# ────────────────── configuration / logging ──────────────────
config = load_config()
setup_logging(config)
__version__ = "0.5.1-d1"
# ── global page semaphore (hard cap) ─────────────────────────
MAX_PAGES = config["crawler"]["pool"].get("max_pages", 30)
GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)
# import logging
# page_log = logging.getLogger("page_cap")
# orig_arun = AsyncWebCrawler.arun
# async def capped_arun(self, *a, **kw):
# await GLOBAL_SEM.acquire() # ← take slot
# try:
# in_flight = MAX_PAGES - GLOBAL_SEM._value # used permits
# page_log.info("🕸️ pages_in_flight=%s / %s", in_flight, MAX_PAGES)
# return await orig_arun(self, *a, **kw)
# finally:
# GLOBAL_SEM.release() # ← free slot
# Monkey-patch AsyncWebCrawler.arun so every page load takes a slot from
# GLOBAL_SEM, enforcing the MAX_PAGES hard cap across all crawler instances.
orig_arun = AsyncWebCrawler.arun

async def capped_arun(self, *a, **kw):
    async with GLOBAL_SEM:
        return await orig_arun(self, *a, **kw)

AsyncWebCrawler.arun = capped_arun
# ───────────────────── FastAPI lifespan ──────────────────────
@asynccontextmanager
async def lifespan(_: FastAPI):
import logging
logger = logging.getLogger(__name__)
# Initialize crawler pool
await get_crawler(
BrowserConfig(
extra_args=config["crawler"]["browser"].get("extra_args", []),
**config["crawler"]["browser"].get("kwargs", {}),
)
) # warmup
# Initialize dispatchers
try:
app.state.dispatchers: Dict[str, BaseDispatcher] = {}
app.state.default_dispatcher_type = DEFAULT_DISPATCHER_TYPE
# Pre-create both dispatcher types
app.state.dispatchers["memory_adaptive"] = create_dispatcher("memory_adaptive")
app.state.dispatchers["semaphore"] = create_dispatcher("semaphore")
logger.info(f"✓ Initialized dispatchers: {list(app.state.dispatchers.keys())}")
logger.info(f"✓ Default dispatcher: {app.state.default_dispatcher_type}")
except Exception as e:
logger.error(f"✗ Failed to initialize dispatchers: {e}")
raise
# Start background tasks
app.state.janitor = asyncio.create_task(janitor()) # idle GC
yield
# Cleanup
app.state.janitor.cancel()
app.state.dispatchers.clear()
logger.info("✓ Dispatchers cleaned up")
await close_all()
# ───────────────────── FastAPI instance ──────────────────────
app = FastAPI(
title=config["app"]["title"],
version=config["app"]["version"],
lifespan=lifespan,
)
# ── static playground ──────────────────────────────────────
STATIC_DIR = pathlib.Path(__file__).parent / "static" / "playground"
if not STATIC_DIR.exists():
raise RuntimeError(f"Playground assets not found at {STATIC_DIR}")
app.mount(
"/playground",
StaticFiles(directory=STATIC_DIR, html=True),
name="play",
)
@app.get("/")
async def root():
return RedirectResponse("/playground")
# ─────────────────── infra / middleware ─────────────────────
redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost"))
limiter = Limiter(
key_func=get_remote_address,
default_limits=[config["rate_limiting"]["default_limit"]],
storage_uri=config["rate_limiting"]["storage_uri"],
)
def _setup_security(app_: FastAPI):
sec = config["security"]
if not sec["enabled"]:
return
if sec.get("https_redirect"):
app_.add_middleware(HTTPSRedirectMiddleware)
if sec.get("trusted_hosts", []) != ["*"]:
app_.add_middleware(TrustedHostMiddleware, allowed_hosts=sec["trusted_hosts"])
_setup_security(app)
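# Illustrative config shape consumed by _setup_security() and the headers
# middleware below (a sketch inferred from the keys read here, not the
# canonical config.yml):
#   security:
#     enabled: true
#     https_redirect: false
#     trusted_hosts: ["*"]
#     headers:
#       X-Frame-Options: "DENY"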
if config["observability"]["prometheus"]["enabled"]:
Instrumentator().instrument(app).expose(app)
token_dep = get_token_dependency(config)
@app.middleware("http")
async def add_security_headers(request: Request, call_next):
resp = await call_next(request)
if config["security"]["enabled"]:
resp.headers.update(config["security"]["headers"])
return resp
# ───────────────── safe config-dump helper ─────────────────
ALLOWED_TYPES = {
"CrawlerRunConfig": CrawlerRunConfig,
"BrowserConfig": BrowserConfig,
}
def _safe_eval_config(expr: str) -> dict:
"""
Accept exactly one top-level call to CrawlerRunConfig(...) or BrowserConfig(...).
Whatever is inside the parentheses is fine *except* further function calls
(so no __import__('os') stuff). All public names from crawl4ai are available
when we eval.
"""
tree = ast.parse(expr, mode="eval")
# must be a single call
if not isinstance(tree.body, ast.Call):
raise ValueError("Expression must be a single constructor call")
call = tree.body
if not (
isinstance(call.func, ast.Name)
and call.func.id in {"CrawlerRunConfig", "BrowserConfig"}
):
raise ValueError("Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")
# forbid nested calls to keep the surface tiny
for node in ast.walk(call):
if isinstance(node, ast.Call) and node is not call:
raise ValueError("Nested function calls are not permitted")
# expose everything that crawl4ai exports, nothing else
safe_env = {
name: getattr(_c4, name) for name in dir(_c4) if not name.startswith("_")
}
obj = eval(compile(tree, "<config>", "eval"), {"__builtins__": {}}, safe_env)
return obj.dump()
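# Illustrative behaviour (sketch):
#   _safe_eval_config("CrawlerRunConfig(screenshot=True)")
#       -> dict produced by CrawlerRunConfig(screenshot=True).dump()
#   _safe_eval_config("__import__('os').system('ls')")
#       -> raises ValueError (only the two constructors are accepted)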
# ── job router ──────────────────────────────────────────────
app.include_router(init_job_router(redis, config, token_dep))
app.include_router(adaptive.router)
app.include_router(dispatchers.router)
app.include_router(scripts.router)
app.include_router(monitoring.router)
# ──────────────────────── Endpoints ──────────────────────────
@app.post("/token",
summary="Get Authentication Token",
description="Generate a JWT authentication token for API access using your email address.",
response_description="JWT token with expiration time",
tags=["Authentication"]
)
async def get_token(req: TokenRequest):
"""
Generate an authentication token for API access.
This endpoint creates a JWT token that must be included in the Authorization
header of subsequent requests. Tokens are valid for the duration specified
in server configuration (default: 60 minutes).
**Example Request:**
```json
{
"email": "user@example.com"
}
```
**Example Response:**
```json
{
"email": "user@example.com",
"access_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
"token_type": "bearer"
}
```
**Usage:**
```python
import requests
response = requests.post(
"http://localhost:11235/token",
json={"email": "user@example.com"}
)
token = response.json()["access_token"]
# Use token in subsequent requests
headers = {"Authorization": f"Bearer {token}"}
```
**Notes:**
- Email domain must be in the allowed list (configurable via config.yml)
- Tokens expire after configured duration
- Store tokens securely and refresh before expiration
"""
if not verify_email_domain(req.email):
raise HTTPException(400, "Invalid email domain")
token = create_access_token({"sub": req.email})
return {"email": req.email, "access_token": token, "token_type": "bearer"}
@app.post("/config/dump",
summary="Validate and Dump Configuration",
description="Validate CrawlerRunConfig or BrowserConfig and return serialized version.",
response_description="Serialized configuration dictionary",
tags=["Utility"]
)
async def config_dump(raw: RawCode):
"""
Validate and serialize crawler or browser configuration.
This endpoint accepts Python code containing a CrawlerRunConfig or BrowserConfig
constructor and returns the serialized configuration dict. Useful for validating
configurations before use.
**Example Request:**
```json
{
"code": "CrawlerRunConfig(word_count_threshold=10, screenshot=True)"
}
```
**Example Response:**
```json
{
"word_count_threshold": 10,
"screenshot": true,
"wait_until": "networkidle",
...
}
```
**Security:**
- Only CrawlerRunConfig() and BrowserConfig() constructors allowed
- No nested function calls permitted
- Prevents code injection attempts
"""
try:
return JSONResponse(_safe_eval_config(raw.code.strip()))
except Exception as e:
raise HTTPException(400, str(e))
@app.post("/seed",
summary="URL Discovery and Seeding",
description="Discover and extract crawlable URLs from a website for subsequent crawling.",
response_description="List of discovered URLs with count",
tags=["Core Crawling"]
)
async def seed_url(request: SeedRequest):
"""
Discover and seed URLs from a website.
This endpoint crawls a starting URL and discovers all available links based on
specified filters. Useful for finding URLs to crawl before running a full crawl.
**Parameters:**
- **url**: Starting URL to discover links from
- **config**: Seeding configuration
- **max_urls**: Maximum number of URLs to return (default: 100)
- **filter_type**: Filter strategy for URLs
- `all`: Include all discovered URLs
- `domain`: Only URLs from same domain
- `subdomain`: Only URLs from same subdomain
- **exclude_external**: Exclude external links (default: false)
**Example Request:**
```json
{
"url": "https://www.nbcnews.com",
"config": {
"max_urls": 20,
"filter_type": "domain",
"exclude_external": true
}
}
```
**Example Response:**
```json
{
"seed_url": [
"https://www.nbcnews.com/news/page1",
"https://www.nbcnews.com/news/page2",
"https://www.nbcnews.com/about"
],
"count": 3
}
```
**Usage:**
```python
response = requests.post(
"http://localhost:11235/seed",
headers={"Authorization": f"Bearer {token}"},
json={
"url": "https://www.nbcnews.com",
"config": {"max_urls": 20, "filter_type": "domain"}
}
)
urls = response.json()["seed_url"]
```
**Notes:**
- Returns direct list of URLs in `seed_url` field (not nested dict)
- Empty list returned if no URLs found
- Respects robots.txt if configured
"""
try:
# Extract the domain (e.g., "docs.crawl4ai.com") from the full URL
domain = urlparse(request.url).netloc
if not domain:
raise HTTPException(
status_code=400,
detail="Invalid URL provided. Could not extract domain.",
)
res = await handle_seed(request.url, request.config)
return JSONResponse({"seed_url": res, "count": len(res)})
except Exception as e:
print(f"❌ Error in seed_url: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/urls/discover",
summary="URL Discovery and Seeding",
description="Discover and extract crawlable URLs from a domain using AsyncUrlSeeder functionality.",
response_description="List of discovered URL objects with metadata",
tags=["Core Crawling"]
)
async def discover_urls(request: URLDiscoveryRequest):
"""
Discover URLs from a domain using AsyncUrlSeeder functionality.
This endpoint allows users to find relevant URLs from a domain before
committing to a full crawl. It supports various discovery sources like
sitemaps and Common Crawl, with filtering and scoring capabilities.
**Parameters:**
- **domain**: Domain to discover URLs from (e.g., "example.com")
- **seeding_config**: Configuration object mirroring SeedingConfig parameters
- **source**: Discovery source(s) - "sitemap", "cc", or "sitemap+cc" (default: "sitemap+cc")
- **pattern**: URL pattern filter using glob-style wildcards (default: "*")
- **live_check**: Whether to verify URL liveness with HEAD requests (default: false)
- **extract_head**: Whether to fetch and parse <head> metadata (default: false)
- **max_urls**: Maximum URLs to discover, -1 for no limit (default: -1)
- **concurrency**: Maximum concurrent requests (default: 1000)
- **hits_per_sec**: Rate limit in requests per second (default: 5)
- **force**: Bypass internal cache and re-fetch URLs (default: false)
- **query**: Search query for BM25 relevance scoring (optional)
- **scoring_method**: Scoring method when query provided (default: "bm25")
- **score_threshold**: Minimum score threshold for filtering (optional)
- **filter_nonsense_urls**: Filter out nonsense URLs (default: true)
**Example Request:**
```json
{
"domain": "docs.crawl4ai.com",
"seeding_config": {
"source": "sitemap",
"pattern": "*/docs/*",
"extract_head": true,
"max_urls": 50,
"query": "API documentation"
}
}
```
**Example Response:**
```json
[
{
"url": "https://docs.crawl4ai.com/api/getting-started",
"status": "valid",
"head_data": {
"title": "Getting Started - Crawl4AI API",
"description": "Learn how to get started with Crawl4AI API"
},
"score": 0.85
}
]
```
**Usage:**
```python
response = requests.post(
"http://localhost:11235/urls/discover",
headers={"Authorization": f"Bearer {token}"},
json={
"domain": "docs.crawl4ai.com",
"seeding_config": {
"source": "sitemap+cc",
"extract_head": true,
"max_urls": 100
}
}
)
urls = response.json()
```
**Notes:**
- Returns direct list of URL objects with metadata if requested
- Empty list returned if no URLs found
- Supports BM25 relevance scoring when query is provided
- Can combine multiple sources for maximum coverage
"""
try:
res = await handle_url_discovery(request.domain, request.seeding_config)
return JSONResponse(res)
except Exception as e:
print(f"❌ Error in discover_urls: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/md",
summary="Extract Markdown",
description="Extract clean markdown content from a URL or raw HTML.",
response_description="Markdown content with metadata",
tags=["Content Extraction"]
)
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("md")
async def get_markdown(
request: Request,
body: MarkdownRequest,
_td: Dict = Depends(token_dep),
):
"""
Extract clean markdown content from a URL.
This endpoint fetches a page and converts it to clean, readable markdown format.
Useful for LLM processing, content analysis, or markdown export.
**Request Body:**
```json
{
"url": "https://example.com",
"f": "markdown",
"q": "",
"c": true,
"provider": "openai",
"temperature": 0.0
}
```
**Parameters:**
- `url`: Target URL (or raw:// for raw HTML)
- `f`: Output format ("markdown", "fit_markdown")
- `q`: Query for filtered extraction
- `c`: Enable caching (default: true)
- `provider`: LLM provider for enhanced extraction
- `temperature`: LLM temperature setting
- `base_url`: Custom LLM API base URL
**Response:**
```json
{
"url": "https://example.com",
"markdown": "# Example Domain\\n\\nThis domain is for use...",
"success": true,
"filter": "markdown",
"query": "",
"cache": true
}
```
**Usage:**
```python
response = requests.post(
"http://localhost:11235/md",
headers={"Authorization": f"Bearer {token}"},
json={"url": "https://example.com"}
)
markdown = response.json()["markdown"]
print(markdown)
```
**Notes:**
- Supports raw HTML input with `raw://` prefix
- Returns clean, structured markdown
- LLM-friendly format for AI processing
- Caching improves performance for repeated requests
"""
if not body.url.startswith(("http://", "https://")) and not body.url.startswith(
("raw:", "raw://")
):
raise HTTPException(
400,
"Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)",
)
markdown = await handle_markdown_request(
body.url,
body.f,
body.q,
body.c,
config,
body.provider,
body.temperature,
body.base_url,
)
return JSONResponse(
{
"url": body.url,
"filter": body.f,
"query": body.q,
"cache": body.c,
"markdown": markdown,
"success": True,
}
)
@app.post("/html",
summary="Extract Processed HTML",
description="Crawl a URL and return preprocessed HTML suitable for schema extraction.",
response_description="Processed HTML content",
tags=["Content Extraction"]
)
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("html")
async def generate_html(
request: Request,
body: HTMLRequest,
_td: Dict = Depends(token_dep),
):
"""
Crawl a URL and return sanitized, preprocessed HTML.
This endpoint crawls a page and returns processed HTML that's been cleaned
and prepared for schema extraction or further processing. The HTML is
sanitized to remove scripts, styles, and other non-content elements.
**Request Body:**
```json
{
"url": "https://example.com"
}
```
**Response:**
```json
{
"url": "https://example.com",
"html": "<html><body><h1>Example Domain</h1>...</body></html>",
"success": true
}
```
**Usage:**
```python
response = requests.post(
"http://localhost:11235/html",
headers={"Authorization": f"Bearer {token}"},
json={"url": "https://example.com"}
)
html = response.json()["html"]
```
**Notes:**
- HTML is preprocessed for schema extraction
- Scripts, styles, and non-content elements removed
- Preserves semantic structure
- Useful for building data extraction schemas
"""
cfg = CrawlerRunConfig()
try:
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
# Check if the crawl was successful
if not results[0].success:
raise HTTPException(
status_code=500, detail=results[0].error_message or "Crawl failed"
)
raw_html = results[0].html
from crawl4ai.utils import preprocess_html_for_schema
processed_html = preprocess_html_for_schema(raw_html)
return JSONResponse({"html": processed_html, "url": body.url, "success": True})
except Exception as e:
# Log and raise as HTTP 500 for other exceptions
raise HTTPException(status_code=500, detail=str(e))
# Screenshot endpoint
@app.post("/screenshot",
summary="Capture Screenshot",
description="Capture a full-page PNG screenshot of a URL.",
response_description="Screenshot data (base64 or file path)",
tags=["Content Extraction"]
)
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("screenshot")
async def generate_screenshot(
request: Request,
body: ScreenshotRequest,
_td: Dict = Depends(token_dep),
):
"""
Capture a full-page PNG screenshot of a URL.
This endpoint navigates to a URL and captures a full-page screenshot.
Optionally wait for page content to load before capturing.
**Request Body:**
```json
{
"url": "https://example.com",
"screenshot_wait_for": 2.0,
"output_path": "/path/to/screenshot.png"
}
```
**Parameters:**
- `url`: Target URL to screenshot
- `screenshot_wait_for`: Seconds to wait before capture (default: 0)
- `output_path`: Optional path to save screenshot file
**Response (with output_path):**
```json
{
"url": "https://example.com",
"screenshot": "/absolute/path/to/screenshot.png",
"success": true
}
```
**Response (without output_path):**
```json
{
"url": "https://example.com",
"screenshot": "iVBORw0KGgoAAAANS...",
"success": true
}
```
**Usage:**
```python
# Save to file
response = requests.post(
"http://localhost:11235/screenshot",
headers={"Authorization": f"Bearer {token}"},
json={
"url": "https://example.com",
"output_path": "./screenshot.png"
}
)
print(response.json()["screenshot"]) # File path
# Get base64 data
response = requests.post(
"http://localhost:11235/screenshot",
headers={"Authorization": f"Bearer {token}"},
json={"url": "https://example.com"}
)
import base64
screenshot_data = base64.b64decode(response.json()["screenshot"])
with open("screenshot.png", "wb") as f:
f.write(screenshot_data)
```
**Notes:**
- Captures full page (scrolls to bottom)
- Returns base64 PNG data if no output_path specified
- Saves to file and returns path if output_path provided
- Wait time helps ensure dynamic content is loaded
"""
try:
cfg = CrawlerRunConfig(
screenshot=True, screenshot_wait_for=body.screenshot_wait_for
)
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
raise HTTPException(
status_code=500, detail=results[0].error_message or "Crawl failed"
)
screenshot_data = results[0].screenshot
if body.output_path:
abs_path = os.path.abspath(body.output_path)
os.makedirs(os.path.dirname(abs_path), exist_ok=True)
with open(abs_path, "wb") as f:
f.write(base64.b64decode(screenshot_data))
return {"success": True, "path": abs_path}
return {"success": True, "screenshot": screenshot_data}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
# PDF endpoint
@app.post("/pdf",
summary="Generate PDF",
description="Generate a PDF document from a URL.",
response_description="PDF data (base64 or file path)",
tags=["Content Extraction"]
)
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("pdf")
async def generate_pdf(
request: Request,
body: PDFRequest,
_td: Dict = Depends(token_dep),
):
"""
Generate a PDF document from a URL.
This endpoint navigates to a URL and generates a PDF document of the page.
Useful for archiving, printing, or offline viewing.
**Request Body:**
```json
{
"url": "https://example.com",
"output_path": "/path/to/document.pdf"
}
```
**Parameters:**
- `url`: Target URL to convert to PDF
- `output_path`: Optional path to save PDF file
**Response (with output_path):**
```json
{
"success": true,
"path": "/absolute/path/to/document.pdf"
}
```
**Response (without output_path):**
```json
{
"success": true,
"pdf": "JVBERi0xLjQKJeLjz9MKMy..."
}
```
**Usage:**
```python
# Save to file
response = requests.post(
"http://localhost:11235/pdf",
headers={"Authorization": f"Bearer {token}"},
json={
"url": "https://example.com",
"output_path": "./document.pdf"
}
)
print(response.json()["path"])
# Get base64 data
response = requests.post(
"http://localhost:11235/pdf",
headers={"Authorization": f"Bearer {token}"},
json={"url": "https://example.com"}
)
import base64
pdf_data = base64.b64decode(response.json()["pdf"])
with open("document.pdf", "wb") as f:
f.write(pdf_data)
```
**Notes:**
- Generates printable PDF format
- Returns base64 PDF data if no output_path specified
- Saves to file and returns path if output_path provided
- Preserves page layout and styling
"""
try:
cfg = CrawlerRunConfig(pdf=True)
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
raise HTTPException(
status_code=500, detail=results[0].error_message or "Crawl failed"
)
pdf_data = results[0].pdf
if body.output_path:
abs_path = os.path.abspath(body.output_path)
os.makedirs(os.path.dirname(abs_path), exist_ok=True)
with open(abs_path, "wb") as f:
f.write(pdf_data)
return {"success": True, "path": abs_path}
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/execute_js",
summary="Execute JavaScript",
description="Execute JavaScript code on a page and return the full crawl result.",
response_description="Complete CrawlResult with JS execution results",
tags=["Advanced"]
)
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("execute_js")
async def execute_js(
request: Request,
body: JSEndpointRequest,
_td: Dict = Depends(token_dep),
):
"""
Execute JavaScript code on a page and return the complete crawl result.
This endpoint navigates to a URL and executes custom JavaScript code in the
browser context. Each script must be an expression that returns a value.
**Request Body:**
```json
{
"url": "https://example.com",
"scripts": [
"document.title",
"(async () => { await new Promise(r => setTimeout(r, 1000)); return document.body.innerText; })()"
],
"wait_for": "css:.content"
}
```
**Parameters:**
- `url`: Target URL to execute scripts on
- `scripts`: List of JavaScript expressions to execute in order
- `wait_for`: Optional selector or condition to wait for
**Script Format:**
Each script should be an expression that returns a value:
- Simple expression: `"document.title"`
- IIFE: `"(() => { return window.location.href; })()"`
- Async IIFE: `"(async () => { await fetch('/api'); return 'done'; })()"`
**Response:**
Returns complete CrawlResult with:
```json
{
"url": "https://example.com",
"html": "<html>...",
"markdown": "# Page Content...",
"js_execution_result": {
"0": "Example Domain",
"1": "This domain is for use in..."
},
"links": {...},
"media": {...},
"success": true
}
```
**Usage:**
```python
response = requests.post(
"http://localhost:11235/execute_js",
headers={"Authorization": f"Bearer {token}"},
json={
"url": "https://example.com",
"scripts": [
"document.title",
"document.querySelectorAll('p').length"
]
}
)
result = response.json()
print(result["js_execution_result"]) # {"0": "Example Domain", "1": 2}
print(result["markdown"]) # Full markdown content
```
**Notes:**
- Scripts execute in order
- Each script must return a value
- Returns full CrawlResult (no need to call other endpoints)
- Use for dynamic content, button clicks, form submissions
- Access results via js_execution_result dictionary (indexed by position)
**Return Format:**
The result is a serialized CrawlResult, so you also get markdown, links, media, and the other fields shown below. If that covers your needs, there is no reason to call the other endpoints afterwards.
```python
class CrawlResult(BaseModel):
url: str
html: str
success: bool
cleaned_html: Optional[str] = None
media: Dict[str, List[Dict]] = {}
links: Dict[str, List[Dict]] = {}
downloaded_files: Optional[List[str]] = None
js_execution_result: Optional[Dict[str, Any]] = None
screenshot: Optional[str] = None
pdf: Optional[bytes] = None
mhtml: Optional[str] = None
_markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
error_message: Optional[str] = None
session_id: Optional[str] = None
response_headers: Optional[dict] = None
status_code: Optional[int] = None
ssl_certificate: Optional[SSLCertificate] = None
dispatch_result: Optional[DispatchResult] = None
redirected_url: Optional[str] = None
network_requests: Optional[List[Dict[str, Any]]] = None
console_messages: Optional[List[Dict[str, Any]]] = None
class MarkdownGenerationResult(BaseModel):
raw_markdown: str
markdown_with_citations: str
references_markdown: str
fit_markdown: Optional[str] = None
fit_html: Optional[str] = None
```
"""
try:
cfg = CrawlerRunConfig(js_code=body.scripts)
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
raise HTTPException(
status_code=500, detail=results[0].error_message or "Crawl failed"
)
# Return JSON-serializable dict of the first CrawlResult
data = results[0].model_dump()
return JSONResponse(data)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/links/analyze")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("links_analyze")
async def analyze_links(
request: Request,
body: LinkAnalysisRequest,
_td: Dict = Depends(token_dep),
):
"""
Analyze and score links on a webpage.
Returns a dictionary of links with their scores and metadata.
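**Usage (illustrative sketch; assumes a server on localhost:11235 and a valid token):**
```python
response = requests.post(
    "http://localhost:11235/links/analyze",
    headers={"Authorization": f"Bearer {token}"},
    json={"url": "https://example.com"},
)
links = response.json()  # e.g. {"internal": [...], "external": [...]} with scores/metadata
```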
"""
try:
# Create AsyncWebCrawler instance
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
# Deserialize config dict to LinkPreviewConfig, use default if not provided
link_preview_config = LinkPreviewConfig.from_dict(body.config) if body.config else LinkPreviewConfig()
# Create CrawlerRunConfig with link analysis settings
run_config = CrawlerRunConfig(
link_preview_config=link_preview_config,
score_links=True,
screenshot=False,
pdf=False,
extraction_strategy=None
)
# Execute the crawl
result = await crawler.arun(url=body.url, config=run_config)
# Check if crawl was successful
if not result.success:
raise HTTPException(
status_code=500,
detail=result.error_message or "Crawl failed"
)
# Extract and return the links dictionary
return JSONResponse(result.links)
except HTTPException:
# Re-raise HTTP exceptions
raise
except Exception as e:
# Handle any other exceptions
raise HTTPException(
status_code=500,
detail=f"Internal server error: {str(e)}"
)
@app.get("/llm/{url:path}",
summary="LLM Q&A",
description="Ask questions about a webpage using LLM.",
response_description="Answer from LLM based on page content",
tags=["Advanced"]
)
async def llm_endpoint(
request: Request,
url: str = Path(..., description="URL to analyze (can omit https://)"),
q: str = Query(..., description="Question to ask about the page"),
_td: Dict = Depends(token_dep),
):
"""
Ask questions about a webpage using an LLM.
This endpoint crawls a page and uses an LLM to answer questions about
the content. Useful for extracting specific information or insights.
**Request:**
```
GET /llm/example.com?q=What is this page about?
```
**Parameters:**
- `url`: Target URL (path parameter, https:// is optional)
- `q`: Question to ask (query parameter)
**Response:**
```json
{
"answer": "This page is the official documentation for Example Domain..."
}
```
**Usage:**
```python
import requests
from urllib.parse import quote
url = "example.com"
question = "What is this page about?"
response = requests.get(
f"http://localhost:11235/llm/{url}?q={quote(question)}",
headers={"Authorization": f"Bearer {token}"}
)
print(response.json()["answer"])
```
```bash
curl "http://localhost:11235/llm/example.com?q=What%20is%20this%20page%20about?" \\
-H "Authorization: Bearer YOUR_TOKEN"
```
**Notes:**
- Automatically crawls the page and extracts content
- Uses configured LLM to generate answers
- URL can omit https:// prefix
- URL-encode the query parameter
- Supports raw:// prefix for raw HTML
"""
if not q:
raise HTTPException(400, "Query parameter 'q' is required")
if not url.startswith(("http://", "https://")) and not url.startswith(
("raw:", "raw://")
):
url = "https://" + url
answer = await handle_llm_qa(url, q, config)
return JSONResponse({"answer": answer})
@app.get("/schema",
summary="Get Configuration Schemas",
description="Get JSON schemas for BrowserConfig and CrawlerRunConfig.",
response_description="Configuration schemas",
tags=["Utility"]
)
async def get_schema():
"""
Get JSON schemas for configuration objects.
Returns the complete schemas for BrowserConfig and CrawlerRunConfig,
showing all available configuration options and their types.
**Response:**
```json
{
"browser": {
"type": "object",
"properties": {
"headless": {"type": "boolean", "default": true},
"verbose": {"type": "boolean", "default": false},
...
}
},
"crawler": {
"type": "object",
"properties": {
"word_count_threshold": {"type": "integer", "default": 10},
"wait_for": {"type": "string"},
...
}
}
}
```
**Usage:**
```python
response = requests.get(
"http://localhost:11235/schema",
headers={"Authorization": f"Bearer {token}"}
)
schemas = response.json()
print(schemas["browser"]) # BrowserConfig schema
print(schemas["crawler"]) # CrawlerRunConfig schema
```
**Notes:**
- No authentication required
- Shows all available configuration options
- Includes default values and types
- Useful for building configuration UIs
"""
from crawl4ai import BrowserConfig, CrawlerRunConfig
return {"browser": BrowserConfig().dump(), "crawler": CrawlerRunConfig().dump()}
@app.get("/hooks/info")
async def get_hooks_info():
"""Get information about available hook points and their signatures"""
from hook_manager import UserHookManager
hook_info = {}
for hook_point, params in UserHookManager.HOOK_SIGNATURES.items():
hook_info[hook_point] = {
"parameters": params,
"description": get_hook_description(hook_point),
"example": get_hook_example(hook_point),
}
return JSONResponse(
{
"available_hooks": hook_info,
"timeout_limits": {"min": 1, "max": 120, "default": 30},
}
)
def get_hook_description(hook_point: str) -> str:
"""Get description for each hook point"""
descriptions = {
"on_browser_created": "Called after browser instance is created",
"on_page_context_created": "Called after page and context are created - ideal for authentication",
"before_goto": "Called before navigating to the target URL",
"after_goto": "Called after navigation is complete",
"on_user_agent_updated": "Called when user agent is updated",
"on_execution_started": "Called when custom JavaScript execution begins",
"before_retrieve_html": "Called before retrieving the final HTML - ideal for scrolling",
"before_return_html": "Called just before returning the HTML content",
}
return descriptions.get(hook_point, "")
def get_hook_example(hook_point: str) -> str:
"""Get example code for each hook point"""
examples = {
"on_page_context_created": """async def hook(page, context, **kwargs):
# Add authentication cookie
await context.add_cookies([{
'name': 'session',
'value': 'my-session-id',
'domain': '.example.com'
}])
return page""",
"before_retrieve_html": """async def hook(page, context, **kwargs):
# Scroll to load lazy content
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(2000)
return page""",
"before_goto": """async def hook(page, context, url, **kwargs):
# Set custom headers
await page.set_extra_http_headers({
'X-Custom-Header': 'value'
})
return page""",
}
return examples.get(hook_point, "# Implement your hook logic here\nreturn page")
@app.get(config["observability"]["health_check"]["endpoint"],
summary="Health Check",
description="Check if the API server is running and healthy.",
response_description="Health status with timestamp and version",
tags=["Utility"]
)
async def health():
"""
Health check endpoint.
Returns the current health status of the API server, including
timestamp and version information.
**Response:**
```json
{
"status": "ok",
"timestamp": 1704067200.0,
"version": "0.4.0"
}
```
**Usage:**
```python
response = requests.get("http://localhost:11235/health")
print(response.json())
```
```bash
curl http://localhost:11235/health
```
**Notes:**
- No authentication required
- Returns 200 OK if server is healthy
- Use for monitoring and load balancer checks
"""
return {"status": "ok", "timestamp": time.time(), "version": __version__}
@app.get(config["observability"]["prometheus"]["endpoint"],
summary="Prometheus Metrics",
description="Get Prometheus-formatted metrics for monitoring.",
response_description="Prometheus metrics",
tags=["Utility"]
)
async def metrics():
"""
Get Prometheus metrics.
Returns Prometheus-formatted metrics for monitoring API performance,
including request counts, latencies, and error rates.
**Response:**
```
# HELP http_requests_total Total HTTP requests
# TYPE http_requests_total counter
http_requests_total{method="POST",path="/crawl",status="200"} 42
# HELP http_request_duration_seconds HTTP request latency
# TYPE http_request_duration_seconds histogram
http_request_duration_seconds_bucket{le="0.5"} 38
...
```
**Usage:**
```python
response = requests.get("http://localhost:11235/metrics")
print(response.text)
```
```bash
curl http://localhost:11235/metrics
```
**Notes:**
- No authentication required
- Returns metrics in Prometheus exposition format
- Configure Prometheus to scrape this endpoint
- Includes request counts, latencies, and errors
"""
return RedirectResponse(config["observability"]["prometheus"]["endpoint"])
@app.post("/crawl",
summary="Crawl URLs",
description="Main endpoint for crawling one or more URLs and extracting content.",
response_description="Crawl results with extracted content, metadata, and media",
tags=["Core Crawling"]
)
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("crawl")
async def crawl(
request: Request,
crawl_request: CrawlRequest | CrawlRequestWithHooks,
_td: Dict = Depends(token_dep),
):
"""
Crawl one or more URLs and extract content.
This is the main crawling endpoint that fetches pages, extracts content, and returns
structured data including HTML, markdown, links, media, and metadata.
**Request Body:**
```json
{
"urls": ["https://example.com"],
"browser_config": {
"headless": true,
"viewport_width": 1920,
"viewport_height": 1080
},
"crawler_config": {
"word_count_threshold": 10,
"wait_until": "networkidle",
"screenshot": true,
"pdf": false
},
"dispatcher": "memory_adaptive",
"anti_bot_strategy": "stealth",
"proxy_rotation_strategy": "round_robin",
"proxies": ["http://proxy1:8080"]
}
```
**Response:**
```json
{
"success": true,
"results": [
{
"url": "https://example.com",
"html": "<html>...</html>",
"markdown": "# Example Domain\\n\\nThis domain is...",
"cleaned_html": "<div>...</div>",
"screenshot": "base64_encoded_image",
"success": true,
"status_code": 200,
"metadata": {
"title": "Example Domain",
"description": "Example description"
},
"links": {
"internal": ["https://example.com/about"],
"external": ["https://other.com"]
},
"media": {
"images": [{"src": "image.jpg", "alt": "Image"}]
}
}
]
}
```
**Configuration Options:**
*Browser Config:*
- `headless`: Run browser in headless mode (default: true)
- `viewport_width`: Browser width in pixels (default: 1920)
- `viewport_height`: Browser height in pixels (default: 1080)
- `user_agent`: Custom user agent string
- `java_script_enabled`: Enable JavaScript (default: true)
*Crawler Config:*
- `word_count_threshold`: Minimum words per content block (default: 10)
- `wait_until`: Page load strategy ("networkidle", "domcontentloaded", "load")
- `wait_for`: CSS selector to wait for before extraction
- `screenshot`: Capture page screenshot (base64 encoded)
- `pdf`: Generate PDF export
- `remove_overlay_elements`: Remove popups/modals automatically
- `css_selector`: Extract only specific elements
- `js_code`: Execute custom JavaScript before extraction
*Dispatcher Options:*
- `memory_adaptive`: Dynamic concurrency based on memory usage (recommended)
- `semaphore`: Fixed concurrency limit
*Anti-Bot Strategies:*
- `stealth`: Basic stealth mode
- `undetected`: Maximum evasion techniques
*Proxy Rotation:*
- `round_robin`: Sequential proxy rotation
- `random`: Random proxy selection
**Usage Examples:**
```python
import requests
response = requests.post(
"http://localhost:11235/crawl",
headers={"Authorization": f"Bearer {token}"},
json={
"urls": ["https://example.com"],
"browser_config": {"headless": True},
"crawler_config": {"screenshot": True},
"dispatcher": "memory_adaptive"
}
)
data = response.json()
if data["success"]:
result = data["results"][0]
print(f"Title: {result['metadata']['title']}")
print(f"Content: {result['markdown'][:200]}...")
```
**Notes:**
- For streaming responses with real-time progress, use `/crawl/stream`
- Set `stream: true` in crawler_config to auto-redirect to streaming endpoint
- All URLs must start with http:// or https://
- Rate limiting applies (default: 100 requests/minute)
- Supports custom hooks for advanced processing
"""
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
# If the crawler config requests streaming, hand off to the streaming pipeline
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
if crawler_config.stream:
return await stream_process(crawl_request=crawl_request)
# Prepare hooks config if provided
hooks_config = None
if hasattr(crawl_request, 'hooks') and crawl_request.hooks:
hooks_config = {
"code": crawl_request.hooks.code,
"timeout": crawl_request.hooks.timeout,
}
# Get dispatcher from app state
dispatcher_type = crawl_request.dispatcher.value if crawl_request.dispatcher else app.state.default_dispatcher_type
dispatcher = app.state.dispatchers.get(dispatcher_type)
if not dispatcher:
raise HTTPException(
500,
f"Dispatcher '{dispatcher_type}' not available. Available dispatchers: {list(app.state.dispatchers.keys())}"
)
results = await handle_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config,
hooks_config=hooks_config,
anti_bot_strategy=crawl_request.anti_bot_strategy,
headless=crawl_request.headless,
proxy_rotation_strategy=crawl_request.proxy_rotation_strategy,
proxies=crawl_request.proxies,
proxy_failure_threshold=crawl_request.proxy_failure_threshold,
proxy_recovery_time=crawl_request.proxy_recovery_time,
dispatcher=dispatcher,
)
# check if all of the results are not successful
if results["results"] and all(not result["success"] for result in results["results"]):
error_message = results['results'][0].get('error_message', 'Unknown error') if results['results'] else 'No results returned'
raise HTTPException(
500, f"Crawl request failed: {error_message}"
)
return JSONResponse(results)
@app.post("/crawl/stream",
summary="Crawl URLs with Streaming",
description="Stream crawl progress in real-time using Server-Sent Events (SSE).",
response_description="Server-Sent Events stream with progress updates and results",
tags=["Core Crawling"]
)
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl_stream(
request: Request,
crawl_request: CrawlRequestWithHooks,
_td: Dict = Depends(token_dep),
):
"""
Crawl URLs with real-time streaming progress updates.
This endpoint streams results as newline-delimited JSON (NDJSON), so you can
process each URL's result as soon as it is ready during long-running crawl operations.
**Request Body:**
Same as `/crawl` endpoint.
**Response Stream:**
Newline-delimited JSON (`application/x-ndjson`): one JSON object per line, emitted as each URL finishes:
```
{"url": "https://example.com", "success": true, "status_code": 200, "markdown": "..."}
{"url": "https://example.org", "success": true, "status_code": 200, "markdown": "..."}
```
Each line is a serialized crawl result for a single URL; parse it with any JSON decoder as it arrives.
**Usage Examples:**
*Python with requests:*
```python
import requests
import json
response = requests.post(
    "http://localhost:11235/crawl/stream",
    headers={
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    },
    json={"urls": ["https://example.com"]},
    stream=True
)
for line in response.iter_lines():
    if line:
        result = json.loads(line)
        print(f"Result for {result.get('url')}: success={result.get('success')}")
```
*JavaScript (reading the NDJSON stream with fetch):*
```javascript
const response = await fetch('http://localhost:11235/crawl/stream', {
  method: 'POST',
  headers: {
    'Authorization': `Bearer ${token}`,
    'Content-Type': 'application/json'
  },
  body: JSON.stringify({urls: ['https://example.com']})
});
const reader = response.body.getReader();
const decoder = new TextDecoder();
let buffer = '';
while (true) {
  const {done, value} = await reader.read();
  if (done) break;
  buffer += decoder.decode(value, {stream: true});
  const lines = buffer.split('\n');
  buffer = lines.pop();
  for (const line of lines) {
    if (line.trim()) console.log('Result:', JSON.parse(line));
  }
}
```
**Benefits:**
- Real-time progress monitoring
- Immediate feedback on each URL
- Better for long-running operations
- Can process results as they arrive
**Notes:**
- Response uses `application/x-ndjson` content type
- Keep the connection open to receive all results
- Connection automatically closes after completion
- Use `/crawl` for simple batch operations without streaming
"""
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
return await stream_process(crawl_request=crawl_request)
async def stream_process(crawl_request: CrawlRequestWithHooks):
# Prepare hooks config if provided
hooks_config = None
if hasattr(crawl_request, 'hooks') and crawl_request.hooks:
hooks_config = {
"code": crawl_request.hooks.code,
"timeout": crawl_request.hooks.timeout,
}
# Get dispatcher from app state
dispatcher_type = crawl_request.dispatcher.value if crawl_request.dispatcher else app.state.default_dispatcher_type
dispatcher = app.state.dispatchers.get(dispatcher_type)
if not dispatcher:
raise HTTPException(
500,
f"Dispatcher '{dispatcher_type}' not available. Available dispatchers: {list(app.state.dispatchers.keys())}"
)
crawler, gen, hooks_info = await handle_stream_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config,
hooks_config=hooks_config,
anti_bot_strategy=crawl_request.anti_bot_strategy,
headless=crawl_request.headless,
proxy_rotation_strategy=crawl_request.proxy_rotation_strategy,
proxies=crawl_request.proxies,
proxy_failure_threshold=crawl_request.proxy_failure_threshold,
proxy_recovery_time=crawl_request.proxy_recovery_time,
dispatcher=dispatcher,
)
# Add hooks info to response headers if available
headers = {
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Stream-Status": "active",
}
if hooks_info:
import json
headers["X-Hooks-Status"] = json.dumps(hooks_info["status"]["status"])
return StreamingResponse(
stream_results(crawler, gen),
media_type="application/x-ndjson",
headers=headers,
)
# ============================================================================
# HTTP Crawling Endpoints
# ============================================================================
@app.post("/crawl/http",
summary="Crawl URLs with HTTP-only strategy",
description="Crawl one or more URLs using a fast, lightweight HTTP-only strategy without browser rendering.",
response_description="Crawl results with extracted content, metadata, and media",
tags=["HTTP Crawling"]
)
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl_http(
request: Request,
crawl_request: HTTPCrawlRequest | HTTPCrawlRequestWithHooks,
_td: Dict = Depends(token_dep),
):
"""
Crawl one or more URLs using HTTP-only strategy.
This endpoint provides fast, lightweight crawling without browser rendering.
Perfect for static websites, APIs, and content that doesn't require JavaScript execution.
**Request Body:**
```json
{
"urls": ["https://api.example.com/data"],
"http_config": {
"method": "GET",
"headers": {"Accept": "application/json"},
"timeout": 30
},
"crawler_config": {
"word_count_threshold": 10,
"extraction_strategy": "NoExtractionStrategy"
},
"dispatcher": "memory_adaptive"
}
```
**Response:**
```json
{
"success": true,
"results": [
{
"url": "https://api.example.com/data",
"html": "<html>...</html>",
"markdown": "# API Response\\n\\n...",
"success": true,
"status_code": 200,
"metadata": {
"title": "API Data",
"description": "JSON response data"
}
}
],
"server_processing_time_s": 0.85,
"server_memory_delta_mb": 2.1
}
```
**HTTP Config Options:**
- `method`: HTTP method ("GET", "POST", etc.) (default: "GET")
- `headers`: Custom HTTP headers
- `data`: Form data for POST requests
- `json`: JSON data for POST requests
- `follow_redirects`: Whether to follow redirects (default: true)
- `verify_ssl`: Whether to verify SSL certificates (default: true)
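**Usage (illustrative sketch; assumes a server on localhost:11235 and a valid token):**
```python
response = requests.post(
    "http://localhost:11235/crawl/http",
    headers={"Authorization": f"Bearer {token}"},
    json={
        "urls": ["https://example.com"],
        "http_config": {"method": "GET", "headers": {"Accept": "text/html"}},
    },
)
data = response.json()
if data["success"]:
    print(data["results"][0]["markdown"][:200])
```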
**Notes:**
- Substantially faster than browser-based crawling
- No JavaScript execution or browser rendering
- Ideal for APIs, static sites, and sitemaps
- For streaming results, use `/crawl/http/stream`
"""
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
# Prepare hooks config if provided
hooks_config = None
if hasattr(crawl_request, 'hooks') and crawl_request.hooks:
hooks_config = {
"code": crawl_request.hooks.code,
"timeout": crawl_request.hooks.timeout,
}
# Get dispatcher from app state
dispatcher_type = crawl_request.dispatcher.value if crawl_request.dispatcher else app.state.default_dispatcher_type
dispatcher = app.state.dispatchers.get(dispatcher_type)
if not dispatcher:
raise HTTPException(
500,
f"Dispatcher '{dispatcher_type}' not available. Available dispatchers: {list(app.state.dispatchers.keys())}"
)
results = await handle_http_crawl_request(
urls=crawl_request.urls,
http_config=crawl_request.http_config,
crawler_config=crawl_request.crawler_config,
config=config,
hooks_config=hooks_config,
dispatcher=dispatcher,
)
return results
@app.post("/crawl/http/stream",
summary="Crawl URLs with HTTP-only strategy (streaming)",
description="Stream HTTP-only crawl progress in real-time using Server-Sent Events (SSE).",
response_description="Server-Sent Events stream with progress updates and results",
tags=["HTTP Crawling"]
)
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl_http_stream(
request: Request,
crawl_request: HTTPCrawlRequestWithHooks,
_td: Dict = Depends(token_dep),
):
"""
Stream HTTP-only crawl progress in real-time.
This endpoint streams results as newline-delimited JSON (NDJSON), so you can
process each URL's result as soon as the fast HTTP-based crawler produces it.
**Request Body:**
Same as `/crawl/http` endpoint.
**Response Stream:**
Newline-delimited JSON (`application/x-ndjson`): one JSON object per line, emitted as each URL finishes:
```
{"url": "https://api.example.com/data", "success": true, "status_code": 200, "markdown": "..."}
```
**Benefits:**
- Real-time progress monitoring for HTTP crawls
- Immediate feedback on each URL
- Lightweight and fast streaming
- Can process results as they arrive
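**Usage (illustrative sketch; assumes a server on localhost:11235 and a valid token):**
```python
import requests
import json
response = requests.post(
    "http://localhost:11235/crawl/http/stream",
    headers={"Authorization": f"Bearer {token}"},
    json={"urls": ["https://example.com"]},
    stream=True,
)
for line in response.iter_lines():
    if line:
        print(json.loads(line))  # one NDJSON object per crawled URL
```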
"""
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
return await http_stream_process(crawl_request=crawl_request)
async def http_stream_process(crawl_request: HTTPCrawlRequestWithHooks):
# Prepare hooks config if provided
hooks_config = None
if hasattr(crawl_request, 'hooks') and crawl_request.hooks:
hooks_config = {
"code": crawl_request.hooks.code,
"timeout": crawl_request.hooks.timeout,
}
# Get dispatcher from app state
dispatcher_type = crawl_request.dispatcher.value if crawl_request.dispatcher else app.state.default_dispatcher_type
dispatcher = app.state.dispatchers.get(dispatcher_type)
if not dispatcher:
raise HTTPException(
500,
f"Dispatcher '{dispatcher_type}' not available. Available dispatchers: {list(app.state.dispatchers.keys())}"
)
crawler, gen, hooks_info = await handle_http_stream_crawl_request(
urls=crawl_request.urls,
http_config=crawl_request.http_config,
crawler_config=crawl_request.crawler_config,
config=config,
hooks_config=hooks_config,
dispatcher=dispatcher,
)
# Add hooks info to response headers if available
headers = {
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Stream-Status": "active",
}
if hooks_info:
import json
headers["X-Hooks-Status"] = json.dumps(hooks_info["status"]["status"])
return StreamingResponse(
stream_http_results(gen),
media_type="application/x-ndjson",
headers=headers,
)
async def stream_http_results(results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
"""Stream HTTP results as NDJSON (dicts already)."""
import json
try:
async for result in results_gen:
try:
data = json.dumps(result) + "\n"
yield data.encode("utf-8")
except Exception as e:
error_response = {"error": str(e), "url": "unknown"}
yield (json.dumps(error_response) + "\n").encode("utf-8")
except asyncio.CancelledError:
pass
def chunk_code_functions(code_md: str) -> List[str]:
pattern = re.compile(
# match "## File: <path>" then a ```py fence, then capture until the closing ```
r"##\s*File:\s*(?P<path>.+?)\s*?\r?\n" # file header
r"```py\s*?\r?\n" # opening fence
r"(?P<code>.*?)(?=\r?\n```)", # code block
re.DOTALL,
)
chunks: List[str] = []
for m in pattern.finditer(code_md):
file_path = m.group("path").strip()
code_blk = m.group("code")
tree = ast.parse(code_blk)
lines = code_blk.splitlines()
for node in tree.body:
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
start = node.lineno - 1
end = getattr(node, "end_lineno", start + 1)
snippet = "\n".join(lines[start:end])
chunks.append(f"# File: {file_path}\n{snippet}")
return chunks
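# Illustrative sketch of the expected input/output (hypothetical markdown):
#   ## File: crawl4ai/foo.py
#   ```py
#   def bar():
#       return 42
#   ```
# chunk_code_functions() returns one chunk per top-level function/class in each
# fenced block, prefixed with its "# File: crawl4ai/foo.py" header.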
def chunk_doc_sections(doc: str) -> List[str]:
lines = doc.splitlines(keepends=True)
sections = []
current: List[str] = []
for line in lines:
if re.match(r"^#{1,6}\s", line):
if current:
sections.append("".join(current))
current = [line]
else:
current.append(line)
if current:
sections.append("".join(current))
return sections
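# Illustrative sketch (hypothetical input):
#   chunk_doc_sections("# Intro\ntext\n## Usage\nmore\n")
#   -> ["# Intro\ntext\n", "## Usage\nmore\n"]
# Every markdown heading (#..######) starts a new section; any text before the
# first heading is kept as its own leading section.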
@app.get("/ask")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("ask")
async def get_context(
request: Request,
_td: Dict = Depends(token_dep),
context_type: str = Query("all", regex="^(code|doc|all)$"),
query: Optional[str] = Query(None, description="search query to filter chunks"),
score_ratio: float = Query(
0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"
),
max_results: int = Query(20, ge=1, description="absolute cap on returned chunks"),
):
"""
This endpoint is designed for questions about the Crawl4AI library. It returns plain-text markdown with extensive information about Crawl4AI.
You can use it as context for any AI assistant; use this endpoint from AI assistants to retrieve library context for decision making or code-generation tasks.
It is always best practice to provide a query to filter the context; otherwise the response will be very long.
Parameters:
- context_type: Specify "code" for code context, "doc" for documentation context, or "all" for both.
- query: RECOMMENDED search query to filter paragraphs using BM25. You can leave this empty to get all the context.
- score_ratio: Minimum score as a fraction of the maximum score for filtering results.
- max_results: Maximum number of results to return. Default is 20.
Returns:
- JSON response with the requested context.
- If "code" is specified, returns the code context.
- If "doc" is specified, returns the documentation context.
- If "all" is specified, returns both code and documentation contexts.
"""
# load contexts
base = os.path.dirname(__file__)
code_path = os.path.join(base, "c4ai-code-context.md")
doc_path = os.path.join(base, "c4ai-doc-context.md")
if not os.path.exists(code_path) or not os.path.exists(doc_path):
raise HTTPException(404, "Context files not found")
with open(code_path, "r") as f:
code_content = f.read()
with open(doc_path, "r") as f:
doc_content = f.read()
# if no query, just return raw contexts
if not query:
if context_type == "code":
return JSONResponse({"code_context": code_content})
if context_type == "doc":
return JSONResponse({"doc_context": doc_content})
return JSONResponse(
{
"code_context": code_content,
"doc_context": doc_content,
}
)
tokens = query.split()
results: Dict[str, List[Dict[str, float]]] = {}
# code BM25 over functions/classes
if context_type in ("code", "all"):
code_chunks = chunk_code_functions(code_content)
bm25 = BM25Okapi([c.split() for c in code_chunks])
scores = bm25.get_scores(tokens)
max_sc = float(scores.max()) if scores.size > 0 else 0.0
cutoff = max_sc * score_ratio
picked = [(c, s) for c, s in zip(code_chunks, scores) if s >= cutoff]
picked = sorted(picked, key=lambda x: x[1], reverse=True)[:max_results]
results["code_results"] = [{"text": c, "score": s} for c, s in picked]
# doc BM25 over markdown sections
if context_type in ("doc", "all"):
sections = chunk_doc_sections(doc_content)
bm25d = BM25Okapi([sec.split() for sec in sections])
scores_d = bm25d.get_scores(tokens)
max_sd = float(scores_d.max()) if scores_d.size > 0 else 0.0
cutoff_d = max_sd * score_ratio
idxs = [i for i, s in enumerate(scores_d) if s >= cutoff_d]
neighbors = set(i for idx in idxs for i in (idx - 1, idx, idx + 1))
valid = [i for i in sorted(neighbors) if 0 <= i < len(sections)]
valid = valid[:max_results]
results["doc_results"] = [
{"text": sections[i], "score": scores_d[i]} for i in valid
]
return JSONResponse(results)
# attach MCP layer (adds /mcp/ws, /mcp/sse, /mcp/schema)
print(f"MCP server running on {config['app']['host']}:{config['app']['port']}")
attach_mcp(app, base_url=f"http://{config['app']['host']}:{config['app']['port']}")
# ────────────────────────── cli ──────────────────────────────
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"server:app",
host=config["app"]["host"],
port=config["app"]["port"],
reload=config["app"]["reload"],
timeout_keep_alive=config["app"]["timeout_keep_alive"],
)
# ─────────────────────────────────────────────────────────────