merge next. Resolve conflicts. Fix some import errors and error handling in server.py
This commit is contained in:
@@ -1,8 +1,20 @@
|
||||
# Import from auth.py
|
||||
from auth import create_access_token, get_token_dependency, TokenRequest
|
||||
from api import (
|
||||
handle_markdown_request,
|
||||
handle_llm_qa,
|
||||
handle_stream_crawl_request,
|
||||
handle_crawl_request,
|
||||
stream_results,
|
||||
_get_memory_mb
|
||||
)
|
||||
from utils import FilterType, load_config, setup_logging, verify_email_domain
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import List, Optional, Dict
|
||||
from fastapi import FastAPI, HTTPException, Request, Query, Path, Depends
|
||||
from typing import List, Optional, Dict, AsyncGenerator
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI, HTTPException, Request, Query, Path, Depends, status
|
||||
from fastapi.responses import StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
|
||||
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
|
||||
from fastapi.middleware.trustedhost import TrustedHostMiddleware
|
||||
@@ -11,28 +23,40 @@ from slowapi import Limiter
|
||||
from slowapi.util import get_remote_address
|
||||
from prometheus_fastapi_instrumentator import Instrumentator
|
||||
from redis import asyncio as aioredis
|
||||
from crawl4ai import (
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
AsyncLogger
|
||||
)
|
||||
|
||||
from crawler_manager import (
|
||||
CrawlerManager,
|
||||
CrawlerManagerConfig,
|
||||
PoolTimeoutError,
|
||||
NoHealthyCrawlerError
|
||||
)
|
||||
import json
|
||||
|
||||
|
||||
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
|
||||
from utils import FilterType, load_config, setup_logging, verify_email_domain
|
||||
from api import (
|
||||
handle_markdown_request,
|
||||
handle_llm_qa,
|
||||
handle_stream_crawl_request,
|
||||
handle_crawl_request,
|
||||
stream_results
|
||||
)
|
||||
from auth import create_access_token, get_token_dependency, TokenRequest # Import from auth.py
|
||||
|
||||
__version__ = "0.2.6"
|
||||
|
||||
|
||||
class CrawlRequest(BaseModel):
    """Request body accepted by the ``/crawl`` and ``/crawl/stream`` endpoints.

    Attributes are declared with ``Field`` constraints: ``urls`` is bounded
    to between 1 and 100 entries, and both config mappings fall back to a
    fresh empty dict when the client omits them.
    """

    # URLs to crawl in this request.
    urls: List[str] = Field(min_length=1, max_length=100)
    # Optional browser settings supplied by the client.
    browser_config: Optional[Dict] = Field(default_factory=dict)
    # Optional crawler-run settings supplied by the client.
    crawler_config: Optional[Dict] = Field(default_factory=dict)
|
||||
|
||||
|
||||
# Load the server configuration first: logging, the logger and the Redis
# client below are all derived from it.
config = load_config()
setup_logging(config)

_logging_settings = config["logging"]
logger = AsyncLogger(
    log_file=_logging_settings.get("log_file", "app.log"),
    verbose=_logging_settings.get("verbose", False),
    tag_width=10,
)

# Initialize Redis (falls back to a local instance when no URI is configured)
_redis_uri = config["redis"].get("uri", "redis://localhost")
redis = aioredis.from_url(_redis_uri)
|
||||
@@ -44,9 +68,43 @@ limiter = Limiter(
|
||||
storage_uri=config["rate_limiting"]["storage_uri"]
|
||||
)
|
||||
|
||||
# --- Crawler manager construction ---
# Only the objects are built here; ``crawler_manager.initialize()`` is awaited
# later, inside the ``lifespan`` context manager.

# The "crawler_pool" section of the main config is optional.
manager_config_dict = config.get("crawler_pool", {})

# Parse/validate the raw mapping through the manager's config model.
manager_config = CrawlerManagerConfig(**manager_config_dict)

crawler_manager = CrawlerManager(
    config=manager_config,
    logger=logger,
)
|
||||
|
||||
# --- FastAPI App and Lifespan ---
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run crawler-manager startup and shutdown around the server's lifetime.

    Startup: when ``manager_config.enabled`` is true, the module-level
    ``crawler_manager`` is initialized and stored on
    ``app.state.crawler_manager``; otherwise that attribute is set to
    ``None`` so request handlers can detect the disabled state.

    Shutdown: the manager stored on ``app.state`` (if any) is shut down.
    The shutdown section lives in a ``finally`` block so it also runs when
    an exception is raised into this context manager at the ``yield``.

    Args:
        app: The FastAPI application whose ``state`` carries the manager.
    """
    # Startup
    logger.info("Starting up the server...")
    if manager_config.enabled:
        logger.info("Initializing Crawler Manager...")
        await crawler_manager.initialize()
        app.state.crawler_manager = crawler_manager  # Store manager in app state
        logger.info("Crawler Manager is enabled.")
    else:
        logger.warning("Crawler Manager is disabled.")
        app.state.crawler_manager = None  # Indicate disabled state

    try:
        yield  # Server runs here
    finally:
        # Shutdown
        logger.info("Shutting down server...")
        if app.state.crawler_manager:
            logger.info("Shutting down Crawler Manager...")
            await app.state.crawler_manager.shutdown()
            logger.info("Crawler Manager shut down.")
        logger.info("Server shut down.")
|
||||
|
||||
# Application object. ``lifespan`` wires crawler-manager startup/shutdown into
# the server lifecycle. Only one ``version`` keyword may be passed: a repeated
# keyword argument is a syntax error.
app = FastAPI(
    title=config["app"]["title"],
    version=config["app"]["version"],
    lifespan=lifespan,
)
|
||||
|
||||
# Configure middleware
|
||||
@@ -56,7 +114,9 @@ def setup_security_middleware(app, config):
|
||||
if sec_config.get("https_redirect", False):
|
||||
app.add_middleware(HTTPSRedirectMiddleware)
|
||||
if sec_config.get("trusted_hosts", []) != ["*"]:
|
||||
app.add_middleware(TrustedHostMiddleware, allowed_hosts=sec_config["trusted_hosts"])
|
||||
app.add_middleware(TrustedHostMiddleware,
|
||||
allowed_hosts=sec_config["trusted_hosts"])
|
||||
|
||||
|
||||
setup_security_middleware(app, config)
|
||||
|
||||
@@ -68,6 +128,8 @@ if config["observability"]["prometheus"]["enabled"]:
|
||||
token_dependency = get_token_dependency(config)
|
||||
|
||||
# Middleware for security headers
|
||||
|
||||
|
||||
@app.middleware("http")
|
||||
async def add_security_headers(request: Request, call_next):
|
||||
response = await call_next(request)
|
||||
@@ -75,7 +137,24 @@ async def add_security_headers(request: Request, call_next):
|
||||
response.headers.update(config["security"]["headers"])
|
||||
return response
|
||||
|
||||
|
||||
async def get_manager() -> CrawlerManager:
    """FastAPI dependency that returns the active crawler manager.

    Returns:
        The manager stored on ``app.state.crawler_manager``.

    Raises:
        HTTPException: 503 when no manager is stored on ``app.state`` (the
            attribute is missing or ``None``), or when the stored manager
            reports that it is not enabled.
    """
    # A missing attribute and an explicit None are treated the same way.
    manager = getattr(app.state, 'crawler_manager', None)
    if manager is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Crawler service is disabled or not initialized",
        )

    if manager.is_enabled():
        return manager

    raise HTTPException(
        status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
        detail="Crawler service is currently disabled",
    )
|
||||
|
||||
# Token endpoint (always available, but usage depends on config)
|
||||
|
||||
|
||||
@app.post("/token")
|
||||
async def get_token(request_data: TokenRequest):
|
||||
if not verify_email_domain(request_data.email):
|
||||
@@ -84,6 +163,8 @@ async def get_token(request_data: TokenRequest):
|
||||
return {"email": request_data.email, "access_token": token, "token_type": "bearer"}
|
||||
|
||||
# Endpoints with conditional auth
|
||||
|
||||
|
||||
@app.get("/md/{url:path}")
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
async def get_markdown(
|
||||
@@ -97,6 +178,7 @@ async def get_markdown(
|
||||
result = await handle_markdown_request(url, f, q, c, config)
|
||||
return PlainTextResponse(result)
|
||||
|
||||
|
||||
@app.get("/llm/{url:path}", description="URL should be without http/https prefix")
|
||||
async def llm_endpoint(
|
||||
request: Request,
|
||||
@@ -110,36 +192,89 @@ async def llm_endpoint(
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.get("/schema")
async def get_schema():
    """Return the dumped default browser and crawler-run configurations.

    Returns:
        A dict with keys ``"browser"`` and ``"crawler"`` holding the result
        of ``dump()`` on freshly constructed config objects.
    """
    from crawl4ai import BrowserConfig, CrawlerRunConfig

    browser_defaults = BrowserConfig().dump()
    crawler_defaults = CrawlerRunConfig().dump()
    return {"browser": browser_defaults, "crawler": crawler_defaults}
|
||||
|
||||
|
||||
@app.get(config["observability"]["health_check"]["endpoint"])
async def health():
    """Liveness probe: report an "ok" status, the current time and the version."""
    payload = {
        "status": "ok",
        "timestamp": time.time(),
        "version": __version__,
    }
    return payload
|
||||
|
||||
|
||||
@app.get(config["observability"]["prometheus"]["endpoint"])
async def metrics():
    """Redirect requests for the configured Prometheus endpoint.

    NOTE(review): the redirect target is the same config value this route is
    mounted on. Confirm that another handler serves this path first;
    otherwise this response points back at itself.
    """
    prometheus_endpoint = config["observability"]["prometheus"]["endpoint"]
    return RedirectResponse(url=prometheus_endpoint)
|
||||
|
||||
|
||||
async def _get_optional_manager() -> Optional[CrawlerManager]:
    """Return the crawler manager, or ``None`` when it is absent or disabled.

    Unlike ``get_manager`` this never raises, so a status endpoint can still
    answer (and report the disabled state) while the crawler is off.
    """
    manager = getattr(app.state, "crawler_manager", None)
    if manager is None or not manager.is_enabled():
        return None
    return manager


# "/browsers" is the correctly spelled path; the original misspelled
# "/browswers" route is kept so existing clients keep working.
# NOTE(review): this function reuses the name ``health`` and therefore rebinds
# the module-level name defined by the health-check endpoint. The name is left
# unchanged here to avoid altering the module's public names.
@app.get("/browsers")
@app.get("/browswers")
async def health(
    manager: Optional[CrawlerManager] = Depends(_get_optional_manager, use_cache=False),
):
    """Report server status together with the crawler manager's status.

    Args:
        manager: The active crawler manager, or ``None`` when the crawler
            service is disabled or not initialized.

    Returns:
        A dict with ``status``, ``timestamp`` and ``version`` plus a
        ``crawler_manager`` entry: the manager's own status, an error
        description if querying it failed, or ``{"status": "disabled"}``.
    """
    base_status = {
        "status": "ok",
        "timestamp": time.time(),
        "version": __version__,
    }

    if manager is None:
        base_status["crawler_manager"] = {"status": "disabled"}
        return base_status

    try:
        base_status["crawler_manager"] = await manager.get_status()
    except Exception as e:
        # Best effort: a failing manager must not take the status endpoint
        # down, so the error is reported in the payload instead.
        base_status["crawler_manager"] = {"status": "error", "detail": str(e)}
    return base_status
|
||||
|
||||
|
||||
@app.post("/crawl")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl(
    request: Request,
    crawl_request: CrawlRequest,
    manager: CrawlerManager = Depends(get_manager),  # Use dependency
    token_data: Optional[Dict] = Depends(token_dependency),  # Keep auth
):
    """Crawl a batch of URLs with a crawler borrowed from the manager.

    Args:
        request: Incoming request (required by the rate limiter decorator).
        crawl_request: URLs plus optional browser/crawler config overrides.
        manager: Active crawler manager, injected by ``get_manager``.
        token_data: Result of the configured token dependency.

    Returns:
        A ``JSONResponse`` wrapping the dict produced by
        ``handle_crawl_request``.

    Raises:
        HTTPException: 400 when no URL is supplied; 503 when the pool times
            out or no healthy crawler is available; 500 for any other
            unexpected failure. HTTP exceptions raised by the handler are
            propagated unchanged.
    """
    if not crawl_request.urls:
        raise HTTPException(
            status_code=400, detail="At least one URL required")

    try:
        # Use the manager's context to get a crawler instance
        async with manager.get_crawler() as active_crawler:
            # Call the actual handler from api.py, passing the acquired crawler
            results_dict = await handle_crawl_request(
                crawler=active_crawler,  # Pass the live crawler instance
                urls=crawl_request.urls,
                # User-provided configs; both fields are Optional, so a None
                # value is normalized to an empty dict.
                browser_config=crawl_request.browser_config or {},
                crawler_config=crawl_request.crawler_config or {},
                config=config,  # Pass the global server config
            )
        return JSONResponse(results_dict)

    except PoolTimeoutError as e:
        logger.warning(f"Request rejected due to pool timeout: {e}")
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,  # Or 429
            detail=f"Crawler resources busy. Please try again later. Timeout: {e}"
        ) from e
    except NoHealthyCrawlerError as e:
        logger.error(f"Request failed as no healthy crawler available: {e}")
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail=f"Crawler service temporarily unavailable: {e}"
        ) from e
    except HTTPException:  # Re-raise HTTP exceptions from handler
        raise
    except Exception as e:
        logger.error(
            f"Unexpected error during batch crawl processing: {e}", exc_info=True)
        # NOTE(review): the detail string embeds the exception text; confirm
        # that exposing it to clients is acceptable.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"An unexpected error occurred: {e}"
        ) from e
|
||||
|
||||
|
||||
@app.post("/crawl/stream")
|
||||
@@ -147,23 +282,114 @@ async def crawl(
|
||||
async def crawl_stream(
|
||||
request: Request,
|
||||
crawl_request: CrawlRequest,
|
||||
manager: CrawlerManager = Depends(get_manager),
|
||||
token_data: Optional[Dict] = Depends(token_dependency)
|
||||
):
|
||||
if not crawl_request.urls:
|
||||
raise HTTPException(status_code=400, detail="At least one URL required")
|
||||
raise HTTPException(
|
||||
status_code=400, detail="At least one URL required")
|
||||
|
||||
crawler, results_gen = await handle_stream_crawl_request(
|
||||
urls=crawl_request.urls,
|
||||
browser_config=crawl_request.browser_config,
|
||||
crawler_config=crawl_request.crawler_config,
|
||||
config=config
|
||||
)
|
||||
try:
|
||||
# NOTE: THIS IS MORE A WORK OF ART THAN ENGINEERING
|
||||
# Acquire the crawler context from the manager
|
||||
# IMPORTANT: The context needs to be active for the *duration* of the stream
|
||||
# This structure might be tricky with FastAPI's StreamingResponse which consumes
|
||||
# the generator *after* the endpoint function returns.
|
||||
|
||||
return StreamingResponse(
|
||||
stream_results(crawler, results_gen),
|
||||
media_type='application/x-ndjson',
|
||||
headers={'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'X-Stream-Status': 'active'}
|
||||
)
|
||||
# --- Option A: Acquire crawler, pass to handler, handler yields ---
|
||||
# (Requires handler NOT to be async generator itself, but return one)
|
||||
# async with manager.get_crawler() as active_crawler:
|
||||
# # Handler returns the generator
|
||||
# _, results_gen = await handle_stream_crawl_request(
|
||||
# crawler=active_crawler,
|
||||
# urls=crawl_request.urls,
|
||||
# browser_config=crawl_request.browser_config or {},
|
||||
# crawler_config=crawl_request.crawler_config or {},
|
||||
# config=config
|
||||
# )
|
||||
# # PROBLEM: `active_crawler` context exits before StreamingResponse uses results_gen
|
||||
# # This releases the semaphore too early.
|
||||
|
||||
# --- Option B: Pass manager to handler, handler uses context internally ---
|
||||
# (Requires modifying handle_stream_crawl_request signature/logic)
|
||||
# This seems cleaner. Let's assume api.py is adapted for this.
|
||||
# We need a way for the generator yielded by stream_results to know when
|
||||
# to release the semaphore.
|
||||
|
||||
# --- Option C: Create a wrapper generator that handles context ---
|
||||
async def stream_wrapper(manager: CrawlerManager, crawl_request: CrawlRequest, config: dict) -> AsyncGenerator[bytes, None]:
|
||||
active_crawler = None
|
||||
try:
|
||||
async with manager.get_crawler() as acquired_crawler:
|
||||
active_crawler = acquired_crawler # Keep reference for cleanup
|
||||
# Call the handler which returns the raw result generator
|
||||
_crawler_ref, results_gen = await handle_stream_crawl_request(
|
||||
crawler=acquired_crawler,
|
||||
urls=crawl_request.urls,
|
||||
browser_config=crawl_request.browser_config or {},
|
||||
crawler_config=crawl_request.crawler_config or {},
|
||||
config=config
|
||||
)
|
||||
# Use the stream_results utility to format and yield
|
||||
async for data_bytes in stream_results(_crawler_ref, results_gen):
|
||||
yield data_bytes
|
||||
except (PoolTimeoutError, NoHealthyCrawlerError) as e:
|
||||
# Yield a final error message in the stream
|
||||
error_payload = {"status": "error", "detail": str(e)}
|
||||
yield (json.dumps(error_payload) + "\n").encode('utf-8')
|
||||
logger.warning(f"Stream request failed: {e}")
|
||||
# Re-raise might be better if StreamingResponse handles it? Test needed.
|
||||
except HTTPException as e: # Catch HTTP exceptions from handler setup
|
||||
error_payload = {"status": "error",
|
||||
"detail": e.detail, "status_code": e.status_code}
|
||||
yield (json.dumps(error_payload) + "\n").encode('utf-8')
|
||||
logger.warning(
|
||||
f"Stream request failed with HTTPException: {e.detail}")
|
||||
except Exception as e:
|
||||
error_payload = {"status": "error",
|
||||
"detail": f"Unexpected stream error: {e}"}
|
||||
yield (json.dumps(error_payload) + "\n").encode('utf-8')
|
||||
logger.error(
|
||||
f"Unexpected error during stream processing: {e}", exc_info=True)
|
||||
# finally:
|
||||
# Ensure crawler cleanup if stream_results doesn't handle it?
|
||||
# stream_results *should* call crawler.close(), but only on the
|
||||
# instance it received. If we pass the *manager* instead, this gets complex.
|
||||
# Let's stick to passing the acquired_crawler and rely on stream_results.
|
||||
|
||||
# Create the generator using the wrapper
|
||||
streaming_generator = stream_wrapper(manager, crawl_request, config)
|
||||
|
||||
return StreamingResponse(
|
||||
streaming_generator, # Use the wrapper
|
||||
media_type='application/x-ndjson',
|
||||
headers={'Cache-Control': 'no-cache',
|
||||
'Connection': 'keep-alive', 'X-Stream-Status': 'active'}
|
||||
)
|
||||
|
||||
except (PoolTimeoutError, NoHealthyCrawlerError) as e:
|
||||
# These might occur if get_crawler fails *before* stream starts
|
||||
# Or if the wrapper re-raises them.
|
||||
logger.warning(f"Stream request rejected before starting: {e}")
|
||||
status_code = status.HTTP_503_SERVICE_UNAVAILABLE # Or 429 for timeout
|
||||
# Don't raise HTTPException here, let the wrapper yield the error message.
|
||||
# If we want to return a non-200 initial status, need more complex handling.
|
||||
# Return an *empty* stream with error headers? Or just let wrapper yield error.
|
||||
|
||||
async def _error_stream(e):
|
||||
error_payload = {"status": "error", "detail": str(e)}
|
||||
yield (json.dumps(error_payload) + "\n").encode('utf-8')
|
||||
return StreamingResponse(_error_stream(e), status_code=status_code, media_type='application/x-ndjson')
|
||||
|
||||
except HTTPException: # Re-raise HTTP exceptions from setup
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Unexpected error setting up stream crawl: {e}", exc_info=True)
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"An unexpected error occurred setting up the stream: {e}"
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
@@ -173,4 +399,4 @@ if __name__ == "__main__":
|
||||
port=config["app"]["port"],
|
||||
reload=config["app"]["reload"],
|
||||
timeout_keep_alive=config["app"]["timeout_keep_alive"]
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user