merge next. Resolve conflicts. Fix some import errors and error handling in server.py

2025-04-19 20:27:47 +05:30
parent d2648eaa39 16b2318242
commit b27bb367e8
23 changed files with 5660 additions and 91 deletions
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -1,8 +1,20 @@
+# Import from auth.py
+from auth import create_access_token, get_token_dependency, TokenRequest
+from api import (
+    handle_markdown_request,
+    handle_llm_qa,
+    handle_stream_crawl_request,
+    handle_crawl_request,
+    stream_results,
+    _get_memory_mb
+)
+from utils import FilterType, load_config, setup_logging, verify_email_domain
 import os
 import sys
 import time
-from typing import List, Optional, Dict
-from fastapi import FastAPI, HTTPException, Request, Query, Path, Depends
+from typing import List, Optional, Dict, AsyncGenerator
+from contextlib import asynccontextmanager
+from fastapi import FastAPI, HTTPException, Request, Query, Path, Depends, status
 from fastapi.responses import StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
 from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
 from fastapi.middleware.trustedhost import TrustedHostMiddleware
@@ -11,28 +23,40 @@ from slowapi import Limiter
 from slowapi.util import get_remote_address
 from prometheus_fastapi_instrumentator import Instrumentator
 from redis import asyncio as aioredis
+from crawl4ai import (
+    BrowserConfig,
+    CrawlerRunConfig,
+    AsyncLogger
+)
+
+from crawler_manager import (
+    CrawlerManager,
+    CrawlerManagerConfig,
+    PoolTimeoutError,
+    NoHealthyCrawlerError
+)
+import json
+

 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-from utils import FilterType, load_config, setup_logging, verify_email_domain
-from api import (
-    handle_markdown_request,
-    handle_llm_qa,
-    handle_stream_crawl_request,
-    handle_crawl_request,
-    stream_results
-)
-from auth import create_access_token, get_token_dependency, TokenRequest  # Import from auth.py

 __version__ = "0.2.6"

+
 class CrawlRequest(BaseModel):
    urls: List[str] = Field(min_length=1, max_length=100)
    browser_config: Optional[Dict] = Field(default_factory=dict)
    crawler_config: Optional[Dict] = Field(default_factory=dict)

+
 # Load configuration and setup
 config = load_config()
 setup_logging(config)
+logger = AsyncLogger(
+    log_file=config["logging"].get("log_file", "app.log"),
+    verbose=config["logging"].get("verbose", False),
+    tag_width=10,
+)

 # Initialize Redis
 redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost"))
@@ -44,9 +68,43 @@ limiter = Limiter(
    storage_uri=config["rate_limiting"]["storage_uri"]
 )

+# --- Create Manager (its async initialize() is awaited in lifespan) ---
+# Load manager config from the main config
+manager_config_dict = config.get("crawler_pool", {})
+# Use Pydantic to parse and validate
+manager_config = CrawlerManagerConfig(**manager_config_dict)
+crawler_manager = CrawlerManager(config=manager_config, logger=logger)
+
+# --- FastAPI App and Lifespan ---
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Startup
+    logger.info("Starting up the server...")
+    if manager_config.enabled:
+        logger.info("Initializing Crawler Manager...")
+        await crawler_manager.initialize()
+        app.state.crawler_manager = crawler_manager  # Store manager in app state
+        logger.info("Crawler Manager is enabled.")
+    else:
+        logger.warning("Crawler Manager is disabled.")
+        app.state.crawler_manager = None  # Indicate disabled state
+
+    yield  # Server runs here
+
+    # Shutdown
+    logger.info("Shutting down server...")
+    if app.state.crawler_manager:
+        logger.info("Shutting down Crawler Manager...")
+        await app.state.crawler_manager.shutdown()
+        logger.info("Crawler Manager shut down.")
+    logger.info("Server shut down.")
+
 app = FastAPI(
    title=config["app"]["title"],
-    version=config["app"]["version"]
+    version=config["app"]["version"],
+    lifespan=lifespan,
 )

 # Configure middleware
@@ -56,7 +114,9 @@ def setup_security_middleware(app, config):
        if sec_config.get("https_redirect", False):
            app.add_middleware(HTTPSRedirectMiddleware)
        if sec_config.get("trusted_hosts", []) != ["*"]:
-            app.add_middleware(TrustedHostMiddleware, allowed_hosts=sec_config["trusted_hosts"])
+            app.add_middleware(TrustedHostMiddleware,
+                               allowed_hosts=sec_config["trusted_hosts"])
+

 setup_security_middleware(app, config)

@@ -68,6 +128,8 @@ if config["observability"]["prometheus"]["enabled"]:
 token_dependency = get_token_dependency(config)

 # Middleware for security headers
+
+
@app.middleware("http")
 async def add_security_headers(request: Request, call_next):
    response = await call_next(request)
@@ -75,7 +137,24 @@ async def add_security_headers(request: Request, call_next):
        response.headers.update(config["security"]["headers"])
    return response

+
+async def get_manager() -> CrawlerManager:
+    # Ensure the manager exists and is enabled before returning it
+    if not hasattr(app.state, 'crawler_manager') or app.state.crawler_manager is None:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Crawler service is disabled or not initialized"
+        )
+    if not app.state.crawler_manager.is_enabled():
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Crawler service is currently disabled"
+        )
+    return app.state.crawler_manager
+
 # Token endpoint (always available, but usage depends on config)
+
+
@app.post("/token")
 async def get_token(request_data: TokenRequest):
    if not verify_email_domain(request_data.email):
@@ -84,6 +163,8 @@ async def get_token(request_data: TokenRequest):
    return {"email": request_data.email, "access_token": token, "token_type": "bearer"}

 # Endpoints with conditional auth
+
+
@app.get("/md/{url:path}")
@limiter.limit(config["rate_limiting"]["default_limit"])
 async def get_markdown(
@@ -97,6 +178,7 @@ async def get_markdown(
    result = await handle_markdown_request(url, f, q, c, config)
    return PlainTextResponse(result)

+
@app.get("/llm/{url:path}", description="URL should be without http/https prefix")
 async def llm_endpoint(
    request: Request,
@@ -110,36 +192,89 @@ async def llm_endpoint(
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

+
@app.get("/schema")
 async def get_schema():
    from crawl4ai import BrowserConfig, CrawlerRunConfig
    return {"browser": BrowserConfig().dump(), "crawler": CrawlerRunConfig().dump()}

+
@app.get(config["observability"]["health_check"]["endpoint"])
 async def health():
    return {"status": "ok", "timestamp": time.time(), "version": __version__}

+
@app.get(config["observability"]["prometheus"]["endpoint"])
 async def metrics():
    return RedirectResponse(url=config["observability"]["prometheus"]["endpoint"])

+
+@app.get("/browswers")
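+# NOTE(review): not optional — get_manager raises 503 when the manager is missing or disabled; "/browswers" looks like a typo of "/browsers" — confirm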
+async def health(manager: Optional[CrawlerManager] = Depends(get_manager, use_cache=False)):
+    base_status = {"status": "ok", "timestamp": time.time(),
+                   "version": __version__}
+    if manager:
+        try:
+            manager_status = await manager.get_status()
+            base_status["crawler_manager"] = manager_status
+        except Exception as e:
+            base_status["crawler_manager"] = {
+                "status": "error", "detail": str(e)}
+    else:
+        base_status["crawler_manager"] = {"status": "disabled"}
+    return base_status
+
+
@app.post("/crawl")
@limiter.limit(config["rate_limiting"]["default_limit"])
 async def crawl(
    request: Request,
    crawl_request: CrawlRequest,
-    token_data: Optional[Dict] = Depends(token_dependency)
+    manager: CrawlerManager = Depends(get_manager),  # Use dependency
+    token_data: Optional[Dict] = Depends(token_dependency)  # Keep auth
 ):
    if not crawl_request.urls:
-        raise HTTPException(status_code=400, detail="At least one URL required")
-    results = await handle_crawl_request(
-        urls=crawl_request.urls,
-        browser_config=crawl_request.browser_config,
-        crawler_config=crawl_request.crawler_config,
-        config=config
-    )
+        raise HTTPException(
+            status_code=400, detail="At least one URL required")

-    return JSONResponse(results)
+    try:
+        # Use the manager's context to get a crawler instance
+        async with manager.get_crawler() as active_crawler:
+            # Call the actual handler from api.py, passing the acquired crawler
+            results_dict = await handle_crawl_request(
+                crawler=active_crawler,  # Pass the live crawler instance
+                urls=crawl_request.urls,
+                # Pass user-provided configs, these might override pool defaults if needed
+                # Or the manager/handler could decide how to merge them
+                browser_config=crawl_request.browser_config or {},  # Ensure dict
+                crawler_config=crawl_request.crawler_config or {},  # Ensure dict
+                config=config  # Pass the global server config
+            )
+            return JSONResponse(results_dict)
+
+    except PoolTimeoutError as e:
+        logger.warning(f"Request rejected due to pool timeout: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,  # Or 429
+            detail=f"Crawler resources busy. Please try again later. Timeout: {e}"
+        )
+    except NoHealthyCrawlerError as e:
+        logger.error(f"Request failed as no healthy crawler available: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail=f"Crawler service temporarily unavailable: {e}"
+        )
+    except HTTPException:  # Re-raise HTTP exceptions from handler
+        raise
+    except Exception as e:
+        logger.error(
+            f"Unexpected error during batch crawl processing: {e}", exc_info=True)
+        # Return generic error, details might be logged by handle_crawl_request
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"An unexpected error occurred: {e}"
+        )


@app.post("/crawl/stream")
@@ -147,23 +282,114 @@ async def crawl(
 async def crawl_stream(
    request: Request,
    crawl_request: CrawlRequest,
+    manager: CrawlerManager = Depends(get_manager),
    token_data: Optional[Dict] = Depends(token_dependency)
 ):
    if not crawl_request.urls:
-        raise HTTPException(status_code=400, detail="At least one URL required")
+        raise HTTPException(
+            status_code=400, detail="At least one URL required")

-    crawler, results_gen = await handle_stream_crawl_request(
-        urls=crawl_request.urls,
-        browser_config=crawl_request.browser_config,
-        crawler_config=crawl_request.crawler_config,
-        config=config
-    )
+    try:
+        # NOTE: this is more a work of art than a piece of engineering.
+        # Acquire the crawler context from the manager
+        # IMPORTANT: The context needs to be active for the *duration* of the stream
+        # This structure might be tricky with FastAPI's StreamingResponse which consumes
+        # the generator *after* the endpoint function returns.

-    return StreamingResponse(
-        stream_results(crawler, results_gen),
-        media_type='application/x-ndjson',
-        headers={'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'X-Stream-Status': 'active'}
-    )
+        # --- Option A: Acquire crawler, pass to handler, handler yields ---
+        # (Requires the handler NOT to be an async generator itself, but to return one)
+        # async with manager.get_crawler() as active_crawler:
+        #     # Handler returns the generator
+        #     _, results_gen = await handle_stream_crawl_request(
+        #         crawler=active_crawler,
+        #         urls=crawl_request.urls,
+        #         browser_config=crawl_request.browser_config or {},
+        #         crawler_config=crawl_request.crawler_config or {},
+        #         config=config
+        #     )
+        #     # PROBLEM: `active_crawler` context exits before StreamingResponse uses results_gen
+        #     # This releases the semaphore too early.
+
+        # --- Option B: Pass manager to handler, handler uses context internally ---
+        # (Requires modifying handle_stream_crawl_request signature/logic)
+        # This seems cleaner. Let's assume api.py is adapted for this.
+        # We need a way for the generator yielded by stream_results to know when
+        # to release the semaphore.
+
+        # --- Option C: Create a wrapper generator that handles context ---
+        async def stream_wrapper(manager: CrawlerManager, crawl_request: CrawlRequest, config: dict) -> AsyncGenerator[bytes, None]:
+            active_crawler = None
+            try:
+                async with manager.get_crawler() as acquired_crawler:
+                    active_crawler = acquired_crawler  # Keep reference for cleanup
+                    # Call the handler which returns the raw result generator
+                    _crawler_ref, results_gen = await handle_stream_crawl_request(
+                        crawler=acquired_crawler,
+                        urls=crawl_request.urls,
+                        browser_config=crawl_request.browser_config or {},
+                        crawler_config=crawl_request.crawler_config or {},
+                        config=config
+                    )
+                    # Use the stream_results utility to format and yield
+                    async for data_bytes in stream_results(_crawler_ref, results_gen):
+                        yield data_bytes
+            except (PoolTimeoutError, NoHealthyCrawlerError) as e:
+                # Yield a final error message in the stream
+                error_payload = {"status": "error", "detail": str(e)}
+                yield (json.dumps(error_payload) + "\n").encode('utf-8')
+                logger.warning(f"Stream request failed: {e}")
+                # Re-raise might be better if StreamingResponse handles it? Test needed.
+            except HTTPException as e:  # Catch HTTP exceptions from handler setup
+                error_payload = {"status": "error",
+                                 "detail": e.detail, "status_code": e.status_code}
+                yield (json.dumps(error_payload) + "\n").encode('utf-8')
+                logger.warning(
+                    f"Stream request failed with HTTPException: {e.detail}")
+            except Exception as e:
+                error_payload = {"status": "error",
+                                 "detail": f"Unexpected stream error: {e}"}
+                yield (json.dumps(error_payload) + "\n").encode('utf-8')
+                logger.error(
+                    f"Unexpected error during stream processing: {e}", exc_info=True)
+            # finally:
+                # Ensure crawler cleanup if stream_results doesn't handle it?
+                # stream_results *should* call crawler.close(), but only on the
+                # instance it received. If we pass the *manager* instead, this gets complex.
+                # Let's stick to passing the acquired_crawler and rely on stream_results.
+
+        # Create the generator using the wrapper
+        streaming_generator = stream_wrapper(manager, crawl_request, config)
+
+        return StreamingResponse(
+            streaming_generator,  # Use the wrapper
+            media_type='application/x-ndjson',
+            headers={'Cache-Control': 'no-cache',
+                     'Connection': 'keep-alive', 'X-Stream-Status': 'active'}
+        )
+
+    except (PoolTimeoutError, NoHealthyCrawlerError) as e:
+        # NOTE(review): get_crawler is only called inside stream_wrapper, whose body runs
+        # lazily while the response streams, so this handler looks unreachable — confirm.
+        logger.warning(f"Stream request rejected before starting: {e}")
+        status_code = status.HTTP_503_SERVICE_UNAVAILABLE  # Or 429 for timeout
+        # Don't raise HTTPException here, let the wrapper yield the error message.
+        # If we want to return a non-200 initial status, need more complex handling.
+        # Return an *empty* stream with error headers? Or just let wrapper yield error.
+
+        async def _error_stream(e):
+            error_payload = {"status": "error", "detail": str(e)}
+            yield (json.dumps(error_payload) + "\n").encode('utf-8')
+        return StreamingResponse(_error_stream(e), status_code=status_code, media_type='application/x-ndjson')
+
+    except HTTPException:  # Re-raise HTTP exceptions from setup
+        raise
+    except Exception as e:
+        logger.error(
+            f"Unexpected error setting up stream crawl: {e}", exc_info=True)
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"An unexpected error occurred setting up the stream: {e}"
+        )

 if __name__ == "__main__":
    import uvicorn
@@ -173,4 +399,4 @@ if __name__ == "__main__":
        port=config["app"]["port"],
        reload=config["app"]["reload"],
        timeout_keep_alive=config["app"]["timeout_keep_alive"]
-    )
+    )