Add link analysis tests and integration tests for /links/analyze endpoint

- Implemented `test_link_analysis` in `test_docker.py` to validate link analysis functionality.
- Created `test_link_analysis.py` with comprehensive tests for link analysis, including basic functionality, configuration options, error handling, performance, and edge cases.
- Added integration tests in `test_link_analysis_integration.py` to verify the /links/analyze endpoint, including health checks, authentication, and error handling.
2025-10-14 19:24:16 +08:00
parent 8cca9704eb
commit aebf5a3694
7 changed files with 1926 additions and 0 deletions
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -7,9 +7,37 @@ Crawl4AI FastAPI entry‑point
 """

 # ── stdlib & 3rd‑party imports ───────────────────────────────
+from crawler_pool import get_crawler, close_all, janitor
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LinkPreviewConfig
+from auth import create_access_token, get_token_dependency, TokenRequest
+from pydantic import BaseModel
+from typing import Optional, List, Dict
+from fastapi import Request, Depends
+from fastapi.responses import FileResponse
 import ast
 import asyncio
 import base64
+import re
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LinkPreviewConfig
+from api import (
+    handle_markdown_request, handle_llm_qa,
+    handle_stream_crawl_request, handle_crawl_request,
+    stream_results
+)
+from schemas import (
+    CrawlRequestWithHooks,
+    MarkdownRequest,
+    RawCode,
+    HTMLRequest,
+    ScreenshotRequest,
+    PDFRequest,
+    JSEndpointRequest,
+    LinkAnalysisRequest,
+)
+
+from utils import (
+    FilterType, load_config, setup_logging, verify_email_domain
+)
 import os
 import pathlib
 import re
@@ -1045,6 +1073,57 @@ async def execute_js(
        raise HTTPException(status_code=500, detail=str(e))


+@app.post("/links/analyze")
+@limiter.limit(config["rate_limiting"]["default_limit"])
+@mcp_tool("links_analyze")
+async def analyze_links(
+    request: Request,
+    body: LinkAnalysisRequest,
+    _td: Dict = Depends(token_dep),
+):
+    """
+    Analyze and score links on a webpage.
+    Returns a dictionary of links with their scores and metadata.
+    """
+    try:
+        # Create AsyncWebCrawler instance
+        async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
+            # Deserialize config dict to LinkPreviewConfig, use default if not provided
+            link_preview_config = LinkPreviewConfig.from_dict(body.config) if body.config else LinkPreviewConfig()
+
+            # Create CrawlerRunConfig with link analysis settings
+            run_config = CrawlerRunConfig(
+                link_preview_config=link_preview_config,
+                score_links=True,
+                screenshot=False,
+                pdf=False,
+                extraction_strategy=None
+            )
+
+            # Execute the crawl
+            result = await crawler.arun(url=body.url, config=run_config)
+
+            # Check if crawl was successful
+            if not result.success:
+                raise HTTPException(
+                    status_code=500,
+                    detail=result.error_message or "Crawl failed"
+                )
+
+            # Extract and return the links dictionary
+            return JSONResponse(result.links)
+
+    except HTTPException:
+        # Re-raise HTTP exceptions
+        raise
+    except Exception as e:
+        # Handle any other exceptions
+        raise HTTPException(
+            status_code=500,
+            detail=f"Internal server error: {str(e)}"
+        )
+
+
@app.get("/llm/{url:path}",
    summary="LLM Q&A",
    description="Ask questions about a webpage using LLM.",