Add comprehensive tests for anti-bot strategies and extended features

- Implemented `test_adapter_verification.py` to verify correct usage of browser adapters.
- Created `test_all_features.py` for a comprehensive suite covering URL seeding, adaptive crawling, browser adapters, proxy rotation, and dispatchers.
- Developed `test_anti_bot_strategy.py` to validate the functionality of various anti-bot strategies.
- Added `test_antibot_simple.py` for simple testing of anti-bot strategies using async web crawling.
- Introduced `test_bot_detection.py` to assess adapter performance against bot detection mechanisms.
- Compiled `test_final_summary.py` to provide a detailed summary of all tests and their results.
AHMET YILMAZ
2025-10-07 18:51:13 +08:00
parent f00e8cbf35
commit 201843a204
23 changed files with 5265 additions and 96 deletions
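As a rough illustration of the kind of end-to-end check these new suites perform, here is a minimal sketch that exercises the adaptive job endpoints documented in the diff below. The base URL, environment variable, and test name are assumptions for illustration, not code from this commit.

```python
# Hypothetical end-to-end check against the adaptive job API documented below.
# Assumes a local server on port 11235 and a token in CRAWL4AI_TOKEN.
import os
import time

import requests

BASE_URL = "http://localhost:11235"
HEADERS = {"Authorization": f"Bearer {os.environ.get('CRAWL4AI_TOKEN', '')}"}


def test_adaptive_job_roundtrip():
    # Submit a job; the endpoint returns 202 with a task_id immediately
    submit = requests.post(
        f"{BASE_URL}/adaptive/digest/job",
        headers=HEADERS,
        json={"start_url": "https://example.com", "query": "Find all API documentation"},
    )
    assert submit.status_code == 202
    task_id = submit.json()["task_id"]

    # Poll until the job reaches a terminal state
    for _ in range(150):
        job = requests.get(f"{BASE_URL}/adaptive/digest/job/{task_id}", headers=HEADERS).json()
        if job["status"] in ("COMPLETED", "FAILED"):
            break
        time.sleep(2)

    assert job["status"] == "COMPLETED"
    assert job["result"] is not None
```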


@@ -71,16 +71,86 @@ async def run_adaptive_digest(task_id: str, request: AdaptiveCrawlRequest):
# --- API Endpoints ---
@router.post("/job", response_model=AdaptiveJobStatus, status_code=202)
@router.post("/job",
summary="Submit Adaptive Crawl Job",
description="Start a long-running adaptive crawling job that intelligently discovers relevant content.",
response_description="Job ID for status polling",
response_model=AdaptiveJobStatus,
status_code=202
)
async def submit_adaptive_digest_job(
request: AdaptiveCrawlRequest,
background_tasks: BackgroundTasks,
):
"""
Submit a new adaptive crawling job.
This endpoint starts an intelligent, long-running crawl that automatically
discovers and extracts relevant content based on your query. Returns
immediately with a task ID for polling.
**Request Body:**
```json
{
"start_url": "https://example.com",
"query": "Find all product documentation",
"config": {
"max_depth": 3,
"max_pages": 50,
"confidence_threshold": 0.7,
"timeout": 300
}
}
```
**Parameters:**
- `start_url`: Starting URL for the crawl
- `query`: Natural language query describing what to find
- `config`: Optional adaptive configuration (max_depth, max_pages, etc.)
**Response:**
```json
{
"task_id": "550e8400-e29b-41d4-a716-446655440000",
"status": "PENDING",
"metrics": null,
"result": null,
"error": null
}
```
**Usage:**
```python
import requests
import time

# Submit job
response = requests.post(
"http://localhost:11235/adaptive/digest/job",
headers={"Authorization": f"Bearer {token}"},
json={
"start_url": "https://example.com",
"query": "Find all API documentation"
}
)
task_id = response.json()["task_id"]
# Poll for results
while True:
status_response = requests.get(
f"http://localhost:11235/adaptive/digest/job/{task_id}",
headers={"Authorization": f"Bearer {token}"}
)
status = status_response.json()
if status["status"] in ["COMPLETED", "FAILED"]:
print(status["result"])
break
time.sleep(2)
```
**Notes:**
- Job runs in background, returns immediately
- Use task_id to poll status with GET /adaptive/digest/job/{task_id}
- Adaptive crawler intelligently follows links based on relevance
- Automatically stops when sufficient content is found
- Returns HTTP 202 Accepted
"""
print("Received adaptive crawl request:", request)
@@ -101,13 +171,93 @@ async def submit_adaptive_digest_job(
return ADAPTIVE_JOBS[task_id]
@router.get("/job/{task_id}", response_model=AdaptiveJobStatus)
@router.get("/job/{task_id}",
summary="Get Adaptive Job Status",
description="Poll the status and results of an adaptive crawling job.",
response_description="Job status, metrics, and results",
response_model=AdaptiveJobStatus
)
async def get_adaptive_digest_status(task_id: str):
"""
Get the status and result of an adaptive crawling job.
Poll this endpoint with the task_id returned from the submission endpoint
until the status is 'COMPLETED' or 'FAILED'.
**Parameters:**
- `task_id`: Job ID from POST /adaptive/digest/job
**Response (Running):**
```json
{
"task_id": "550e8400-e29b-41d4-a716-446655440000",
"status": "RUNNING",
"metrics": {
"confidence": 0.45,
"pages_crawled": 15,
"relevant_pages": 8
},
"result": null,
"error": null
}
```
**Response (Completed):**
```json
{
"task_id": "550e8400-e29b-41d4-a716-446655440000",
"status": "COMPLETED",
"metrics": {
"confidence": 0.85,
"pages_crawled": 42,
"relevant_pages": 28
},
"result": {
"confidence": 0.85,
"is_sufficient": true,
"coverage_stats": {...},
"relevant_content": [...]
},
"error": null
}
```
**Status Values:**
- `PENDING`: Job queued, not started yet
- `RUNNING`: Job actively crawling
- `COMPLETED`: Job finished successfully
- `FAILED`: Job encountered an error
**Usage:**
```python
import requests
import time
# Poll until complete
while True:
response = requests.get(
f"http://localhost:11235/adaptive/digest/job/{task_id}",
headers={"Authorization": f"Bearer {token}"}
)
job = response.json()
print(f"Status: {job['status']}")
if job['status'] == 'RUNNING':
print(f"Progress: {job['metrics']['pages_crawled']} pages")
elif job['status'] == 'COMPLETED':
print(f"Found {len(job['result']['relevant_content'])} relevant items")
break
elif job['status'] == 'FAILED':
print(f"Error: {job['error']}")
break
time.sleep(2)
```
**Notes:**
- Poll every 1-5 seconds
- Metrics updated in real-time while running
- Returns 404 if task_id not found
- Results include top relevant content and statistics
"""
job = ADAPTIVE_JOBS.get(task_id)
if not job:


@@ -0,0 +1,259 @@
"""
Router for dispatcher management endpoints.
Provides endpoints to:
- List available dispatchers
- Get default dispatcher info
- Get dispatcher statistics
"""
import logging
from typing import Dict, List
from fastapi import APIRouter, HTTPException, Request
from schemas import DispatcherInfo, DispatcherStatsResponse, DispatcherType
from utils import get_available_dispatchers, get_dispatcher_config
logger = logging.getLogger(__name__)
# --- APIRouter for Dispatcher Endpoints ---
router = APIRouter(
prefix="/dispatchers",
tags=["Dispatchers"],
)
@router.get("",
summary="List Dispatchers",
description="Get information about all available dispatcher types.",
response_description="List of dispatcher configurations and features",
response_model=List[DispatcherInfo]
)
async def list_dispatchers(request: Request):
"""
List all available dispatcher types.
Returns information about each dispatcher type including name, description,
configuration parameters, and key features.
**Dispatchers:**
- `memory_adaptive`: Automatically manages crawler instances based on memory
- `semaphore`: Simple semaphore-based concurrency control
**Response:**
```json
[
{
"type": "memory_adaptive",
"name": "Memory Adaptive Dispatcher",
"description": "Automatically adjusts crawler pool based on memory usage",
"config": {...},
"features": ["Auto-scaling", "Memory monitoring", "Smart throttling"]
},
{
"type": "semaphore",
"name": "Semaphore Dispatcher",
"description": "Simple semaphore-based concurrency control",
"config": {...},
"features": ["Fixed concurrency", "Simple queue"]
}
]
```
**Usage:**
```python
response = requests.get(
"http://localhost:11235/dispatchers",
headers={"Authorization": f"Bearer {token}"}
)
dispatchers = response.json()
for dispatcher in dispatchers:
print(f"{dispatcher['type']}: {dispatcher['description']}")
```
**Notes:**
- Lists all registered dispatcher types
- Shows configuration options for each
- Use with the /crawl endpoint's `dispatcher` parameter (see the sketch after this endpoint)
"""
try:
dispatchers_info = get_available_dispatchers()
result = []
for dispatcher_type, info in dispatchers_info.items():
result.append(
DispatcherInfo(
type=DispatcherType(dispatcher_type),
name=info["name"],
description=info["description"],
config=info["config"],
features=info["features"],
)
)
return result
except Exception as e:
logger.error(f"Error listing dispatchers: {e}")
raise HTTPException(status_code=500, detail=f"Failed to list dispatchers: {str(e)}")
@router.get("/default",
summary="Get Default Dispatcher",
description="Get information about the currently configured default dispatcher.",
response_description="Default dispatcher information",
response_model=Dict
)
async def get_default_dispatcher(request: Request):
"""
Get information about the current default dispatcher.
Returns the dispatcher type, configuration, and status for the default
dispatcher used when no specific dispatcher is requested.
**Response:**
```json
{
"type": "memory_adaptive",
"config": {
"max_memory_percent": 80,
"check_interval": 10,
"min_instances": 1,
"max_instances": 10
},
"active": true
}
```
**Usage:**
```python
response = requests.get(
"http://localhost:11235/dispatchers/default",
headers={"Authorization": f"Bearer {token}"}
)
default_dispatcher = response.json()
print(f"Default: {default_dispatcher['type']}")
```
**Notes:**
- Shows which dispatcher is used by default
- Default can be configured via server settings
- Override with `dispatcher` parameter in /crawl requests
"""
try:
default_type = request.app.state.default_dispatcher_type
dispatcher = request.app.state.dispatchers.get(default_type)
if not dispatcher:
raise HTTPException(
status_code=500,
detail=f"Default dispatcher '{default_type}' not initialized"
)
return {
"type": default_type,
"config": get_dispatcher_config(default_type),
"active": True,
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting default dispatcher: {e}")
raise HTTPException(
status_code=500,
detail=f"Failed to get default dispatcher: {str(e)}"
)
@router.get("/{dispatcher_type}/stats",
summary="Get Dispatcher Statistics",
description="Get runtime statistics for a specific dispatcher.",
response_description="Dispatcher statistics and metrics",
response_model=DispatcherStatsResponse
)
async def get_dispatcher_stats(dispatcher_type: DispatcherType, request: Request):
"""
Get runtime statistics for a specific dispatcher.
Returns active sessions, configuration, and dispatcher-specific metrics.
Useful for monitoring and debugging dispatcher performance.
**Parameters:**
- `dispatcher_type`: Dispatcher type (memory_adaptive, semaphore)
**Response:**
```json
{
"type": "memory_adaptive",
"active_sessions": 3,
"config": {
"max_memory_percent": 80,
"check_interval": 10
},
"stats": {
"current_memory_percent": 45.2,
"active_instances": 3,
"max_instances": 10,
"throttled_count": 0
}
}
```
**Usage:**
```python
response = requests.get(
"http://localhost:11235/dispatchers/memory_adaptive/stats",
headers={"Authorization": f"Bearer {token}"}
)
stats = response.json()
print(f"Active sessions: {stats['active_sessions']}")
print(f"Memory usage: {stats['stats']['current_memory_percent']}%")
```
**Notes:**
- Real-time statistics
- Stats vary by dispatcher type
- Use for monitoring and capacity planning (see the polling example after this endpoint)
- Returns 404 if dispatcher type not found
"""
try:
dispatcher_name = dispatcher_type.value
dispatcher = request.app.state.dispatchers.get(dispatcher_name)
if not dispatcher:
raise HTTPException(
status_code=404,
detail=f"Dispatcher '{dispatcher_name}' not found or not initialized"
)
# Get basic stats
stats = {
"type": dispatcher_type,
"active_sessions": dispatcher.concurrent_sessions,
"config": get_dispatcher_config(dispatcher_name),
"stats": {}
}
# Add dispatcher-specific stats
if dispatcher_name == "memory_adaptive":
stats["stats"] = {
"current_memory_percent": getattr(dispatcher, "current_memory_percent", 0.0),
"memory_pressure_mode": getattr(dispatcher, "memory_pressure_mode", False),
"task_queue_size": dispatcher.task_queue.qsize() if hasattr(dispatcher, "task_queue") else 0,
}
elif dispatcher_name == "semaphore":
# For semaphore dispatcher, show semaphore availability
if hasattr(dispatcher, "semaphore_count"):
stats["stats"] = {
"max_concurrent": dispatcher.semaphore_count,
}
return DispatcherStatsResponse(**stats)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting dispatcher stats for '{dispatcher_type}': {e}")
raise HTTPException(
status_code=500,
detail=f"Failed to get dispatcher stats: {str(e)}"
)


@@ -27,30 +27,148 @@ router = APIRouter(
# --- Background Worker Function ---
@router.post("/validate",
summary="Validate C4A-Script",
description="Validate the syntax of a C4A-Script without compiling it.",
response_description="Validation result with errors if any",
response_model=ValidationResult
)
async def validate_c4a_script_endpoint(payload: C4AScriptPayload):
"""
Validate the syntax of a C4A-Script.
Checks the script syntax without compiling to executable JavaScript.
Returns detailed error information if validation fails.
**Request Body:**
```json
{
"script": "NAVIGATE https://example.com\\nWAIT 2\\nCLICK button.submit"
}
```
**Response (Valid):**
```json
{
"success": true,
"errors": []
}
```
**Response (Invalid):**
```json
{
"success": false,
"errors": [
{
"line": 3,
"message": "Unknown command: CLCK",
"type": "SyntaxError"
}
]
}
```
**Usage:**
```python
response = requests.post(
"http://localhost:11235/c4a/validate",
headers={"Authorization": f"Bearer {token}"},
json={
"script": "NAVIGATE https://example.com\\nWAIT 2"
}
)
result = response.json()
if result["success"]:
print("Script is valid!")
else:
for error in result["errors"]:
print(f"Line {error['line']}: {error['message']}")
```
**Notes:**
- Validates syntax only, doesn't execute
- Returns detailed error locations
- Use before compiling to check for issues
"""
# The validate function is designed not to raise exceptions
validation_result = c4a_validate(payload.script)
return validation_result
@router.post("/compile",
summary="Compile C4A-Script",
description="Compile a C4A-Script into executable JavaScript code.",
response_description="Compiled JavaScript code or compilation errors",
response_model=CompilationResult
)
async def compile_c4a_script_endpoint(payload: C4AScriptPayload):
"""
Compile a C4A-Script into executable JavaScript.
Transforms high-level C4A-Script commands into JavaScript that can be
executed in a browser context.
**Request Body:**
```json
{
"script": "NAVIGATE https://example.com\\nWAIT 2\\nCLICK button.submit"
}
```
**Response (Success):**
```json
{
"success": true,
"javascript": "await page.goto('https://example.com');\\nawait page.waitForTimeout(2000);\\nawait page.click('button.submit');",
"errors": []
}
```
**Response (Error):**
```json
{
"success": false,
"javascript": null,
"errors": [
{
"line": 2,
"message": "Invalid WAIT duration",
"type": "CompilationError"
}
]
}
```
**Usage:**
```python
response = requests.post(
"http://localhost:11235/c4a/compile",
headers={"Authorization": f"Bearer {token}"},
json={
"script": "NAVIGATE https://example.com\\nCLICK .login-button"
}
)
result = response.json()
if result["success"]:
print("Compiled JavaScript:")
print(result["javascript"])
else:
print("Compilation failed:", result["errors"])
```
**C4A-Script Commands:**
- `NAVIGATE <url>` - Navigate to URL
- `WAIT <seconds>` - Wait for specified time
- `CLICK <selector>` - Click element
- `TYPE <selector> <text>` - Type text into element
- `SCROLL <direction>` - Scroll page
- And many more...
**Notes:**
- Returns HTTP 400 if compilation fails
- JavaScript can be used with /execute_js endpoint
- Simplifies browser automation scripting
"""
# The compile function also returns a result object instead of raising
compilation_result = c4a_compile(payload.script)
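Tying the two endpoints together, here is a small validate-then-compile script using the request and response shapes documented above; the base URL and token are placeholders.

```python
# Validate a C4A-Script first, then compile it only if validation passes.
import requests

token = "YOUR_TOKEN"  # placeholder
base = "http://localhost:11235"
headers = {"Authorization": f"Bearer {token}"}
script = "NAVIGATE https://example.com\nWAIT 2\nCLICK button.submit"

check = requests.post(f"{base}/c4a/validate", headers=headers, json={"script": script}).json()
if not check["success"]:
    for error in check["errors"]:
        print(f"Line {error['line']}: {error['message']}")
else:
    compiled = requests.post(f"{base}/c4a/compile", headers=headers, json={"script": script})
    if compiled.status_code == 200:
        print(compiled.json()["javascript"])
    else:
        # Compilation failures are documented to return HTTP 400
        print("Compilation failed:", compiled.json())
```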
@@ -66,25 +184,78 @@ async def compile_c4a_script_endpoint(payload: C4AScriptPayload):
return compilation_result
@router.post("/compile-file",
summary="Compile C4A-Script from File",
description="Compile a C4A-Script from an uploaded file or form string.",
response_description="Compiled JavaScript code or compilation errors",
response_model=CompilationResult
)
async def compile_c4a_script_file_endpoint(
file: Optional[UploadFile] = File(None), script: Optional[str] = Form(None)
):
"""
Compile a C4A-Script from file upload or form data.
Accepts either a file upload or a string parameter. Useful for uploading
C4A-Script files or sending multipart form data.
**Parameters:**
- `file`: C4A-Script file upload (multipart/form-data)
- `script`: C4A-Script content as string (form field)
**Note:** Provide either file OR script, not both.
**Request (File Upload):**
```bash
curl -X POST "http://localhost:11235/c4a/compile-file" \\
-H "Authorization: Bearer YOUR_TOKEN" \\
-F "file=@myscript.c4a"
```
**Request (Form String):**
```bash
curl -X POST "http://localhost:11235/c4a/compile-file" \\
-H "Authorization: Bearer YOUR_TOKEN" \\
-F "script=NAVIGATE https://example.com"
```
**Response:**
```json
{
"success": true,
"javascript": "await page.goto('https://example.com');",
"errors": []
}
```
**Usage (Python with file):**
```python
with open('script.c4a', 'rb') as f:
response = requests.post(
"http://localhost:11235/c4a/compile-file",
headers={"Authorization": f"Bearer {token}"},
files={"file": f}
)
result = response.json()
print(result["javascript"])
```
**Usage (Python with string):**
```python
response = requests.post(
"http://localhost:11235/c4a/compile-file",
headers={"Authorization": f"Bearer {token}"},
data={"script": "NAVIGATE https://example.com"}
)
result = response.json()
print(result["javascript"])
```
**Notes:**
- File must be UTF-8 encoded text
- Use for batch script compilation
- Returns HTTP 400 if both or neither parameter provided
- Returns HTTP 400 if compilation fails
"""
script_content = None
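The visible diff ends before the handler resolves its two optional inputs. A minimal sketch of that resolution logic under the documented rules follows; the helper name is hypothetical and this is not the committed implementation.

```python
# Hypothetical helper mirroring the documented behavior: exactly one of file/script,
# UTF-8 file contents, HTTP 400 otherwise.
from typing import Optional

from fastapi import HTTPException, UploadFile


async def resolve_script_source(file: Optional[UploadFile], script: Optional[str]) -> str:
    if (file is None) == (script is None):
        # Both provided or both missing
        raise HTTPException(
            status_code=400,
            detail="Provide either a file or a script, not both or neither",
        )
    if file is not None:
        raw = await file.read()  # UploadFile.read() returns bytes
        return raw.decode("utf-8")  # docs state the file must be UTF-8 text
    return script
```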