feat(api): add seed URL endpoint and related request model
This commit is contained in:
1
.yoyo/snapshot
Submodule
1
.yoyo/snapshot
Submodule
Submodule .yoyo/snapshot added at bf72603f11
@@ -14,7 +14,6 @@ from fastapi import HTTPException, Request, status
|
|||||||
from fastapi.background import BackgroundTasks
|
from fastapi.background import BackgroundTasks
|
||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse
|
||||||
from redis import asyncio as aioredis
|
from redis import asyncio as aioredis
|
||||||
|
|
||||||
from crawl4ai import (
|
from crawl4ai import (
|
||||||
AsyncWebCrawler,
|
AsyncWebCrawler,
|
||||||
CrawlerRunConfig,
|
CrawlerRunConfig,
|
||||||
@@ -23,7 +22,9 @@ from crawl4ai import (
|
|||||||
BrowserConfig,
|
BrowserConfig,
|
||||||
MemoryAdaptiveDispatcher,
|
MemoryAdaptiveDispatcher,
|
||||||
RateLimiter,
|
RateLimiter,
|
||||||
LLMConfig
|
LLMConfig,
|
||||||
|
AsyncUrlSeeder,
|
||||||
|
SeedingConfig
|
||||||
)
|
)
|
||||||
from crawl4ai.utils import perform_completion_with_backoff
|
from crawl4ai.utils import perform_completion_with_backoff
|
||||||
from crawl4ai.content_filter_strategy import (
|
from crawl4ai.content_filter_strategy import (
|
||||||
@@ -716,4 +717,22 @@ async def handle_crawl_job(
|
|||||||
})
|
})
|
||||||
|
|
||||||
background_tasks.add_task(_runner)
|
background_tasks.add_task(_runner)
|
||||||
return {"task_id": task_id}
|
return {"task_id": task_id}
|
||||||
|
|
||||||
|
async def handle_seed(url, cfg):
    """Discover crawlable URLs for *url* using crawl4ai's AsyncUrlSeeder.

    Args:
        url: Target URL (or domain) forwarded to the seeder.
        cfg: Mapping of keyword arguments used to build a SeedingConfig.

    Returns:
        A list of discovered URLs; an empty list when seeding fails or
        nothing is found, so callers always receive a list.
    """
    try:
        # Build the seeding configuration from the request body.
        config = SeedingConfig(**cfg)

        # Async context manager guarantees the seeder is closed on exit.
        async with AsyncUrlSeeder() as seeder:
            # NOTE(review): the original comment says `urls` expects a bare
            # domain, yet the full URL is passed here — confirm upstream
            # whether `urlparse(url).netloc` should be used instead.
            return await seeder.urls(url, config)
    except Exception:
        # BUG FIX: the error path previously returned a dict
        # ({"seeded_urls": [], "count": 0, "message": ...}) while the success
        # path returns a list; the caller computes len(result) and serializes
        # it as the URL list, so a dict yielded a bogus count of 3. Return an
        # empty list to keep the return type consistent (best-effort intent
        # of the original "count": 0 is preserved).
        return []
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Optional, Dict
|
from typing import List, Optional, Dict, Any
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from utils import FilterType
|
from utils import FilterType
|
||||||
@@ -85,4 +85,10 @@ class JSEndpointRequest(BaseModel):
|
|||||||
scripts: List[str] = Field(
|
scripts: List[str] = Field(
|
||||||
...,
|
...,
|
||||||
description="List of separated JavaScript snippets to execute"
|
description="List of separated JavaScript snippets to execute"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class SeedRequest(BaseModel):
    """Request model for the /seed URL-seeding endpoint.

    Carries the target URL plus a free-form configuration mapping that the
    handler forwards as keyword arguments to crawl4ai's SeedingConfig.
    """
    # Full URL whose domain is to be seeded (e.g. "https://docs.crawl4ai.com").
    url: str = Field(..., example="https://docs.crawl4ai.com")
    # SeedingConfig keyword arguments; an empty dict means all defaults.
    config: Dict[str, Any] = Field(default_factory=dict)
|
||||||
@@ -11,16 +11,22 @@ from crawler_pool import get_crawler, close_all, janitor
|
|||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
from auth import create_access_token, get_token_dependency, TokenRequest
|
from auth import create_access_token, get_token_dependency, TokenRequest
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing import Optional, List, Dict
|
from typing import Optional, List, Dict, Any
|
||||||
from fastapi import Request, Depends
|
from fastapi import Request, Depends
|
||||||
from fastapi.responses import FileResponse
|
from fastapi.responses import FileResponse
|
||||||
import base64
|
import base64
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import asyncio
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urlparse
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
from api import (
|
from api import (
|
||||||
handle_markdown_request, handle_llm_qa,
|
handle_markdown_request, handle_llm_qa,
|
||||||
handle_stream_crawl_request, handle_crawl_request,
|
handle_stream_crawl_request, handle_crawl_request,
|
||||||
stream_results
|
stream_results, handle_seed
|
||||||
)
|
)
|
||||||
from schemas import (
|
from schemas import (
|
||||||
CrawlRequestWithHooks,
|
CrawlRequestWithHooks,
|
||||||
@@ -30,6 +36,7 @@ from schemas import (
|
|||||||
ScreenshotRequest,
|
ScreenshotRequest,
|
||||||
PDFRequest,
|
PDFRequest,
|
||||||
JSEndpointRequest,
|
JSEndpointRequest,
|
||||||
|
SeedRequest,
|
||||||
)
|
)
|
||||||
|
|
||||||
from utils import (
|
from utils import (
|
||||||
@@ -229,6 +236,30 @@ async def config_dump(raw: RawCode):
|
|||||||
raise HTTPException(400, str(e))
|
raise HTTPException(400, str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/seed")
async def seed_url(request: SeedRequest):
    """
    Seed a domain for crawling based on a URL.

    • Validates that a domain can be extracted from the provided URL
    • Generates crawlable URLs via handle_seed (AsyncUrlSeeder)
    • Returns the seeded URLs and their count as JSON
    """
    try:
        # Validate the URL by extracting its domain (e.g. "docs.crawl4ai.com").
        domain = urlparse(request.url).netloc
        if not domain:
            raise HTTPException(
                status_code=400,
                detail="Invalid URL provided. Could not extract domain.",
            )
        # NOTE(review): handle_seed receives the full URL even though its own
        # comment says the seeder expects a domain — confirm whether `domain`
        # should be passed here instead.
        res = await handle_seed(request.url, request.config)
        return JSONResponse({"seed_url": res, "count": len(res)})
    except HTTPException:
        # BUG FIX: the 400 raised above was previously swallowed by the
        # generic handler below and re-surfaced as a 500. Re-raise so the
        # original status code reaches the client.
        raise
    except Exception as e:
        # Unexpected failures become a 500 with the error message attached.
        print(f"❌ Error in seed_url: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
@app.post("/md")
|
@app.post("/md")
|
||||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||||
@mcp_tool("md")
|
@mcp_tool("md")
|
||||||
|
|||||||
Reference in New Issue
Block a user