feat(api): add seed URL endpoint and related request model

This commit is contained in:
AHMET YILMAZ
2025-09-30 13:35:08 +08:00
parent 3fe49a766c
commit 1ea021b721
4 changed files with 64 additions and 7 deletions

1
.yoyo/snapshot Submodule

Submodule .yoyo/snapshot added at bf72603f11

View File

@@ -14,7 +14,6 @@ from fastapi import HTTPException, Request, status
from fastapi.background import BackgroundTasks
from fastapi.responses import JSONResponse
from redis import asyncio as aioredis
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
@@ -23,7 +22,9 @@ from crawl4ai import (
BrowserConfig,
MemoryAdaptiveDispatcher,
RateLimiter,
LLMConfig
LLMConfig,
AsyncUrlSeeder,
SeedingConfig
)
from crawl4ai.utils import perform_completion_with_backoff
from crawl4ai.content_filter_strategy import (
@@ -716,4 +717,22 @@ async def handle_crawl_job(
})
background_tasks.add_task(_runner)
return {"task_id": task_id}
return {"task_id": task_id}
async def handle_seed(url, cfg):
    """Discover crawlable URLs for a site using crawl4ai's AsyncUrlSeeder.

    Args:
        url: The URL/domain to seed from (passed straight to the seeder).
        cfg: Mapping of keyword arguments used to build a SeedingConfig.

    Returns:
        A list of discovered URL entries. On any failure an empty list is
        returned instead of a dict, so callers can always take len() of the
        result (the previous error-dict return broke the caller's count).
    """
    try:
        config = SeedingConfig(**cfg)
        # The seeder manages its own network resources; the async context
        # manager guarantees they are released even if seeding raises.
        async with AsyncUrlSeeder() as seeder:
            # NOTE(review): upstream docs say 'urls' expects a domain, not a
            # full URL — the caller currently passes the full URL; confirm.
            return await seeder.urls(url, config)
    except Exception as exc:
        # Best-effort endpoint: log the real cause instead of silently
        # replacing it with a generic "no URLs found" payload.
        import logging
        logging.getLogger(__name__).error("URL seeding failed for %s: %s", url, exc)
        return []

View File

@@ -1,4 +1,4 @@
from typing import List, Optional, Dict
from typing import List, Optional, Dict, Any
from enum import Enum
from pydantic import BaseModel, Field
from utils import FilterType
@@ -85,4 +85,10 @@ class JSEndpointRequest(BaseModel):
scripts: List[str] = Field(
...,
description="List of separated JavaScript snippets to execute"
)
)
class SeedRequest(BaseModel):
    """Request model for URL seeding endpoint."""
    # Full URL whose domain will be seeded for crawlable URLs.
    url: str = Field(..., example="https://docs.crawl4ai.com")
    # Keyword arguments for the seeder configuration; defaults to {} so the
    # field is optional in the request body.
    config: Dict[str, Any] = Field(default_factory=dict)

View File

@@ -11,16 +11,22 @@ from crawler_pool import get_crawler, close_all, janitor
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from auth import create_access_token, get_token_dependency, TokenRequest
from pydantic import BaseModel
from typing import Optional, List, Dict
from typing import Optional, List, Dict, Any
from fastapi import Request, Depends
from fastapi.responses import FileResponse
import base64
import re
import os
import sys
import asyncio
from contextlib import asynccontextmanager
from pathlib import Path
from urllib.parse import urlparse
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from api import (
handle_markdown_request, handle_llm_qa,
handle_stream_crawl_request, handle_crawl_request,
stream_results
stream_results, handle_seed
)
from schemas import (
CrawlRequestWithHooks,
@@ -30,6 +36,7 @@ from schemas import (
ScreenshotRequest,
PDFRequest,
JSEndpointRequest,
SeedRequest,
)
from utils import (
@@ -229,6 +236,30 @@ async def config_dump(raw: RawCode):
raise HTTPException(400, str(e))
@app.post("/seed")
async def seed_url(request: SeedRequest):
    """
    Seed a domain for crawling based on a URL.

    • Extract domain from provided URL
    • Generate crawlable URLs using AsyncUrlSeeder (via handle_seed)
    • Return list of seeded URLs plus a count

    Raises:
        HTTPException(400): when no domain can be extracted from the URL.
        HTTPException(500): on unexpected seeding failures.
    """
    # Validate outside the try block: previously the broad `except Exception`
    # caught this HTTPException(400) and re-raised it as a 500.
    domain = urlparse(request.url).netloc
    if not domain:
        raise HTTPException(
            status_code=400,
            detail="Invalid URL provided. Could not extract domain.",
        )
    try:
        res = await handle_seed(request.url, request.config)
    except Exception as e:
        print(f"❌ Error in seed_url: {e}")
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
    return JSONResponse({"seed_url": res, "count": len(res)})
@app.post("/md")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("md")