feat(api): add seed URL endpoint and related request model

This commit is contained in:
AHMET YILMAZ
2025-09-30 13:35:08 +08:00
parent 3fe49a766c
commit 1ea021b721
4 changed files with 64 additions and 7 deletions

1
.yoyo/snapshot Submodule

Submodule .yoyo/snapshot added at bf72603f11

View File

@@ -14,7 +14,6 @@ from fastapi import HTTPException, Request, status
from fastapi.background import BackgroundTasks
from fastapi.responses import JSONResponse
from redis import asyncio as aioredis
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
@@ -23,7 +22,9 @@ from crawl4ai import (
BrowserConfig,
MemoryAdaptiveDispatcher,
RateLimiter,
LLMConfig
LLMConfig,
AsyncUrlSeeder,
SeedingConfig
)
from crawl4ai.utils import perform_completion_with_backoff
from crawl4ai.content_filter_strategy import (
@@ -716,4 +717,22 @@ async def handle_crawl_job(
})
background_tasks.add_task(_runner)
return {"task_id": task_id}
return {"task_id": task_id}
async def handle_seed(url, cfg):
    """Discover crawlable URLs for a site using crawl4ai's AsyncUrlSeeder.

    Args:
        url: The URL/domain to seed from (passed straight to the seeder).
        cfg: Mapping of keyword arguments used to build a SeedingConfig.

    Returns:
        A list of discovered URL entries. On any failure an empty list is
        returned instead of a dict, so callers can always take len() of the
        result (the previous error-dict return broke the caller's count).
    """
    try:
        config = SeedingConfig(**cfg)
        # The seeder manages its own network resources; the async context
        # manager guarantees they are released even if seeding raises.
        async with AsyncUrlSeeder() as seeder:
            # NOTE(review): upstream docs say 'urls' expects a domain, not a
            # full URL — the caller currently passes the full URL; confirm.
            return await seeder.urls(url, config)
    except Exception as exc:
        # Best-effort endpoint: log the real cause instead of silently
        # replacing it with a generic "no URLs found" payload.
        import logging
        logging.getLogger(__name__).error("URL seeding failed for %s: %s", url, exc)
        return []

View File

@@ -1,4 +1,4 @@
from typing import List, Optional, Dict
from typing import List, Optional, Dict, Any
from enum import Enum
from pydantic import BaseModel, Field
from utils import FilterType
@@ -85,4 +85,10 @@ class JSEndpointRequest(BaseModel):
scripts: List[str] = Field(
...,
description="List of separated JavaScript snippets to execute"
)
)
class SeedRequest(BaseModel):
    """Request model for URL seeding endpoint."""
    # Full URL whose domain will be seeded for crawlable URLs.
    url: str = Field(..., example="https://docs.crawl4ai.com")
    # Keyword arguments for the seeder configuration; defaults to {} so the
    # field is optional in the request body.
    config: Dict[str, Any] = Field(default_factory=dict)

View File

@@ -11,16 +11,22 @@ from crawler_pool import get_crawler, close_all, janitor
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from auth import create_access_token, get_token_dependency, TokenRequest
from pydantic import BaseModel
from typing import Optional, List, Dict
from typing import Optional, List, Dict, Any
from fastapi import Request, Depends
from fastapi.responses import FileResponse
import base64
import re
import os
import sys
import asyncio
from contextlib import asynccontextmanager
from pathlib import Path
from urllib.parse import urlparse
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from api import (
handle_markdown_request, handle_llm_qa,
handle_stream_crawl_request, handle_crawl_request,
stream_results
stream_results, handle_seed
)
from schemas import (
CrawlRequestWithHooks,
@@ -30,6 +36,7 @@ from schemas import (
ScreenshotRequest,
PDFRequest,
JSEndpointRequest,
SeedRequest,
)
from utils import (
@@ -229,6 +236,30 @@ async def config_dump(raw: RawCode):
raise HTTPException(400, str(e))
@app.post("/seed")
async def seed_url(request: SeedRequest):
    """
    Seed a domain for crawling based on a URL.

    • Extract domain from provided URL
    • Generate crawlable URLs using AsyncUrlSeeder (via handle_seed)
    • Return list of seeded URLs plus a count

    Raises:
        HTTPException(400): when no domain can be extracted from the URL.
        HTTPException(500): on unexpected seeding failures.
    """
    # Validate outside the try block: previously the broad `except Exception`
    # caught this HTTPException(400) and re-raised it as a 500.
    domain = urlparse(request.url).netloc
    if not domain:
        raise HTTPException(
            status_code=400,
            detail="Invalid URL provided. Could not extract domain.",
        )
    try:
        res = await handle_seed(request.url, request.config)
    except Exception as e:
        print(f"❌ Error in seed_url: {e}")
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
    return JSONResponse({"seed_url": res, "count": len(res)})
@app.post("/md")
@limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("md")