feat(api): add seed URL endpoint and related request model
This commit is contained in:
1
.yoyo/snapshot
Submodule
1
.yoyo/snapshot
Submodule
Submodule .yoyo/snapshot added at bf72603f11
@@ -14,7 +14,6 @@ from fastapi import HTTPException, Request, status
|
||||
from fastapi.background import BackgroundTasks
|
||||
from fastapi.responses import JSONResponse
|
||||
from redis import asyncio as aioredis
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
@@ -23,7 +22,9 @@ from crawl4ai import (
|
||||
BrowserConfig,
|
||||
MemoryAdaptiveDispatcher,
|
||||
RateLimiter,
|
||||
LLMConfig
|
||||
LLMConfig,
|
||||
AsyncUrlSeeder,
|
||||
SeedingConfig
|
||||
)
|
||||
from crawl4ai.utils import perform_completion_with_backoff
|
||||
from crawl4ai.content_filter_strategy import (
|
||||
@@ -716,4 +717,22 @@ async def handle_crawl_job(
|
||||
})
|
||||
|
||||
background_tasks.add_task(_runner)
|
||||
return {"task_id": task_id}
|
||||
return {"task_id": task_id}
|
||||
|
||||
async def handle_seed(url, cfg):
    """Discover crawlable URLs for *url* via crawl4ai's AsyncUrlSeeder.

    Args:
        url: Domain (or URL) forwarded to ``AsyncUrlSeeder.urls``. The
            seeder expects a domain; the HTTP endpoint validates that a
            domain can be extracted before calling this helper.
        cfg: Mapping of keyword arguments used to build a ``SeedingConfig``.

    Returns:
        A list of seeded URL records on success, or an empty list when
        seeding fails for any reason (bad config keys, network errors, ...).
    """
    try:
        # Build the seeding configuration from the request body.
        config = SeedingConfig(**cfg)

        # Async context manager guarantees the seeder is cleaned up.
        async with AsyncUrlSeeder() as seeder:
            return await seeder.urls(url, config)
    except Exception as e:
        # Bug fix: the previous version returned a *dict* here while the
        # success path returns a *list*, so the caller's len(res) counted
        # dict keys as a URL count. Return a uniform empty list instead,
        # and surface the actual error instead of discarding it.
        print(f"❌ Error in handle_seed: {e}")
        return []
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import List, Optional, Dict
|
||||
from typing import List, Optional, Dict, Any
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel, Field
|
||||
from utils import FilterType
|
||||
@@ -85,4 +85,10 @@ class JSEndpointRequest(BaseModel):
|
||||
scripts: List[str] = Field(
|
||||
...,
|
||||
description="List of separated JavaScript snippets to execute"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class SeedRequest(BaseModel):
    """Request model for the URL seeding endpoint (POST /seed)."""

    # Full URL whose domain will be seeded for crawlable URLs.
    url: str = Field(..., example="https://docs.crawl4ai.com")
    # Keyword arguments forwarded verbatim to crawl4ai's SeedingConfig;
    # defaults to an empty dict (seeder defaults apply).
    config: Dict[str, Any] = Field(default_factory=dict)
||||
@@ -11,16 +11,22 @@ from crawler_pool import get_crawler, close_all, janitor
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from auth import create_access_token, get_token_dependency, TokenRequest
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, List, Dict
|
||||
from typing import Optional, List, Dict, Any
|
||||
from fastapi import Request, Depends
|
||||
from fastapi.responses import FileResponse
|
||||
import base64
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from api import (
|
||||
handle_markdown_request, handle_llm_qa,
|
||||
handle_stream_crawl_request, handle_crawl_request,
|
||||
stream_results
|
||||
stream_results, handle_seed
|
||||
)
|
||||
from schemas import (
|
||||
CrawlRequestWithHooks,
|
||||
@@ -30,6 +36,7 @@ from schemas import (
|
||||
ScreenshotRequest,
|
||||
PDFRequest,
|
||||
JSEndpointRequest,
|
||||
SeedRequest,
|
||||
)
|
||||
|
||||
from utils import (
|
||||
@@ -229,6 +236,30 @@ async def config_dump(raw: RawCode):
|
||||
raise HTTPException(400, str(e))
|
||||
|
||||
|
||||
@app.post("/seed")
async def seed_url(request: SeedRequest):
    """
    Seed a domain for crawling based on a URL.

    • Extract domain from provided URL
    • Generate crawlable URLs using AsyncUrlSeeder
    • Return list of seeded URLs for testing

    Raises:
        HTTPException 400: the URL has no extractable domain.
        HTTPException 500: any unexpected failure during seeding.
    """
    try:
        # Extract the domain (e.g., "docs.crawl4ai.com") from the full URL.
        domain = urlparse(request.url).netloc
        if not domain:
            raise HTTPException(
                status_code=400,
                detail="Invalid URL provided. Could not extract domain.",
            )
        res = await handle_seed(request.url, request.config)
        return JSONResponse({"seed_url": res, "count": len(res)})
    except HTTPException:
        # Bug fix: previously the 400 raised above was caught by the
        # generic handler below and converted into a 500. Let FastAPI's
        # HTTPException propagate with its original status code.
        raise
    except Exception as e:
        print(f"❌ Error in seed_url: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/md")
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
@mcp_tool("md")
|
||||
|
||||
Reference in New Issue
Block a user