feat(api): add seed URL endpoint and related request model
This commit is contained in:
1
.yoyo/snapshot
Submodule
1
.yoyo/snapshot
Submodule
Submodule .yoyo/snapshot added at bf72603f11
@@ -14,7 +14,6 @@ from fastapi import HTTPException, Request, status
|
|||||||
from fastapi.background import BackgroundTasks
|
from fastapi.background import BackgroundTasks
|
||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse
|
||||||
from redis import asyncio as aioredis
|
from redis import asyncio as aioredis
|
||||||
|
|
||||||
from crawl4ai import (
|
from crawl4ai import (
|
||||||
AsyncWebCrawler,
|
AsyncWebCrawler,
|
||||||
CrawlerRunConfig,
|
CrawlerRunConfig,
|
||||||
@@ -23,7 +22,9 @@ from crawl4ai import (
|
|||||||
BrowserConfig,
|
BrowserConfig,
|
||||||
MemoryAdaptiveDispatcher,
|
MemoryAdaptiveDispatcher,
|
||||||
RateLimiter,
|
RateLimiter,
|
||||||
LLMConfig
|
LLMConfig,
|
||||||
|
AsyncUrlSeeder,
|
||||||
|
SeedingConfig
|
||||||
)
|
)
|
||||||
from crawl4ai.utils import perform_completion_with_backoff
|
from crawl4ai.utils import perform_completion_with_backoff
|
||||||
from crawl4ai.content_filter_strategy import (
|
from crawl4ai.content_filter_strategy import (
|
||||||
@@ -716,4 +717,22 @@ async def handle_crawl_job(
|
|||||||
})
|
})
|
||||||
|
|
||||||
background_tasks.add_task(_runner)
|
background_tasks.add_task(_runner)
|
||||||
return {"task_id": task_id}
|
return {"task_id": task_id}
|
||||||
|
|
||||||
|
async def handle_seed(url, cfg):
    """Discover crawlable URLs for *url* using crawl4ai's AsyncUrlSeeder.

    Args:
        url: Target URL (or domain) forwarded to the seeder.
        cfg: Mapping of keyword arguments used to build a SeedingConfig.

    Returns:
        A list of discovered URLs; an empty list when seeding fails or
        nothing is found, so callers always receive a list.
    """
    try:
        # Build the seeding configuration from the request body.
        config = SeedingConfig(**cfg)

        # Async context manager guarantees the seeder is closed on exit.
        async with AsyncUrlSeeder() as seeder:
            # NOTE(review): the original comment says `urls` expects a bare
            # domain, yet the full URL is passed here — confirm upstream
            # whether `urlparse(url).netloc` should be used instead.
            return await seeder.urls(url, config)
    except Exception:
        # BUG FIX: the error path previously returned a dict
        # ({"seeded_urls": [], "count": 0, "message": ...}) while the success
        # path returns a list; the caller computes len(result) and serializes
        # it as the URL list, so a dict yielded a bogus count of 3. Return an
        # empty list to keep the return type consistent (best-effort intent
        # of the original "count": 0 is preserved).
        return []
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Optional, Dict
|
from typing import List, Optional, Dict, Any
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from utils import FilterType
|
from utils import FilterType
|
||||||
@@ -85,4 +85,10 @@ class JSEndpointRequest(BaseModel):
|
|||||||
scripts: List[str] = Field(
|
scripts: List[str] = Field(
|
||||||
...,
|
...,
|
||||||
description="List of separated JavaScript snippets to execute"
|
description="List of separated JavaScript snippets to execute"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class SeedRequest(BaseModel):
    """Request model for the /seed URL-seeding endpoint.

    Carries the target URL plus a free-form configuration mapping that the
    handler forwards as keyword arguments to crawl4ai's SeedingConfig.
    """
    # Full URL whose domain is to be seeded (e.g. "https://docs.crawl4ai.com").
    url: str = Field(..., example="https://docs.crawl4ai.com")
    # SeedingConfig keyword arguments; an empty dict means all defaults.
    config: Dict[str, Any] = Field(default_factory=dict)
|
||||||
@@ -11,16 +11,22 @@ from crawler_pool import get_crawler, close_all, janitor
|
|||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
from auth import create_access_token, get_token_dependency, TokenRequest
|
from auth import create_access_token, get_token_dependency, TokenRequest
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing import Optional, List, Dict
|
from typing import Optional, List, Dict, Any
|
||||||
from fastapi import Request, Depends
|
from fastapi import Request, Depends
|
||||||
from fastapi.responses import FileResponse
|
from fastapi.responses import FileResponse
|
||||||
import base64
|
import base64
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import asyncio
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urlparse
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
from api import (
|
from api import (
|
||||||
handle_markdown_request, handle_llm_qa,
|
handle_markdown_request, handle_llm_qa,
|
||||||
handle_stream_crawl_request, handle_crawl_request,
|
handle_stream_crawl_request, handle_crawl_request,
|
||||||
stream_results
|
stream_results, handle_seed
|
||||||
)
|
)
|
||||||
from schemas import (
|
from schemas import (
|
||||||
CrawlRequestWithHooks,
|
CrawlRequestWithHooks,
|
||||||
@@ -30,6 +36,7 @@ from schemas import (
|
|||||||
ScreenshotRequest,
|
ScreenshotRequest,
|
||||||
PDFRequest,
|
PDFRequest,
|
||||||
JSEndpointRequest,
|
JSEndpointRequest,
|
||||||
|
SeedRequest,
|
||||||
)
|
)
|
||||||
|
|
||||||
from utils import (
|
from utils import (
|
||||||
@@ -229,6 +236,30 @@ async def config_dump(raw: RawCode):
|
|||||||
raise HTTPException(400, str(e))
|
raise HTTPException(400, str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/seed")
async def seed_url(request: SeedRequest):
    """
    Seed a domain for crawling based on a URL.

    • Validates that a domain can be extracted from the provided URL
    • Generates crawlable URLs via handle_seed (AsyncUrlSeeder)
    • Returns the seeded URLs and their count as JSON
    """
    try:
        # Validate the URL by extracting its domain (e.g. "docs.crawl4ai.com").
        domain = urlparse(request.url).netloc
        if not domain:
            raise HTTPException(
                status_code=400,
                detail="Invalid URL provided. Could not extract domain.",
            )
        # NOTE(review): handle_seed receives the full URL even though its own
        # comment says the seeder expects a domain — confirm whether `domain`
        # should be passed here instead.
        res = await handle_seed(request.url, request.config)
        return JSONResponse({"seed_url": res, "count": len(res)})
    except HTTPException:
        # BUG FIX: the 400 raised above was previously swallowed by the
        # generic handler below and re-surfaced as a 500. Re-raise so the
        # original status code reaches the client.
        raise
    except Exception as e:
        # Unexpected failures become a 500 with the error message attached.
        print(f"❌ Error in seed_url: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
@app.post("/md")
|
@app.post("/md")
|
||||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||||
@mcp_tool("md")
|
@mcp_tool("md")
|
||||||
|
|||||||
Reference in New Issue
Block a user