diff --git a/.yoyo/snapshot b/.yoyo/snapshot
new file mode 160000
index 00000000..bf72603f
--- /dev/null
+++ b/.yoyo/snapshot
@@ -0,0 +1 @@
+Subproject commit bf72603f11e4ec2868c15f2fd2d7662552c92ca6
diff --git a/deploy/docker/api.py b/deploy/docker/api.py
index 78a36bf3..959e64c0 100644
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -14,7 +14,6 @@ from fastapi import HTTPException, Request, status
 from fastapi.background import BackgroundTasks
 from fastapi.responses import JSONResponse
 from redis import asyncio as aioredis
-
 from crawl4ai import (
     AsyncWebCrawler,
     CrawlerRunConfig,
@@ -23,7 +22,9 @@ from crawl4ai import (
     BrowserConfig,
     MemoryAdaptiveDispatcher,
     RateLimiter,
-    LLMConfig
+    LLMConfig,
+    AsyncUrlSeeder,
+    SeedingConfig
 )
 from crawl4ai.utils import perform_completion_with_backoff
 from crawl4ai.content_filter_strategy import (
@@ -716,4 +717,28 @@ async def handle_crawl_job(
         })
     background_tasks.add_task(_runner)
 
-    return {"task_id": task_id}
\ No newline at end of file
+    return {"task_id": task_id}
+
+async def handle_seed(url, cfg):
+    """Discover crawlable URLs for *url* using AsyncUrlSeeder.
+
+    Args:
+        url: Domain or full URL to seed from.
+        cfg: Dict of SeedingConfig keyword arguments (raw request body).
+
+    Returns:
+        A list of discovered URLs; an empty list when seeding fails or
+        nothing is found, so callers can treat the result uniformly
+        (e.g. safely take len() of it).
+    """
+    try:
+        # Build the seeding configuration from the raw request dict.
+        config = SeedingConfig(**cfg)
+        # The seeder manages its own session lifecycle via the context manager.
+        async with AsyncUrlSeeder() as seeder:
+            # seeder.urls accepts a domain or a full URL plus the config.
+            return await seeder.urls(url, config)
+    except Exception as e:
+        # Keep the return type consistent: the previous version returned a
+        # dict here, which broke len(res) in the /seed endpoint.
+        print(f"❌ Error in handle_seed: {e}")
+        return []
description="List of separated JavaScript snippets to execute" - ) \ No newline at end of file + ) + + +class SeedRequest(BaseModel): + """Request model for URL seeding endpoint.""" + url: str = Field(..., example="https://docs.crawl4ai.com") + config: Dict[str, Any] = Field(default_factory=dict) \ No newline at end of file diff --git a/deploy/docker/server.py b/deploy/docker/server.py index 101e8614..4f09daf5 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -11,16 +11,22 @@ from crawler_pool import get_crawler, close_all, janitor from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from auth import create_access_token, get_token_dependency, TokenRequest from pydantic import BaseModel -from typing import Optional, List, Dict +from typing import Optional, List, Dict, Any from fastapi import Request, Depends from fastapi.responses import FileResponse import base64 import re +import os +import sys +import asyncio +from contextlib import asynccontextmanager +from pathlib import Path +from urllib.parse import urlparse from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from api import ( handle_markdown_request, handle_llm_qa, handle_stream_crawl_request, handle_crawl_request, - stream_results + stream_results, handle_seed ) from schemas import ( CrawlRequestWithHooks, @@ -30,6 +36,7 @@ from schemas import ( ScreenshotRequest, PDFRequest, JSEndpointRequest, + SeedRequest, ) from utils import ( @@ -229,6 +236,30 @@ async def config_dump(raw: RawCode): raise HTTPException(400, str(e)) +@app.post("/seed") +async def seed_url(request: SeedRequest): + """ + Seed a domain for crawling based on a URL. 
+ • Extract domain from provided URL + • Generate crawlable URLs using AsyncUrlSeeder + • Return list of seeded URLs for testing + """ + try: + # Extract the domain (e.g., "docs.crawl4ai.com") from the full URL + domain = urlparse(request.url).netloc + if not domain: + raise HTTPException( + status_code=400, + detail="Invalid URL provided. Could not extract domain.", + ) + res = await handle_seed(request.url , request.config) + return JSONResponse({"seed_url":res , "count":len(res)}) + + except Exception as e: + print(f"❌ Error in seed_url: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + @app.post("/md") @limiter.limit(config["rate_limiting"]["default_limit"]) @mcp_tool("md")