Implements new asynchronous endpoints for handling long-running crawl and LLM tasks:
- POST /crawl/job and GET /crawl/job/{task_id} for crawl operations
- POST /llm/job and GET /llm/job/{task_id} for LLM operations
- Added Redis-based task management with configurable TTL
- Moved schema definitions to dedicated schemas.py
- Added example polling client demo_docker_polling.py
This change allows clients to handle long-running operations asynchronously through a polling pattern rather than holding connections open.
42 lines
1.3 KiB
Python
42 lines
1.3 KiB
Python
from typing import List, Optional, Dict
|
||
from enum import Enum
|
||
from pydantic import BaseModel, Field
|
||
from utils import FilterType
|
||
|
||
|
||
class CrawlRequest(BaseModel):
    # Payload describing a crawl: a batch of URLs plus two optional,
    # free-form configuration dicts whose keys are not validated here.

    # Required; the Field constraints bound the list length to 1..100.
    urls: List[str] = Field(..., min_length=1, max_length=100)

    # Each config falls back to a fresh empty dict when omitted. The
    # Optional annotation means an explicit null is accepted as well.
    browser_config: Optional[Dict] = Field(default_factory=dict)
    crawler_config: Optional[Dict] = Field(default_factory=dict)
|
||
|
||
class MarkdownRequest(BaseModel):
    """Request body for the /md endpoint."""

    # `url` is the only required field; `f`, `q` and `c` all carry defaults.
    url: str = Field(
        default=...,
        description="Absolute http/https URL to fetch",
    )
    f: FilterType = Field(
        default=FilterType.FIT,
        description="Content‑filter strategy: FIT, RAW, BM25, or LLM",
    )
    q: Optional[str] = Field(
        default=None,
        description="Query string used by BM25/LLM filters",
    )
    # The default is the string "0", not the integer 0.
    c: Optional[str] = Field(
        default="0",
        description="Cache‑bust / revision counter",
    )
|
||
|
||
|
||
class RawCode(BaseModel):
    # Single-field model: one required string named `code`.
    code: str = Field(...)
|
||
|
||
class HTMLRequest(BaseModel):
    # Single-field model: one required string named `url`. No URL-format
    # validation is applied at this level.
    url: str = Field(...)
|
||
|
||
class ScreenshotRequest(BaseModel):
    # `url` is required; the remaining fields are optional.
    url: str = Field(...)

    # Defaults to 2. The unit is not stated in this file
    # (presumably seconds -- TODO confirm against the consumer).
    screenshot_wait_for: Optional[float] = Field(default=2)

    # Left as None unless the caller supplies a path.
    output_path: Optional[str] = Field(default=None)
|
||
|
||
class PDFRequest(BaseModel):
    # `url` is required.
    url: str = Field(...)

    # Left as None unless the caller supplies a path.
    output_path: Optional[str] = Field(default=None)
|
||
|
||
|
||
class JSEndpointRequest(BaseModel):
    # Both fields are required: a target URL and the script snippets.
    url: str = Field(...)
    scripts: List[str] = Field(
        default=...,
        description="List of separated JavaScript snippets to execute",
    )