Implements new asynchronous endpoints for handling long-running crawl and LLM tasks:
- POST /crawl/job and GET /crawl/job/{task_id} for crawl operations
- POST /llm/job and GET /llm/job/{task_id} for LLM operations
- Added Redis-based task management with configurable TTL
- Moved schema definitions to dedicated schemas.py
- Added example polling client demo_docker_polling.py
This change allows clients to handle long-running operations asynchronously through a polling pattern rather than holding connections open.
150 lines
5.0 KiB
Python
150 lines
5.0 KiB
Python
|
||
#!/usr/bin/env python3
|
||
"""
|
||
demo_docker_polling.py
|
||
Quick sanity-check for the asynchronous crawl job endpoints:
|
||
|
||
• POST /crawl/job – enqueue work, get task_id
|
||
• GET /crawl/job/{id} – poll status / fetch result
|
||
|
||
The style matches demo_docker_api.py (console.rule banners, helper
|
||
functions, coloured status lines). Adjust BASE_URL as needed.
|
||
|
||
Run: python demo_docker_polling.py
|
||
"""
|
||
|
||
import asyncio, json, os, time, urllib.parse
|
||
from typing import Dict, List
|
||
|
||
import httpx
|
||
from rich.console import Console
|
||
from rich.panel import Panel
|
||
from rich.syntax import Syntax
|
||
|
||
console = Console()
|
||
BASE_URL = os.getenv("BASE_URL", "http://localhost:11234")
|
||
SIMPLE_URL = "https://example.org"
|
||
LINKS_URL = "https://httpbin.org/links/10/1"
|
||
|
||
# --- helpers --------------------------------------------------------------
|
||
|
||
|
||
def print_payload(payload: Dict):
|
||
console.print(Panel(Syntax(json.dumps(payload, indent=2),
|
||
"json", theme="monokai", line_numbers=False),
|
||
title="Payload", border_style="cyan", expand=False))
|
||
|
||
|
||
async def check_server_health(client: httpx.AsyncClient) -> bool:
|
||
try:
|
||
resp = await client.get("/health")
|
||
if resp.is_success:
|
||
console.print("[green]Server healthy[/]")
|
||
return True
|
||
except Exception:
|
||
pass
|
||
console.print("[bold red]Server is not responding on /health[/]")
|
||
return False
|
||
|
||
|
||
async def poll_for_result(client: httpx.AsyncClient, task_id: str,
|
||
poll_interval: float = 1.5, timeout: float = 90.0):
|
||
"""Hit /crawl/job/{id} until COMPLETED/FAILED or timeout."""
|
||
start = time.time()
|
||
while True:
|
||
resp = await client.get(f"/crawl/job/{task_id}")
|
||
resp.raise_for_status()
|
||
data = resp.json()
|
||
status = data.get("status")
|
||
if status.upper() in ("COMPLETED", "FAILED"):
|
||
return data
|
||
if time.time() - start > timeout:
|
||
raise TimeoutError(f"Task {task_id} did not finish in {timeout}s")
|
||
await asyncio.sleep(poll_interval)
|
||
|
||
|
||
# --- demo functions -------------------------------------------------------
|
||
|
||
|
||
async def demo_poll_single_url(client: httpx.AsyncClient):
|
||
payload = {
|
||
"urls": [SIMPLE_URL],
|
||
"browser_config": {"type": "BrowserConfig",
|
||
"params": {"headless": True}},
|
||
"crawler_config": {"type": "CrawlerRunConfig",
|
||
"params": {"cache_mode": "BYPASS"}}
|
||
}
|
||
|
||
console.rule("[bold blue]Demo A: /crawl/job Single URL[/]", style="blue")
|
||
print_payload(payload)
|
||
|
||
# enqueue
|
||
resp = await client.post("/crawl/job", json=payload)
|
||
console.print(f"Enqueue status: [bold]{resp.status_code}[/]")
|
||
resp.raise_for_status()
|
||
task_id = resp.json()["task_id"]
|
||
console.print(f"Task ID: [yellow]{task_id}[/]")
|
||
|
||
# poll
|
||
console.print("Polling…")
|
||
result = await poll_for_result(client, task_id)
|
||
console.print(Panel(Syntax(json.dumps(result, indent=2),
|
||
"json", theme="fruity"),
|
||
title="Final result", border_style="green"))
|
||
if result["status"] == "COMPLETED":
|
||
console.print("[green]✅ Crawl succeeded[/]")
|
||
else:
|
||
console.print("[red]❌ Crawl failed[/]")
|
||
|
||
|
||
async def demo_poll_multi_url(client: httpx.AsyncClient):
|
||
payload = {
|
||
"urls": [SIMPLE_URL, LINKS_URL],
|
||
"browser_config": {"type": "BrowserConfig",
|
||
"params": {"headless": True}},
|
||
"crawler_config": {"type": "CrawlerRunConfig",
|
||
"params": {"cache_mode": "BYPASS"}}
|
||
}
|
||
|
||
console.rule("[bold magenta]Demo B: /crawl/job Multi-URL[/]",
|
||
style="magenta")
|
||
print_payload(payload)
|
||
|
||
resp = await client.post("/crawl/job", json=payload)
|
||
console.print(f"Enqueue status: [bold]{resp.status_code}[/]")
|
||
resp.raise_for_status()
|
||
task_id = resp.json()["task_id"]
|
||
console.print(f"Task ID: [yellow]{task_id}[/]")
|
||
|
||
console.print("Polling…")
|
||
result = await poll_for_result(client, task_id)
|
||
console.print(Panel(Syntax(json.dumps(result, indent=2),
|
||
"json", theme="fruity"),
|
||
title="Final result", border_style="green"))
|
||
if result["status"] == "COMPLETED":
|
||
console.print(
|
||
f"[green]✅ {len(json.loads(result['result'])['results'])} URLs crawled[/]")
|
||
else:
|
||
console.print("[red]❌ Crawl failed[/]")
|
||
|
||
|
||
# --- main runner ----------------------------------------------------------
|
||
|
||
|
||
async def main_demo():
|
||
async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
|
||
if not await check_server_health(client):
|
||
return
|
||
await demo_poll_single_url(client)
|
||
await demo_poll_multi_url(client)
|
||
console.rule("[bold green]Polling demos complete[/]", style="green")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
try:
|
||
asyncio.run(main_demo())
|
||
except KeyboardInterrupt:
|
||
console.print("\n[yellow]Interrupted by user[/]")
|
||
except Exception:
|
||
console.print_exception(show_locals=False)
|