Files
crawl4ai/docs/examples/docker/demo_docker_polling.py
UncleCode 94e9959fe0 feat(docker-api): add job-based polling endpoints for crawl and LLM tasks
Implements new asynchronous endpoints for handling long-running crawl and LLM tasks:
- POST /crawl/job and GET /crawl/job/{task_id} for crawl operations
- POST /llm/job and GET /llm/job/{task_id} for LLM operations
- Added Redis-based task management with configurable TTL
- Moved schema definitions to dedicated schemas.py
- Added example polling client demo_docker_polling.py

This change allows clients to handle long-running operations asynchronously through a polling pattern rather than holding connections open.
2025-05-01 21:24:52 +08:00

150 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
demo_docker_polling.py
Quick sanity-check for the asynchronous crawl job endpoints:
• POST /crawl/job enqueue work, get task_id
• GET /crawl/job/{id} poll status / fetch result
The style matches demo_docker_api.py (console.rule banners, helper
functions, coloured status lines). Adjust BASE_URL as needed.
Run: python demo_docker_polling.py
"""
import asyncio, json, os, time, urllib.parse
from typing import Dict, List
import httpx
from rich.console import Console
from rich.panel import Panel
from rich.syntax import Syntax
console = Console()
BASE_URL = os.getenv("BASE_URL", "http://localhost:11234")
SIMPLE_URL = "https://example.org"
LINKS_URL = "https://httpbin.org/links/10/1"
# --- helpers --------------------------------------------------------------
def print_payload(payload: Dict):
console.print(Panel(Syntax(json.dumps(payload, indent=2),
"json", theme="monokai", line_numbers=False),
title="Payload", border_style="cyan", expand=False))
async def check_server_health(client: httpx.AsyncClient) -> bool:
try:
resp = await client.get("/health")
if resp.is_success:
console.print("[green]Server healthy[/]")
return True
except Exception:
pass
console.print("[bold red]Server is not responding on /health[/]")
return False
async def poll_for_result(client: httpx.AsyncClient, task_id: str,
poll_interval: float = 1.5, timeout: float = 90.0):
"""Hit /crawl/job/{id} until COMPLETED/FAILED or timeout."""
start = time.time()
while True:
resp = await client.get(f"/crawl/job/{task_id}")
resp.raise_for_status()
data = resp.json()
status = data.get("status")
if status.upper() in ("COMPLETED", "FAILED"):
return data
if time.time() - start > timeout:
raise TimeoutError(f"Task {task_id} did not finish in {timeout}s")
await asyncio.sleep(poll_interval)
# --- demo functions -------------------------------------------------------
async def demo_poll_single_url(client: httpx.AsyncClient):
payload = {
"urls": [SIMPLE_URL],
"browser_config": {"type": "BrowserConfig",
"params": {"headless": True}},
"crawler_config": {"type": "CrawlerRunConfig",
"params": {"cache_mode": "BYPASS"}}
}
console.rule("[bold blue]Demo A: /crawl/job Single URL[/]", style="blue")
print_payload(payload)
# enqueue
resp = await client.post("/crawl/job", json=payload)
console.print(f"Enqueue status: [bold]{resp.status_code}[/]")
resp.raise_for_status()
task_id = resp.json()["task_id"]
console.print(f"Task ID: [yellow]{task_id}[/]")
# poll
console.print("Polling…")
result = await poll_for_result(client, task_id)
console.print(Panel(Syntax(json.dumps(result, indent=2),
"json", theme="fruity"),
title="Final result", border_style="green"))
if result["status"] == "COMPLETED":
console.print("[green]✅ Crawl succeeded[/]")
else:
console.print("[red]❌ Crawl failed[/]")
async def demo_poll_multi_url(client: httpx.AsyncClient):
payload = {
"urls": [SIMPLE_URL, LINKS_URL],
"browser_config": {"type": "BrowserConfig",
"params": {"headless": True}},
"crawler_config": {"type": "CrawlerRunConfig",
"params": {"cache_mode": "BYPASS"}}
}
console.rule("[bold magenta]Demo B: /crawl/job Multi-URL[/]",
style="magenta")
print_payload(payload)
resp = await client.post("/crawl/job", json=payload)
console.print(f"Enqueue status: [bold]{resp.status_code}[/]")
resp.raise_for_status()
task_id = resp.json()["task_id"]
console.print(f"Task ID: [yellow]{task_id}[/]")
console.print("Polling…")
result = await poll_for_result(client, task_id)
console.print(Panel(Syntax(json.dumps(result, indent=2),
"json", theme="fruity"),
title="Final result", border_style="green"))
if result["status"] == "COMPLETED":
console.print(
f"[green]✅ {len(json.loads(result['result'])['results'])} URLs crawled[/]")
else:
console.print("[red]❌ Crawl failed[/]")
# --- main runner ----------------------------------------------------------
async def main_demo():
async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
if not await check_server_health(client):
return
await demo_poll_single_url(client)
await demo_poll_multi_url(client)
console.rule("[bold green]Polling demos complete[/]", style="green")
if __name__ == "__main__":
try:
asyncio.run(main_demo())
except KeyboardInterrupt:
console.print("\n[yellow]Interrupted by user[/]")
except Exception:
console.print_exception(show_locals=False)