feat(models): add dedicated tables field to CrawlResult

- Add tables field to CrawlResult model while maintaining backward compatibility
- Update async_webcrawler.py to extract tables from media and pass to tables field
- Update crypto_analysis_example.py to use the new tables field
- Add /config/dump examples to demo_docker_api.py
- Bump version to 0.6.1
This commit is contained in:
UncleCode
2025-04-24 18:36:25 +08:00
parent ad4dfb21e1
commit ccec40ed17
7 changed files with 287 additions and 26 deletions

View File

@@ -391,12 +391,14 @@ async def main():
# Process results
raw_df = pd.DataFrame()
for result in results:
if result.success and result.media["tables"]:
# Use the new tables field, falling back to media["tables"] for backward compatibility
tables = result.tables if hasattr(result, "tables") and result.tables else result.media.get("tables", [])
if result.success and tables:
# Extract primary market table
# DataFrame
raw_df = pd.DataFrame(
result.media["tables"][0]["rows"],
columns=result.media["tables"][0]["headers"],
tables[0]["rows"],
columns=tables[0]["headers"],
)
break

View File

@@ -4,6 +4,8 @@ import json
import os
import time
from typing import List, Dict, Any, AsyncGenerator, Optional
import textwrap # ← new: for pretty code literals
import urllib.parse # ← needed for URL-safe /llm calls
from dotenv import load_dotenv
from rich.console import Console
from rich.syntax import Syntax
@@ -969,13 +971,111 @@ async def demo_deep_with_ssl(client: httpx.AsyncClient):
else:
console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
# 7. Markdown helper endpoint
async def demo_markdown_endpoint(client: httpx.AsyncClient):
    """
    Exercise the /md helper endpoint once.

    POSTs PYTHON_URL with the "fit" filter, prints the HTTP status together
    with the elapsed time, and shows at most the first 500 characters of the
    returned "markdown" field. Any failure is reported on the console rather
    than raised to the caller.
    """
    request_body = {"url": PYTHON_URL, "f": "fit", "q": None, "c": "0"}

    console.rule("[bold blue]Demo 7a: /md Endpoint[/]", style="blue")
    print_payload(request_body)

    try:
        started = time.time()
        response = await client.post("/md", json=request_body)
        elapsed = time.time() - started

        status_colour = "green" if response.is_success else "red"
        console.print(f"Response Status: [bold {status_colour}]{response.status_code}[/] (took {elapsed:.2f}s)")
        response.raise_for_status()

        markdown = response.json().get("markdown", "")
        # Keep the panel short: truncate long documents and mark the cut.
        preview = markdown
        if len(markdown) > 500:
            preview = markdown[:500] + "..."
        console.print(Panel(preview, title="Markdown snippet", border_style="cyan", expand=False))
    except Exception as exc:
        console.print(f"[bold red]Error hitting /md:[/] {exc}")
# 7b. LLM QA helper endpoint (/llm)
async def demo_llm_endpoint(client: httpx.AsyncClient):
    """
    Quick QA round-trip with /llm.

    Asks a trivial question against SIMPLE_URL just to show the wiring: the
    page URL is percent-encoded into the request path and the question is
    sent as the ``q`` query parameter. Prints the HTTP status with the
    elapsed time and the "answer" field of the JSON response. Any failure is
    reported on the console rather than raised to the caller.
    """
    page_url = SIMPLE_URL
    question = "What is the title of this page?"

    console.rule("[bold magenta]Demo 7b: /llm Endpoint[/]", style="magenta")

    # The URL goes into the *path*, so use quote() rather than quote_plus():
    # quote_plus() is meant for form/query values and would turn spaces into
    # "+", which is a literal plus sign inside a path segment. safe="" also
    # escapes "/" and ":" so the whole URL stays one opaque segment.
    enc = urllib.parse.quote(page_url, safe="")
    # Display only: httpx encodes the real query string from `params` below.
    console.print(f"GET /llm/{enc}?q={question}")

    try:
        t0 = time.time()
        resp = await client.get(f"/llm/{enc}", params={"q": question})
        dt = time.time() - t0
        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
        resp.raise_for_status()
        answer = resp.json().get("answer", "")
        console.print(Panel(answer or "No answer returned", title="LLM answer", border_style="magenta", expand=False))
    except Exception as e:
        console.print(f"[bold red]Error hitting /llm:[/] {e}")
# 8. /config/dump helpers --------------------------------------------------
async def demo_config_dump_valid(client: httpx.AsyncClient):
    """
    Post one top-level CrawlerRunConfig(...) expression to /config/dump.

    Prints the HTTP status with the elapsed time and pretty-prints the JSON
    body the server returns. Any failure is reported on the console rather
    than raised to the caller.
    """
    request_body = {"code": "CrawlerRunConfig(cache_mode='BYPASS', screenshot=True)"}

    console.rule("[bold blue]Demo 8a: /config/dump (valid)[/]", style="blue")
    print_payload(request_body)

    try:
        started = time.time()
        response = await client.post("/config/dump", json=request_body)
        elapsed = time.time() - started

        status_colour = "green" if response.is_success else "red"
        console.print(f"Response Status: [bold {status_colour}]{response.status_code}[/] (took {elapsed:.2f}s)")
        response.raise_for_status()

        # Render the returned structure as syntax-highlighted JSON.
        rendered = json.dumps(response.json(), indent=2)
        highlighted = Syntax(rendered, "json", theme="monokai")
        console.print(Panel(highlighted, title="Dump()", border_style="cyan"))
    except Exception as exc:
        console.print(f"[bold red]Error in valid /config/dump call:[/] {exc}")
async def demo_config_dump_invalid(client: httpx.AsyncClient):
    """
    Purposely send a snippet that is NOT a single top-level expression.

    The payload contains two statements separated by ``;``, so the server is
    expected to reject it with an HTTP error status. That rejection is the
    success path of this demo: the error body is printed as JSON, or as raw
    text when it is not JSON. If the server accepts the snippet instead, a
    warning is printed so the unexpected result is not missed.
    """
    bad_code = textwrap.dedent("""
        BrowserConfig(headless=True); CrawlerRunConfig()
    """).strip()
    payload = {"code": bad_code}

    console.rule("[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta")
    print_payload(payload)

    try:
        resp = await client.post("/config/dump", json=payload)
        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]")
        resp.raise_for_status()  # expected to raise -> handled below
    except httpx.HTTPStatusError as e:
        console.print("[cyan]Expected parse/validation failure captured:[/]")
        try:
            console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="fruity"), title="Error payload"))
        except ValueError:
            # Error body was not valid JSON; fall back to the raw text.
            console.print(e.response.text)
    except Exception as e:
        console.print(f"[bold red]Unexpected error during invalid test:[/] {e}")
    else:
        # No exception means the server accepted the invalid snippet.
        console.print("[bold yellow]Warning:[/] server accepted the invalid snippet; expected an HTTP error response.")
# --- Update Main Runner to include new demo ---
async def main_demo():
async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
if not await check_server_health(client):
return
# --- Run Demos ---
await demo_basic_single_url(client)
await demo_basic_multi_url(client)
@@ -1001,7 +1101,15 @@ async def main_demo():
await demo_deep_with_css_extraction(client)
await demo_deep_with_llm_extraction(client) # Skips if no common LLM key env var
await demo_deep_with_proxy(client) # Skips if no PROXIES env var
await demo_deep_with_ssl(client) # Added the new demo
await demo_deep_with_ssl(client) # Added the new demo
# --- Helper endpoints ---
await demo_markdown_endpoint(client)
await demo_llm_endpoint(client)
# --- /config/dump sanity checks ---
await demo_config_dump_valid(client)
await demo_config_dump_invalid(client)
console.rule("[bold green]Demo Complete[/]", style="green")