From ccec40ed174df45e6b4f0b00efae331fa4a3ba3a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 24 Apr 2025 18:36:25 +0800 Subject: [PATCH] feat(models): add dedicated tables field to CrawlResult - Add tables field to CrawlResult model while maintaining backward compatibility - Update async_webcrawler.py to extract tables from media and pass to tables field - Update crypto_analysis_example.py to use the new tables field - Add /md, /llm and /config/dump examples to demo_docker_api.py - Improve the Docker playground UI with endpoint-specific options for /md and /llm - Bump version to 0.6.1 --- CHANGELOG.md | 9 ++ crawl4ai/__version__.py | 2 +- crawl4ai/async_webcrawler.py | 3 + crawl4ai/models.py | 3 +- deploy/docker/static/playground/index.html | 176 ++++++++++++++++++--- docs/examples/crypto_analysis_example.py | 8 +- docs/examples/docker/demo_docker_api.py | 112 ++++++++++++- 7 files changed, 287 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9205c0b0..16f96f47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,15 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [0.6.1] - 2025-04-24 + +### Added +- New dedicated `tables` field in `CrawlResult` model for better table extraction handling +- Updated crypto_analysis_example.py to use the new tables field with backward compatibility + +### Changed +- Improved playground UI in Docker deployment with better endpoint handling and UI feedback + ## [0.6.0] ‑ 2025‑04‑22 ### Added diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index ee78de23..fe6f9b8a 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,3 +1,3 @@ # crawl4ai/_version.py -__version__ = "0.6.0" +__version__ = "0.6.1" diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 98acfd12..bb3765c2 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -496,11 +496,13 @@ class AsyncWebCrawler: cleaned_html = sanitize_input_encode( result.get("cleaned_html", "")) media = result.get("media", {}) + tables = media.pop("tables", []) if isinstance(media, dict) else [] links = result.get("links", {}) metadata = result.get("metadata", {}) else: cleaned_html = sanitize_input_encode(result.cleaned_html) media = result.media.model_dump() + tables = media.pop("tables", []) links = result.links.model_dump() metadata = result.metadata @@ -627,6 +629,7 @@ class AsyncWebCrawler: cleaned_html=cleaned_html, markdown=markdown_result, media=media, + tables=tables, # NEW links=links, metadata=metadata, screenshot=screenshot_data, diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 32cca3ed..64270b77 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, HttpUrl, PrivateAttr +from pydantic import BaseModel, HttpUrl, PrivateAttr, Field from typing import List, Dict, Optional, Callable, Awaitable, Union, Any from typing import AsyncGenerator from typing import Generic, TypeVar @@ -150,6 +150,7 @@ class CrawlResult(BaseModel): redirected_url: Optional[str] = None network_requests: Optional[List[Dict[str, 
Any]]] = None console_messages: Optional[List[Dict[str, Any]]] = None + tables: List[Dict] = Field(default_factory=list) # NEW – [{headers,rows,caption,summary}] class Config: arbitrary_types_allowed = True diff --git a/deploy/docker/static/playground/index.html b/deploy/docker/static/playground/index.html index 8f0e2bdd..7af96f1f 100644 --- a/deploy/docker/static/playground/index.html +++ b/deploy/docker/static/playground/index.html @@ -193,7 +193,48 @@ -
+ + + + + + + +
Advanced Config (Python → auto‑JSON) @@ -437,6 +478,33 @@ cm.setValue(TEMPLATES[e.target.value]); document.getElementById('cfg-status').textContent = ''; }); + + // Handle endpoint selection change to show appropriate options + document.getElementById('endpoint').addEventListener('change', function(e) { + const endpoint = e.target.value; + const mdOptions = document.getElementById('md-options'); + const llmOptions = document.getElementById('llm-options'); + const advConfig = document.getElementById('adv-config'); + + // Hide all option sections first + mdOptions.classList.add('hidden'); + llmOptions.classList.add('hidden'); + advConfig.classList.add('hidden'); + + // Show the appropriate section based on endpoint + if (endpoint === 'md') { + mdOptions.classList.remove('hidden'); + // Auto-open the /md options + mdOptions.setAttribute('open', ''); + } else if (endpoint === 'llm') { + llmOptions.classList.remove('hidden'); + // Auto-open the /llm options + llmOptions.setAttribute('open', ''); + } else { + // For /crawl endpoints, show the advanced config + advConfig.classList.remove('hidden'); + } + }); async function pyConfigToJson() { const code = cm.getValue().trim(); @@ -494,10 +562,18 @@ } // Generate code snippets - function generateSnippets(api, payload) { + function generateSnippets(api, payload, method = 'POST') { // Python snippet const pyCodeEl = document.querySelector('#python-content code'); - const pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`; + let pySnippet; + + if (method === 'GET') { + // GET request (for /llm endpoint) + pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.get(\n "${window.location.origin}${api}"\n )\n return response.json()`; + } else { + // POST request (for 
/crawl and /md endpoints) + pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`; + } pyCodeEl.textContent = pySnippet; pyCodeEl.className = 'python hljs'; // Reset classes @@ -505,7 +581,15 @@ // cURL snippet const curlCodeEl = document.querySelector('#curl-content code'); - const curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`; + let curlSnippet; + + if (method === 'GET') { + // GET request (for /llm endpoint) + curlSnippet = `curl -X GET "${window.location.origin}${api}"`; + } else { + // POST request (for /crawl and /md endpoints) + curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`; + } curlCodeEl.textContent = curlSnippet; curlCodeEl.className = 'bash hljs'; // Reset classes @@ -536,20 +620,39 @@ const endpointMap = { crawl: '/crawl', - }; - - /*const endpointMap = { - crawl: '/crawl', - crawl_stream: '/crawl/stream', + // crawl_stream: '/crawl/stream', md: '/md', llm: '/llm' - };*/ + }; const api = endpointMap[endpoint]; - const payload = { - urls, - ...advConfig - }; + let payload; + + // Create appropriate payload based on endpoint type + if (endpoint === 'md') { + // Get values from the /md specific inputs + const filterType = document.getElementById('md-filter').value; + const query = document.getElementById('md-query').value.trim(); + const cache = document.getElementById('md-cache').value; + + // MD endpoint expects: { url, f, q, c } + payload = { + url: urls[0], // Take first URL + f: filterType, // Lowercase filter type as required by server + q: query || null, // Use the query if provided, otherwise null + c: cache + }; + } else if (endpoint === 'llm') { + // LLM endpoint 
has a different URL pattern and uses query params + // This will be handled directly in the fetch below + payload = null; + } else { + // Default payload for /crawl and /crawl/stream + payload = { + urls, + ...advConfig + }; + } updateStatus('processing'); @@ -557,7 +660,18 @@ const startTime = performance.now(); let response, responseData; - if (endpoint === 'crawl_stream') { + if (endpoint === 'llm') { + // Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query} + const url = urls[0]; + const encodedUrl = encodeURIComponent(url); + // Get the question from the LLM-specific input + const question = document.getElementById('llm-question').value.trim() || "What is this page about?"; + + response = await fetch(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, { + method: 'GET', + headers: { 'Accept': 'application/json' } + }); + } else if (endpoint === 'crawl_stream') { // Stream processing response = await fetch(api, { method: 'POST', @@ -597,7 +711,7 @@ document.querySelector('#response-content code').className = 'json hljs'; // Reset classes forceHighlightElement(document.querySelector('#response-content code')); } else { - // Regular request + // Regular request (handles /crawl and /md) response = await fetch(api, { method: 'POST', headers: { 'Content-Type': 'application/json' }, @@ -625,7 +739,16 @@ } forceHighlightElement(document.querySelector('#response-content code')); - generateSnippets(api, payload); + + // For generateSnippets, handle the LLM case specially + if (endpoint === 'llm') { + const url = urls[0]; + const encodedUrl = encodeURIComponent(url); + const question = document.getElementById('llm-question').value.trim() || "What is this page about?"; + generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET'); + } else { + generateSnippets(api, payload); + } } catch (error) { console.error('Error:', error); updateStatus('error'); @@ -807,9 +930,24 @@ }); }); } + + // Function to 
initialize UI based on selected endpoint + function initUI() { + // Trigger the endpoint change handler to set initial UI state + const endpointSelect = document.getElementById('endpoint'); + const event = new Event('change'); + endpointSelect.dispatchEvent(event); + + // Initialize copy buttons + initCopyButtons(); + } - // Call this in your DOMContentLoaded or initialization - initCopyButtons(); + // Initialize on page load + document.addEventListener('DOMContentLoaded', initUI); + // Also call it immediately in case the script runs after DOM is already loaded + if (document.readyState !== 'loading') { + initUI(); + } diff --git a/docs/examples/crypto_analysis_example.py b/docs/examples/crypto_analysis_example.py index 10b9e7ab..c5537a93 100644 --- a/docs/examples/crypto_analysis_example.py +++ b/docs/examples/crypto_analysis_example.py @@ -391,12 +391,14 @@ async def main(): # Process results raw_df = pd.DataFrame() for result in results: - if result.success and result.media["tables"]: + # Use the new tables field, falling back to media["tables"] for backward compatibility + tables = result.tables if hasattr(result, "tables") and result.tables else result.media.get("tables", []) + if result.success and tables: # Extract primary market table # DataFrame raw_df = pd.DataFrame( - result.media["tables"][0]["rows"], - columns=result.media["tables"][0]["headers"], + tables[0]["rows"], + columns=tables[0]["headers"], ) break diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index 09625248..d989e030 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -4,6 +4,8 @@ import json import os import time from typing import List, Dict, Any, AsyncGenerator, Optional +import textwrap # ← new: for pretty code literals +import urllib.parse # ← needed for URL-safe /llm calls from dotenv import load_dotenv from rich.console import Console from rich.syntax import Syntax @@ -969,13 +971,111 @@ 
async def demo_deep_with_ssl(client: httpx.AsyncClient): else: console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.") +# 7a. Markdown helper endpoint +async def demo_markdown_endpoint(client: httpx.AsyncClient): + """ + One-shot helper around /md. + Fetches PYTHON_URL with FIT filter and prints the first 500 chars of Markdown. + """ + target_url = PYTHON_URL + payload = {"url": target_url, "f": "fit", "q": None, "c": "0"} + + console.rule("[bold blue]Demo 7a: /md Endpoint[/]", style="blue") + print_payload(payload) + + try: + t0 = time.time() + resp = await client.post("/md", json=payload) + dt = time.time() - t0 + console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") + resp.raise_for_status() + md = resp.json().get("markdown", "") + snippet = (md[:500] + "...") if len(md) > 500 else md + console.print(Panel(snippet, title="Markdown snippet", border_style="cyan", expand=False)) + except Exception as e: + console.print(f"[bold red]Error hitting /md:[/] {e}") + +# 7b. LLM QA helper endpoint +async def demo_llm_endpoint(client: httpx.AsyncClient): + """ + Quick QA round-trip with /llm. + Asks a trivial question against SIMPLE_URL just to show wiring. + """ + page_url = SIMPLE_URL + question = "What is the title of this page?" 
+ + console.rule("[bold magenta]Demo 7b: /llm Endpoint[/]", style="magenta") + enc = urllib.parse.quote_plus(page_url, safe="") + console.print(f"GET /llm/{enc}?q={question}") + + try: + t0 = time.time() + resp = await client.get(f"/llm/{enc}", params={"q": question}) + dt = time.time() - t0 + console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") + resp.raise_for_status() + answer = resp.json().get("answer", "") + console.print(Panel(answer or "No answer returned", title="LLM answer", border_style="magenta", expand=False)) + except Exception as e: + console.print(f"[bold red]Error hitting /llm:[/] {e}") + + +# 8. /config/dump helpers -------------------------------------------------- + +async def demo_config_dump_valid(client: httpx.AsyncClient): + """ + Send a single top-level CrawlerRunConfig(...) expression and show the dump. + """ + code_snippet = "CrawlerRunConfig(cache_mode='BYPASS', screenshot=True)" + payload = {"code": code_snippet} + + console.rule("[bold blue]Demo 8a: /config/dump (valid)[/]", style="blue") + print_payload(payload) + + try: + t0 = time.time() + resp = await client.post("/config/dump", json=payload) + dt = time.time() - t0 + console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") + resp.raise_for_status() + dump_json = resp.json() + console.print(Panel(Syntax(json.dumps(dump_json, indent=2), "json", theme="monokai"), title="Dump()", border_style="cyan")) + except Exception as e: + console.print(f"[bold red]Error in valid /config/dump call:[/] {e}") + + +async def demo_config_dump_invalid(client: httpx.AsyncClient): + """ + Purposely break the single-expression rule (two statements instead of one top-level call) to show the 400 parse error. 
+ """ + bad_code = textwrap.dedent(""" + BrowserConfig(headless=True); CrawlerRunConfig() + """).strip() + payload = {"code": bad_code} + + console.rule("[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta") + print_payload(payload) + + try: + resp = await client.post("/config/dump", json=payload) + console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]") + resp.raise_for_status() # should throw -> except + except httpx.HTTPStatusError as e: + console.print("[cyan]Expected parse/validation failure captured:[/]") + try: + console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="fruity"), title="Error payload")) + except Exception: + console.print(e.response.text) + except Exception as e: + console.print(f"[bold red]Unexpected error during invalid test:[/] {e}") + # --- Update Main Runner to include new demo --- async def main_demo(): async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client: if not await check_server_health(client): return - + # --- Run Demos --- await demo_basic_single_url(client) await demo_basic_multi_url(client) @@ -1001,7 +1101,15 @@ async def main_demo(): await demo_deep_with_css_extraction(client) await demo_deep_with_llm_extraction(client) # Skips if no common LLM key env var await demo_deep_with_proxy(client) # Skips if no PROXIES env var - await demo_deep_with_ssl(client) # Added the new demo + await demo_deep_with_ssl(client) # Added the new demo + + # --- Helper endpoints --- + await demo_markdown_endpoint(client) + await demo_llm_endpoint(client) + + # --- /config/dump sanity checks --- + await demo_config_dump_valid(client) + await demo_config_dump_invalid(client) console.rule("[bold green]Demo Complete[/]", style="green")