From ccec40ed174df45e6b4f0b00efae331fa4a3ba3a Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Thu, 24 Apr 2025 18:36:25 +0800
Subject: [PATCH] feat(models): add dedicated tables field to CrawlResult

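- Add tables field to CrawlResult model (defaults to an empty list, so existing constructors keep working)
- Update async_webcrawler.py to move tables out of media (media.pop) and pass them to the tables field
- Update crypto_analysis_example.py to use the new tables field, falling back to media["tables"]
- Add /md, /llm and /config/dump examples to demo_docker_api.py
- Add /md and /llm option panels and GET snippet generation to the Docker playground
- Bump version to 0.6.1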
---
 CHANGELOG.md                               |   9 ++
 crawl4ai/__version__.py                    |   2 +-
 crawl4ai/async_webcrawler.py               |   3 +
 crawl4ai/models.py                         |   3 +-
 deploy/docker/static/playground/index.html | 176 ++++++++++++++++++---
 docs/examples/crypto_analysis_example.py   |   8 +-
 docs/examples/docker/demo_docker_api.py    | 112 ++++++++++++-
 7 files changed, 287 insertions(+), 26 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9205c0b0..16f96f47 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,15 @@ All notable changes to Crawl4AI will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.6.1] - 2025-04-24
+
+### Added
+- New dedicated `tables` field in `CrawlResult` model for better table extraction handling
+- Updated crypto_analysis_example.py to use the new tables field with backward compatibility
+
+### Changed
+- Improved playground UI in Docker deployment with better endpoint handling and UI feedback
+
 ## [0.6.0] ‑ 2025‑04‑22
 
 ### Added
diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index ee78de23..fe6f9b8a 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,3 +1,3 @@
 # crawl4ai/_version.py
-__version__ = "0.6.0"
+__version__ = "0.6.1"
 
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 98acfd12..bb3765c2 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -496,11 +496,13 @@ class AsyncWebCrawler:
             cleaned_html = sanitize_input_encode(
                 result.get("cleaned_html", ""))
             media = result.get("media", {})
+            tables = media.pop("tables", []) if isinstance(media, dict) else []
             links = result.get("links", {})
             metadata = result.get("metadata", {})
         else:
             cleaned_html = sanitize_input_encode(result.cleaned_html)
             media = result.media.model_dump()
+            tables = media.pop("tables", [])
             links = result.links.model_dump()
             metadata = result.metadata
 
@@ -627,6 +629,7 @@ class AsyncWebCrawler:
             cleaned_html=cleaned_html,
             markdown=markdown_result,
             media=media,
+            tables=tables,                       # NEW
             links=links,
             metadata=metadata,
             screenshot=screenshot_data,
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index 32cca3ed..64270b77 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -1,4 +1,4 @@
-from pydantic import BaseModel, HttpUrl, PrivateAttr
+from pydantic import BaseModel, HttpUrl, PrivateAttr, Field
 from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
 from typing import AsyncGenerator
 from typing import Generic, TypeVar
@@ -150,6 +150,7 @@ class CrawlResult(BaseModel):
     redirected_url: Optional[str] = None
     network_requests: Optional[List[Dict[str, Any]]] = None
     console_messages: Optional[List[Dict[str, Any]]] = None
+    tables: List[Dict] = Field(default_factory=list)  # NEW – [{headers,rows,caption,summary}]
 
     class Config:
         arbitrary_types_allowed = True
diff --git a/deploy/docker/static/playground/index.html b/deploy/docker/static/playground/index.html
index 8f0e2bdd..7af96f1f 100644
--- a/deploy/docker/static/playground/index.html
+++ b/deploy/docker/static/playground/index.html
@@ -193,7 +193,48 @@
                 <textarea id="urls" class="w-full bg-dark border border-border rounded p-2 h-32 text-sm mb-4"
                     spellcheck="false">https://example.com</textarea>
 
-                <details class="mb-4">
+                <!-- Specific options for /md endpoint -->
+                <details id="md-options" class="mb-4 hidden">
+                    <summary class="text-sm text-secondary cursor-pointer">/md Options</summary>
+                    <div class="mt-2 space-y-3 p-2 border border-border rounded">
+                        <div>
+                            <label for="md-filter" class="block text-xs text-secondary mb-1">Filter Type</label>
+                            <select id="md-filter" class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
+                                <option value="fit">fit - Adaptive content filtering</option>
+                                <option value="raw">raw - No filtering</option>
+                                <option value="bm25">bm25 - BM25 keyword relevance</option>
+                                <option value="llm">llm - LLM-based filtering</option>
+                            </select>
+                        </div>
+                        <div>
+                            <label for="md-query" class="block text-xs text-secondary mb-1">Query (for BM25/LLM filters)</label>
+                            <input id="md-query" type="text" placeholder="Enter search terms or instructions" 
+                                class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
+                        </div>
+                        <div>
+                            <label for="md-cache" class="block text-xs text-secondary mb-1">Cache Mode</label>
+                            <select id="md-cache" class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
+                                <option value="0">Write-Only (0)</option>
+                                <option value="1">Enabled (1)</option>
+                            </select>
+                        </div>
+                    </div>
+                </details>
+
+                <!-- Specific options for /llm endpoint -->
+                <details id="llm-options" class="mb-4 hidden">
+                    <summary class="text-sm text-secondary cursor-pointer">/llm Options</summary>
+                    <div class="mt-2 space-y-3 p-2 border border-border rounded">
+                        <div>
+                            <label for="llm-question" class="block text-xs text-secondary mb-1">Question</label>
+                            <input id="llm-question" type="text" value="What is this page about?" 
+                                class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
+                        </div>
+                    </div>
+                </details>
+
+                <!-- Advanced config for /crawl endpoints -->
+                <details id="adv-config" class="mb-4">
                     <summary class="text-sm text-secondary cursor-pointer">Advanced Config <span
                         class="text-xs text-primary">(Python → auto‑JSON)</span></summary>
 
@@ -437,6 +478,33 @@
             cm.setValue(TEMPLATES[e.target.value]);
             document.getElementById('cfg-status').textContent = '';
         });
+        
+        // On endpoint change, show only the matching options panel (md / llm / Advanced Config)
+        document.getElementById('endpoint').addEventListener('change', function(e) {
+            const endpoint = e.target.value;
+            const mdOptions = document.getElementById('md-options');
+            const llmOptions = document.getElementById('llm-options');
+            const advConfig = document.getElementById('adv-config');
+            
+            // Hide all option sections first
+            mdOptions.classList.add('hidden');
+            llmOptions.classList.add('hidden');
+            advConfig.classList.add('hidden');
+            
+            // Show the appropriate section based on endpoint
+            if (endpoint === 'md') {
+                mdOptions.classList.remove('hidden');
+                // Auto-open the /md options
+                mdOptions.setAttribute('open', '');
+            } else if (endpoint === 'llm') {
+                llmOptions.classList.remove('hidden');
+                // Auto-open the /llm options
+                llmOptions.setAttribute('open', '');
+            } else {
+                // Any other endpoint value (the /crawl family): show the advanced config
+                advConfig.classList.remove('hidden');
+            }
+        });
 
         async function pyConfigToJson() {
             const code = cm.getValue().trim();
@@ -494,10 +562,18 @@
         }
 
         // Generate code snippets
-        function generateSnippets(api, payload) {
+        function generateSnippets(api, payload, method = 'POST') {
             // Python snippet
             const pyCodeEl = document.querySelector('#python-content code');
-            const pySnippet = `import httpx\n\nasync def crawl():\n    async with httpx.AsyncClient() as client:\n        response = await client.post(\n            "${window.location.origin}${api}",\n            json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n            ')}\n        )\n        return response.json()`;
+            let pySnippet;
+            
+            if (method === 'GET') {
+                // GET request (for /llm endpoint)
+                pySnippet = `import httpx\n\nasync def crawl():\n    async with httpx.AsyncClient() as client:\n        response = await client.get(\n            "${window.location.origin}${api}"\n        )\n        return response.json()`;
+            } else {
+                // POST request (for /crawl and /md endpoints)
+                pySnippet = `import httpx\n\nasync def crawl():\n    async with httpx.AsyncClient() as client:\n        response = await client.post(\n            "${window.location.origin}${api}",\n            json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n            ')}\n        )\n        return response.json()`;
+            }
 
             pyCodeEl.textContent = pySnippet;
             pyCodeEl.className = 'python hljs'; // Reset classes
@@ -505,7 +581,15 @@
 
             // cURL snippet
             const curlCodeEl = document.querySelector('#curl-content code');
-            const curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n  -H "Content-Type: application/json" \\\n  -d '${JSON.stringify(payload)}'`;
+            let curlSnippet;
+            
+            if (method === 'GET') {
+                // GET request (for /llm endpoint)
+                curlSnippet = `curl -X GET "${window.location.origin}${api}"`;
+            } else {
+                // POST request (for /crawl and /md endpoints)
+                curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n  -H "Content-Type: application/json" \\\n  -d '${JSON.stringify(payload)}'`;
+            }
 
             curlCodeEl.textContent = curlSnippet;
             curlCodeEl.className = 'bash hljs'; // Reset classes
@@ -536,20 +620,39 @@
 
             const endpointMap = {
                 crawl: '/crawl',
-            };
-
-            /*const endpointMap = {
-                crawl: '/crawl',
-                crawl_stream: '/crawl/stream',
+                // crawl_stream: '/crawl/stream',
                 md: '/md',
                 llm: '/llm'
-            };*/
+            };
 
             const api = endpointMap[endpoint];
-            const payload = {
-                urls,
-                ...advConfig
-            };
+            let payload;
+            
+            // Create appropriate payload based on endpoint type
+            if (endpoint === 'md') {
+                // Get values from the /md specific inputs
+                const filterType = document.getElementById('md-filter').value;
+                const query = document.getElementById('md-query').value.trim();
+                const cache = document.getElementById('md-cache').value;
+                
+                // MD endpoint expects: { url, f, q, c }
+                payload = {
+                    url: urls[0], // Take first URL
+                    f: filterType, // Lowercase filter type as required by server
+                    q: query || null, // Use the query if provided, otherwise null
+                    c: cache
+                };
+            } else if (endpoint === 'llm') {
+                // LLM endpoint has a different URL pattern and uses query params
+                // This will be handled directly in the fetch below
+                payload = null;
+            } else {
+                // Default payload for /crawl and /crawl/stream
+                payload = {
+                    urls,
+                    ...advConfig
+                };
+            }
 
             updateStatus('processing');
 
@@ -557,7 +660,18 @@
                 const startTime = performance.now();
                 let response, responseData;
 
-                if (endpoint === 'crawl_stream') {
+                if (endpoint === 'llm') {
+                    // Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
+                    const url = urls[0];
+                    const encodedUrl = encodeURIComponent(url);
+                    // Get the question from the LLM-specific input
+                    const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
+                    
+                    response = await fetch(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, {
+                        method: 'GET',
+                        headers: { 'Accept': 'application/json' }
+                    });
+                } else if (endpoint === 'crawl_stream') {
                     // Stream processing
                     response = await fetch(api, {
                         method: 'POST',
@@ -597,7 +711,7 @@
                     document.querySelector('#response-content code').className = 'json hljs'; // Reset classes
                     forceHighlightElement(document.querySelector('#response-content code'));
                 } else {
-                    // Regular request
+                    // Regular request (handles /crawl and /md)
                     response = await fetch(api, {
                         method: 'POST',
                         headers: { 'Content-Type': 'application/json' },
@@ -625,7 +739,16 @@
                 }
 
                 forceHighlightElement(document.querySelector('#response-content code'));
-                generateSnippets(api, payload);
+                
+                // For generateSnippets, handle the LLM case specially
+                if (endpoint === 'llm') {
+                    const url = urls[0];
+                    const encodedUrl = encodeURIComponent(url);
+                    const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
+                    generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
+                } else {
+                    generateSnippets(api, payload);
+                }
             } catch (error) {
                 console.error('Error:', error);
                 updateStatus('error');
@@ -807,9 +930,24 @@
                 });
             });
         }
+        
+        // Set the initial UI state for the currently selected endpoint, then wire up the copy buttons
+        function initUI() {
+            // Dispatch a synthetic 'change' event so the endpoint handler sets the initial panel visibility
+            const endpointSelect = document.getElementById('endpoint');
+            const event = new Event('change');
+            endpointSelect.dispatchEvent(event);
+            
+            // Initialize copy buttons
+            initCopyButtons();
+        }
 
-        // Call this in your DOMContentLoaded or initialization
-        initCopyButtons();
+        // Run initUI exactly once: defer while the document is parsing, otherwise run it now
+        if (document.readyState === 'loading') {
+            document.addEventListener('DOMContentLoaded', initUI);
+        } else {
+            initUI();
+        }
 
     </script>
 </body>
diff --git a/docs/examples/crypto_analysis_example.py b/docs/examples/crypto_analysis_example.py
index 10b9e7ab..c5537a93 100644
--- a/docs/examples/crypto_analysis_example.py
+++ b/docs/examples/crypto_analysis_example.py
@@ -391,12 +391,14 @@ async def main():
         # Process results
         raw_df = pd.DataFrame()
         for result in results:
-            if result.success and result.media["tables"]:
+            # Use the new tables field, falling back to media["tables"] for backward compatibility
+            tables = result.tables if hasattr(result, "tables") and result.tables else result.media.get("tables", [])
+            if result.success and tables:
                 # Extract primary market table
                 # DataFrame
                 raw_df = pd.DataFrame(
-                    result.media["tables"][0]["rows"],
-                    columns=result.media["tables"][0]["headers"],
+                    tables[0]["rows"],
+                    columns=tables[0]["headers"],
                 )
                 break
 
diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py
index 09625248..d989e030 100644
--- a/docs/examples/docker/demo_docker_api.py
+++ b/docs/examples/docker/demo_docker_api.py
@@ -4,6 +4,8 @@ import json
 import os
 import time
 from typing import List, Dict, Any, AsyncGenerator, Optional
+import textwrap          # ← new: for pretty code literals
+import urllib.parse  # ← needed for URL-safe /llm calls
 from dotenv import load_dotenv
 from rich.console import Console
 from rich.syntax import Syntax
@@ -969,13 +971,111 @@ async def demo_deep_with_ssl(client: httpx.AsyncClient):
             else:
                  console.print(f"  [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
 
+# 7a. Markdown helper endpoint
+async def demo_markdown_endpoint(client: httpx.AsyncClient):
+    """
+    One-shot helper around POST /md (JSON body with keys url, f, q, c).
+    Fetches PYTHON_URL with the "fit" filter and prints the first 500 chars of Markdown.
+    """
+    target_url = PYTHON_URL
+    payload = {"url": target_url, "f": "fit", "q": None, "c": "0"}  # no query; cache value "0"
+
+    console.rule("[bold blue]Demo 7a: /md Endpoint[/]", style="blue")
+    print_payload(payload)
+
+    try:
+        t0 = time.time()
+        resp = await client.post("/md", json=payload)
+        dt = time.time() - t0
+        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        resp.raise_for_status()  # non-success status -> reported by the except below
+        md = resp.json().get("markdown", "")
+        snippet = (md[:500] + "...") if len(md) > 500 else md  # truncate long output for display
+        console.print(Panel(snippet, title="Markdown snippet", border_style="cyan", expand=False))
+    except Exception as e:
+        console.print(f"[bold red]Error hitting /md:[/] {e}")
+
+# 7b. LLM QA helper endpoint
+async def demo_llm_endpoint(client: httpx.AsyncClient):
+    """
+    Quick QA round-trip with GET /llm/{encoded_url}?q=... against SIMPLE_URL.
+    Prints status, timing and the "answer" field; errors are printed, not raised.
+    """
+    page_url = SIMPLE_URL
+    question = "What is the title of this page?"
+
+    console.rule("[bold magenta]Demo 7b: /llm Endpoint[/]", style="magenta")
+    enc = urllib.parse.quote(page_url, safe="")  # path segment: percent-encode everything, incl. "/" (quote_plus is for query strings)
+    console.print(f"GET /llm/{enc}?q={question}")
+
+    try:
+        t0 = time.time()
+        resp = await client.get(f"/llm/{enc}", params={"q": question})  # httpx encodes the query string
+        dt = time.time() - t0
+        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        resp.raise_for_status()
+        answer = resp.json().get("answer", "")
+        console.print(Panel(answer or "No answer returned", title="LLM answer", border_style="magenta", expand=False))
+    except Exception as e:
+        console.print(f"[bold red]Error hitting /llm:[/] {e}")
+
+
+# 8. /config/dump helpers --------------------------------------------------
+
+async def demo_config_dump_valid(client: httpx.AsyncClient):
+    """
+    POST a single top-level CrawlerRunConfig(...) expression to /config/dump and pretty-print the JSON reply.
+    """
+    code_snippet = "CrawlerRunConfig(cache_mode='BYPASS', screenshot=True)"
+    payload = {"code": code_snippet}  # the expression is sent as source text under the "code" key
+
+    console.rule("[bold blue]Demo 8a: /config/dump (valid)[/]", style="blue")
+    print_payload(payload)
+
+    try:
+        t0 = time.time()
+        resp = await client.post("/config/dump", json=payload)
+        dt = time.time() - t0
+        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        resp.raise_for_status()  # non-success status -> reported by the except below
+        dump_json = resp.json()
+        console.print(Panel(Syntax(json.dumps(dump_json, indent=2), "json", theme="monokai"), title="Dump()", border_style="cyan"))
+    except Exception as e:
+        console.print(f"[bold red]Error in valid /config/dump call:[/] {e}")
+
+
+async def demo_config_dump_invalid(client: httpx.AsyncClient):
+    """
+    Purposely break the single-expression rule (two ';'-separated statements) to show the 400 parse error.
+    """
+    bad_code = textwrap.dedent("""
+        BrowserConfig(headless=True); CrawlerRunConfig()
+    """).strip()
+    payload = {"code": bad_code}
+
+    console.rule("[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta")
+    print_payload(payload)
+
+    try:
+        resp = await client.post("/config/dump", json=payload)
+        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]")
+        resp.raise_for_status()   # should throw -> except
+    except httpx.HTTPStatusError as e:  # the expected path for this demo
+        console.print("[cyan]Expected parse/validation failure captured:[/]")
+        try:
+            console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="fruity"), title="Error payload"))
+        except Exception:  # error body was not JSON: fall back to the raw text
+            console.print(e.response.text)
+    except Exception as e:
+        console.print(f"[bold red]Unexpected error during invalid test:[/] {e}")
+
 
 # --- Update Main Runner to include new demo ---
 async def main_demo():
     async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
         if not await check_server_health(client):
             return
-
+        
         # --- Run Demos ---
         await demo_basic_single_url(client)
         await demo_basic_multi_url(client)
@@ -1001,7 +1101,15 @@ async def main_demo():
         await demo_deep_with_css_extraction(client)
         await demo_deep_with_llm_extraction(client) # Skips if no common LLM key env var
         await demo_deep_with_proxy(client) # Skips if no PROXIES env var
-        await demo_deep_with_ssl(client) # Added the new demo
+        await demo_deep_with_ssl(client)   # Added the new demo
+
+        # --- Helper endpoints ---
+        await demo_markdown_endpoint(client)
+        await demo_llm_endpoint(client)
+
+        # --- /config/dump sanity checks ---
+        await demo_config_dump_valid(client)
+        await demo_config_dump_invalid(client)
 
         console.rule("[bold green]Demo Complete[/]", style="green")