diff --git a/CHANGELOG.md b/CHANGELOG.md index 9205c0b0..16f96f47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,15 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.6.1] - 2025-04-24 + +### Added +- New dedicated `tables` field in `CrawlResult` model for better table extraction handling +- Updated crypto_analysis_example.py to use the new tables field with backward compatibility + +### Changed +- Improved playground UI in Docker deployment with better endpoint handling and UI feedback + ## [0.6.0] ‑ 2025‑04‑22 ### Added diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index ee78de23..fe6f9b8a 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,3 +1,3 @@ # crawl4ai/_version.py -__version__ = "0.6.0" +__version__ = "0.6.1" diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 98acfd12..bb3765c2 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -496,11 +496,13 @@ class AsyncWebCrawler: cleaned_html = sanitize_input_encode( result.get("cleaned_html", "")) media = result.get("media", {}) + tables = media.pop("tables", []) if isinstance(media, dict) else [] links = result.get("links", {}) metadata = result.get("metadata", {}) else: cleaned_html = sanitize_input_encode(result.cleaned_html) media = result.media.model_dump() + tables = media.pop("tables", []) links = result.links.model_dump() metadata = result.metadata @@ -627,6 +629,7 @@ class AsyncWebCrawler: cleaned_html=cleaned_html, markdown=markdown_result, media=media, + tables=tables, # NEW links=links, metadata=metadata, screenshot=screenshot_data, diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 32cca3ed..64270b77 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, HttpUrl, PrivateAttr +from pydantic import BaseModel, HttpUrl, PrivateAttr, Field from typing import List, Dict, Optional, Callable, Awaitable, Union, Any from typing import AsyncGenerator from typing import Generic, TypeVar @@ -150,6 +150,7 @@ class CrawlResult(BaseModel): redirected_url: Optional[str] = None network_requests: Optional[List[Dict[str, Any]]] = None console_messages: Optional[List[Dict[str, Any]]] = None + tables: List[Dict] = Field(default_factory=list) # NEW – [{headers,rows,caption,summary}] class Config: arbitrary_types_allowed = True diff --git a/deploy/docker/static/playground/index.html b/deploy/docker/static/playground/index.html index 8f0e2bdd..7af96f1f 100644 --- a/deploy/docker/static/playground/index.html +++ b/deploy/docker/static/playground/index.html @@ -193,7 +193,48 @@ https://example.com - + + + /md Options + + + Filter Type + + fit - Adaptive content filtering + raw - No filtering + bm25 - BM25 keyword relevance + llm - LLM-based filtering + + + + Query (for BM25/LLM filters) + + + + Cache Mode + + Write-Only (0) + Enabled (1) + + + + + + + + /llm Options + + + Question + + + + + + + Advanced Config (Python → auto‑JSON) @@ -437,6 +478,33 @@ cm.setValue(TEMPLATES[e.target.value]); document.getElementById('cfg-status').textContent = ''; }); + + // Handle endpoint selection change to show appropriate options + document.getElementById('endpoint').addEventListener('change', function(e) { + const endpoint = e.target.value; + const mdOptions = document.getElementById('md-options'); + const llmOptions = document.getElementById('llm-options'); + const advConfig = document.getElementById('adv-config'); + + // Hide all option sections first + mdOptions.classList.add('hidden'); + llmOptions.classList.add('hidden'); + advConfig.classList.add('hidden'); + + // Show the appropriate section based on endpoint + if (endpoint === 'md') { + mdOptions.classList.remove('hidden'); + // Auto-open the /md options + mdOptions.setAttribute('open', ''); + } else if (endpoint === 'llm') { + llmOptions.classList.remove('hidden'); + // Auto-open the /llm options + llmOptions.setAttribute('open', ''); + } else { + // For /crawl endpoints, show the advanced config + advConfig.classList.remove('hidden'); + } + }); async function pyConfigToJson() { const code = cm.getValue().trim(); @@ -494,10 +562,18 @@ } // Generate code snippets - function generateSnippets(api, payload) { + function generateSnippets(api, payload, method = 'POST') { // Python snippet const pyCodeEl = document.querySelector('#python-content code'); - const pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`; + let pySnippet; + + if (method === 'GET') { + // GET request (for /llm endpoint) + pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.get(\n "${window.location.origin}${api}"\n )\n return response.json()`; + } else { + // POST request (for /crawl and /md endpoints) + pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`; + } pyCodeEl.textContent = pySnippet; pyCodeEl.className = 'python hljs'; // Reset classes @@ -505,7 +581,15 @@ // cURL snippet const curlCodeEl = document.querySelector('#curl-content code'); - const curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`; + let curlSnippet; + + if (method === 'GET') { + // GET request (for /llm endpoint) + curlSnippet = `curl -X GET "${window.location.origin}${api}"`; + } else { + // POST request (for /crawl and /md endpoints) + curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`; + } curlCodeEl.textContent = curlSnippet; curlCodeEl.className = 'bash hljs'; // Reset classes @@ -536,20 +620,39 @@ const endpointMap = { crawl: '/crawl', - }; - - /*const endpointMap = { - crawl: '/crawl', - crawl_stream: '/crawl/stream', + // crawl_stream: '/crawl/stream', md: '/md', llm: '/llm' - };*/ + }; const api = endpointMap[endpoint]; - const payload = { - urls, - ...advConfig - }; + let payload; + + // Create appropriate payload based on endpoint type + if (endpoint === 'md') { + // Get values from the /md specific inputs + const filterType = document.getElementById('md-filter').value; + const query = document.getElementById('md-query').value.trim(); + const cache = document.getElementById('md-cache').value; + + // MD endpoint expects: { url, f, q, c } + payload = { + url: urls[0], // Take first URL + f: filterType, // Lowercase filter type as required by server + q: query || null, // Use the query if provided, otherwise null + c: cache + }; + } else if (endpoint === 'llm') { + // LLM endpoint has a different URL pattern and uses query params + // This will be handled directly in the fetch below + payload = null; + } else { + // Default payload for /crawl and /crawl/stream + payload = { + urls, + ...advConfig + }; + } updateStatus('processing'); @@ -557,7 +660,18 @@ const startTime = performance.now(); let response, responseData; - if (endpoint === 'crawl_stream') { + if (endpoint === 'llm') { + // Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query} + const url = urls[0]; + const encodedUrl = encodeURIComponent(url); + // Get the question from the LLM-specific input + const question = document.getElementById('llm-question').value.trim() || "What is this page about?"; + + response = await fetch(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, { + method: 'GET', + headers: { 'Accept': 'application/json' } + }); + } else if (endpoint === 'crawl_stream') { // Stream processing response = await fetch(api, { method: 'POST', @@ -597,7 +711,7 @@ document.querySelector('#response-content code').className = 'json hljs'; // Reset classes forceHighlightElement(document.querySelector('#response-content code')); } else { - // Regular request + // Regular request (handles /crawl and /md) response = await fetch(api, { method: 'POST', headers: { 'Content-Type': 'application/json' }, @@ -625,7 +739,16 @@ } forceHighlightElement(document.querySelector('#response-content code')); - generateSnippets(api, payload); + + // For generateSnippets, handle the LLM case specially + if (endpoint === 'llm') { + const url = urls[0]; + const encodedUrl = encodeURIComponent(url); + const question = document.getElementById('llm-question').value.trim() || "What is this page about?"; + generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET'); + } else { + generateSnippets(api, payload); + } } catch (error) { console.error('Error:', error); updateStatus('error'); @@ -807,9 +930,24 @@ }); }); } + + // Function to initialize UI based on selected endpoint + function initUI() { + // Trigger the endpoint change handler to set initial UI state + const endpointSelect = document.getElementById('endpoint'); + const event = new Event('change'); + endpointSelect.dispatchEvent(event); + + // Initialize copy buttons + initCopyButtons(); + } - // Call this in your DOMContentLoaded or initialization - initCopyButtons(); + // Initialize on page load + document.addEventListener('DOMContentLoaded', initUI); + // Also call it immediately in case the script runs after DOM is already loaded + if (document.readyState !== 'loading') { + initUI(); + }