Commit without API

This commit is contained in:
AHMET YILMAZ
2025-09-03 17:02:40 +08:00
parent 1eacea1d2d
commit 6a3b3e9d38
3 changed files with 74 additions and 14 deletions

View File

@@ -482,9 +482,14 @@ async def crawl(
): ):
""" """
Crawl a list of URLs and return the results as JSON. Crawl a list of URLs and return the results as JSON.
For streaming responses, use /crawl/stream endpoint.
""" """
if not crawl_request.urls: if not crawl_request.urls:
raise HTTPException(400, "At least one URL required") raise HTTPException(400, "At least one URL required")
# If the crawler config requests streaming, delegate to the streaming handler
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
if crawler_config.stream:
return await stream_process(crawl_request=crawl_request)
results = await handle_crawl_request( results = await handle_crawl_request(
urls=crawl_request.urls, urls=crawl_request.urls,
browser_config=crawl_request.browser_config, browser_config=crawl_request.browser_config,
@@ -506,12 +511,16 @@ async def crawl_stream(
): ):
if not crawl_request.urls: if not crawl_request.urls:
raise HTTPException(400, "At least one URL required") raise HTTPException(400, "At least one URL required")
return await stream_process(crawl_request=crawl_request)
async def stream_process(crawl_request: CrawlRequest):
crawler, gen = await handle_stream_crawl_request( crawler, gen = await handle_stream_crawl_request(
urls=crawl_request.urls, urls=crawl_request.urls,
browser_config=crawl_request.browser_config, browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config, crawler_config=crawl_request.crawler_config,
config=config, config=config,
) )
return StreamingResponse( return StreamingResponse(
stream_results(crawler, gen), stream_results(crawler, gen),
media_type="application/x-ndjson", media_type="application/x-ndjson",

View File

@@ -182,8 +182,8 @@
<div class="px-4 py-2 border-b border-border flex items-center"> <div class="px-4 py-2 border-b border-border flex items-center">
<h2 class="font-medium">Request Builder</h2> <h2 class="font-medium">Request Builder</h2>
<select id="endpoint" class="ml-auto bg-dark border border-border rounded px-2 py-1 text-sm"> <select id="endpoint" class="ml-auto bg-dark border border-border rounded px-2 py-1 text-sm">
<option value="crawl">/crawl (batch)</option> <option value="crawl">/crawl (supports streaming)</option>
<option value="crawl_stream">/crawl/stream</option> <option value="crawl_stream">/crawl/stream (legacy)</option>
<option value="md">/md</option> <option value="md">/md</option>
<option value="llm">/llm</option> <option value="llm">/llm</option>
</select> </select>
@@ -371,7 +371,7 @@
<div class="flex items-center"> <div class="flex items-center">
<input id="st-stream" type="checkbox" class="mr-2"> <input id="st-stream" type="checkbox" class="mr-2">
<label for="st-stream" class="text-sm">Use /crawl/stream</label> <label for="st-stream" class="text-sm">Enable streaming mode</label>
<button id="st-run" <button id="st-run"
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium"> class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
Run Stress Test Run Stress Test
@@ -596,6 +596,14 @@
forceHighlightElement(curlCodeEl); forceHighlightElement(curlCodeEl);
} }
// Decide whether the payload asks for a streaming response: either via
// crawler_config.params.stream or a top-level stream flag. Accepts the
// boolean true or the string "true" (any casing) as a positive flag.
function shouldUseStream(payload) {
    const isTruthyFlag = function (value) {
        if (value === true) return true;
        return typeof value === 'string' && value.toLowerCase() === 'true';
    };
    if (!payload) return false;
    const params = payload.crawler_config && payload.crawler_config.params;
    const nestedFlag = params ? params.stream : undefined;
    return isTruthyFlag(nestedFlag) || isTruthyFlag(payload.stream);
}
// Main run function // Main run function
async function runCrawl() { async function runCrawl() {
const endpoint = document.getElementById('endpoint').value; const endpoint = document.getElementById('endpoint').value;
@@ -611,16 +619,24 @@
: { browser_config: cfgJson }; : { browser_config: cfgJson };
} }
} catch (err) { } catch (err) {
updateStatus('error'); const codeText = cm.getValue();
document.querySelector('#response-content code').textContent = const streamFlag = /stream\s*=\s*True/i.test(codeText);
JSON.stringify({ error: err.message }, null, 2); const isCrawlEndpoint = document.getElementById('endpoint').value === 'crawl';
forceHighlightElement(document.querySelector('#response-content code')); if (isCrawlEndpoint && streamFlag) {
return; // stop run // Fallback: proceed with minimal config only for stream
advConfig = { crawler_config: { stream: true } };
} else {
updateStatus('error');
document.querySelector('#response-content code').textContent =
JSON.stringify({ error: err.message }, null, 2);
forceHighlightElement(document.querySelector('#response-content code'));
return; // stop run
}
} }
const endpointMap = { const endpointMap = {
crawl: '/crawl', crawl: '/crawl',
// crawl_stream: '/crawl/stream', crawl_stream: '/crawl/stream', // Keep for backward compatibility
md: '/md', md: '/md',
llm: '/llm' llm: '/llm'
}; };
@@ -647,7 +663,7 @@
// This will be handled directly in the fetch below // This will be handled directly in the fetch below
payload = null; payload = null;
} else { } else {
// Default payload for /crawl and /crawl/stream // Default payload for /crawl (supports both streaming and batch modes)
payload = { payload = {
urls, urls,
...advConfig ...advConfig
@@ -659,6 +675,7 @@
try { try {
const startTime = performance.now(); const startTime = performance.now();
let response, responseData; let response, responseData;
const useStreamOverride = (endpoint === 'crawl') && shouldUseStream(payload);
if (endpoint === 'llm') { if (endpoint === 'llm') {
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query} // Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
@@ -681,8 +698,8 @@
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2); document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
document.querySelector('#response-content code').className = 'json hljs'; document.querySelector('#response-content code').className = 'json hljs';
forceHighlightElement(document.querySelector('#response-content code')); forceHighlightElement(document.querySelector('#response-content code'));
} else if (endpoint === 'crawl_stream') { } else if (endpoint === 'crawl_stream' || useStreamOverride) {
// Stream processing // Stream processing - now handled directly by /crawl endpoint
response = await fetch(api, { response = await fetch(api, {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
@@ -757,6 +774,7 @@
const question = document.getElementById('llm-question').value.trim() || "What is this page about?"; const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET'); generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
} else { } else {
// Use the same API endpoint for both streaming and non-streaming
generateSnippets(api, payload); generateSnippets(api, payload);
} }
} catch (error) { } catch (error) {
@@ -786,7 +804,7 @@
document.getElementById('stress-avg-time').textContent = '0'; document.getElementById('stress-avg-time').textContent = '0';
document.getElementById('stress-peak-mem').textContent = '0'; document.getElementById('stress-peak-mem').textContent = '0';
const api = useStream ? '/crawl/stream' : '/crawl'; const api = '/crawl'; // Always use /crawl - backend handles streaming internally
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`); const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
const chunks = []; const chunks = [];

View File

@@ -143,7 +143,40 @@ class TestCrawlEndpoints:
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"] assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field # We don't specify a markdown generator in this test, so don't make assumptions about markdown field
# It might be null, missing, or populated depending on the server's default behavior # It might be null, missing, or populated depending on the server's default behavior
    async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient):
        """Test that /crawl endpoint handles stream=True directly without redirect."""
        payload = {
            "urls": [SIMPLE_HTML_URL],
            "browser_config": {
                "type": "BrowserConfig",
                "params": {
                    "headless": True,
                }
            },
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {
                    "stream": True,  # Set stream to True for direct streaming
                    "screenshot": False,
                    "cache_mode": CacheMode.BYPASS.value
                }
            }
        }
        # Send a request to the /crawl endpoint - should handle streaming directly
        async with async_client.stream("POST", "/crawl", json=payload) as response:
            assert response.status_code == 200
            # NDJSON content type shows the server took the streaming path
            # rather than returning a batch JSON body.
            assert response.headers["content-type"] == "application/x-ndjson"
            # NOTE(review): assumes the server sets this custom header on
            # streaming responses — confirm against the endpoint implementation.
            assert response.headers.get("x-stream-status") == "active"
            results = await process_streaming_response(response)
            # One URL in, exactly one streamed result out.
            assert len(results) == 1
            result = results[0]
            await assert_crawl_result_structure(result)
            assert result["success"] is True
            assert result["url"] == SIMPLE_HTML_URL
            assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient): async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/stream with a single URL and simple config values.""" """Test /crawl/stream with a single URL and simple config values."""
payload = { payload = {