Commit without API

2025-09-03 17:02:40 +08:00
parent 1eacea1d2d
commit 6a3b3e9d38
3 changed files with 74 additions and 14 deletions
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -482,9 +482,14 @@ async def crawl(
 ):
    """
    Crawl a list of URLs and return the results as JSON.
+    For streaming responses, use /crawl/stream endpoint.
    """
    if not crawl_request.urls:
        raise HTTPException(400, "At least one URL required")
+    # Check whether it is a redirection for a streaming request
+    crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
+    if crawler_config.stream:
+        return await stream_process(crawl_request=crawl_request)
    results = await handle_crawl_request(
        urls=crawl_request.urls,
        browser_config=crawl_request.browser_config,
@@ -506,6 +511,10 @@ async def crawl_stream(
 ):
    if not crawl_request.urls:
        raise HTTPException(400, "At least one URL required")
+
+    return await stream_process(crawl_request=crawl_request)
+
+async def stream_process(crawl_request: CrawlRequest):
    crawler, gen = await handle_stream_crawl_request(
        urls=crawl_request.urls,
        browser_config=crawl_request.browser_config,
--- a/deploy/docker/static/playground/index.html
+++ b/deploy/docker/static/playground/index.html
@@ -182,8 +182,8 @@
            <div class="px-4 py-2 border-b border-border flex items-center">
                <h2 class="font-medium">Request Builder</h2>
                <select id="endpoint" class="ml-auto bg-dark border border-border rounded px-2 py-1 text-sm">
-                    <option value="crawl">/crawl (batch)</option>
-                    <option value="crawl_stream">/crawl/stream</option>
+                    <option value="crawl">/crawl (supports streaming)</option>
+                    <option value="crawl_stream">/crawl/stream (legacy)</option>
                    <option value="md">/md</option>
                    <option value="llm">/llm</option>
                </select>
@@ -371,7 +371,7 @@

                <div class="flex items-center">
                    <input id="st-stream" type="checkbox" class="mr-2">
-                    <label for="st-stream" class="text-sm">Use /crawl/stream</label>
+                    <label for="st-stream" class="text-sm">Enable streaming mode</label>
                    <button id="st-run"
                        class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
                        Run Stress Test
@@ -596,6 +596,14 @@
            forceHighlightElement(curlCodeEl);
        }

+        // Detect if stream is requested inside payload
+        function shouldUseStream(payload) {
+            const toBool = (v) => v === true || (typeof v === 'string' && v.toLowerCase() === 'true');
+            const fromCrawler = payload && payload.crawler_config && payload.crawler_config.params && payload.crawler_config.params.stream;
+            const direct = payload && payload.stream;
+            return toBool(fromCrawler) || toBool(direct);
+        }
+
        // Main run function
        async function runCrawl() {
            const endpoint = document.getElementById('endpoint').value;
@@ -611,16 +619,24 @@
                        : { browser_config: cfgJson };
                }
            } catch (err) {
+                const codeText = cm.getValue();
+                const streamFlag = /stream\s*=\s*True/i.test(codeText);
+                const isCrawlEndpoint = document.getElementById('endpoint').value === 'crawl';
+                if (isCrawlEndpoint && streamFlag) {
+                    // Fallback: proceed with minimal config only for stream
+                    advConfig = { crawler_config: { stream: true } };
+                } else {
                    updateStatus('error');
                    document.querySelector('#response-content code').textContent =
                        JSON.stringify({ error: err.message }, null, 2);
                    forceHighlightElement(document.querySelector('#response-content code'));
                    return; // stop run
                }
+            }

            const endpointMap = {
                crawl: '/crawl',
-                // crawl_stream: '/crawl/stream',
+                crawl_stream: '/crawl/stream', // Keep for backward compatibility
                md: '/md',
                llm: '/llm'
            };
@@ -647,7 +663,7 @@
                // This will be handled directly in the fetch below
                payload = null;
            } else {
-                // Default payload for /crawl and /crawl/stream
+                // Default payload for /crawl (supports both streaming and batch modes)
                payload = {
                    urls,
                    ...advConfig
@@ -659,6 +675,7 @@
            try {
                const startTime = performance.now();
                let response, responseData;
+                const useStreamOverride = (endpoint === 'crawl') && shouldUseStream(payload);

                if (endpoint === 'llm') {
                    // Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
@@ -681,8 +698,8 @@
                    document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
                    document.querySelector('#response-content code').className = 'json hljs';
                    forceHighlightElement(document.querySelector('#response-content code'));
-                } else if (endpoint === 'crawl_stream') {
-                    // Stream processing
+                } else if (endpoint === 'crawl_stream' || useStreamOverride) {
+                    // Stream processing - now handled directly by /crawl endpoint
                    response = await fetch(api, {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
@@ -757,6 +774,7 @@
                    const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
                    generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
                } else {
+                    // Use the same API endpoint for both streaming and non-streaming
                    generateSnippets(api, payload);
                }
            } catch (error) {
@@ -786,7 +804,7 @@
            document.getElementById('stress-avg-time').textContent = '0';
            document.getElementById('stress-peak-mem').textContent = '0';

-            const api = useStream ? '/crawl/stream' : '/crawl';
+            const api = '/crawl'; // Always use /crawl - backend handles streaming internally
            const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
            const chunks = [];

--- a/tests/docker/test_server_requests.py
+++ b/tests/docker/test_server_requests.py
@@ -143,7 +143,40 @@ class TestCrawlEndpoints:
        assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
        # We don't specify a markdown generator in this test, so don't make assumptions about markdown field
        # It might be null, missing, or populated depending on the server's default behavior
+    async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient):
+        """POST /crawl with stream=True in crawler_config and expect an NDJSON stream back from /crawl itself."""
+        payload = {
+            "urls": [SIMPLE_HTML_URL],
+            "browser_config": {
+                "type": "BrowserConfig",
+                "params": {
+                    "headless": True,
+                }
+            },
+            "crawler_config": {
+                "type": "CrawlerRunConfig", 
+                "params": {
+                    "stream": True,  # Flag under test: asks /crawl for a streamed response
+                    "screenshot": False,
+                    "cache_mode": CacheMode.BYPASS.value
+                }
+            }
+        }

+        # POST to /crawl (not /crawl/stream); the response itself must carry the streaming headers checked below
+        async with async_client.stream("POST", "/crawl", json=payload) as response:
+            assert response.status_code == 200
+            assert response.headers["content-type"] == "application/x-ndjson"
+            assert response.headers.get("x-stream-status") == "active"
+
+            results = await process_streaming_response(response)
+
+            assert len(results) == 1
+            result = results[0]
+            await assert_crawl_result_structure(result)
+            assert result["success"] is True
+            assert result["url"] == SIMPLE_HTML_URL
+            assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
    async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
        """Test /crawl/stream with a single URL and simple config values."""
        payload = {