Commit without API

This commit is contained in:
AHMET YILMAZ
2025-09-03 17:02:40 +08:00
parent 1eacea1d2d
commit 6a3b3e9d38
3 changed files with 74 additions and 14 deletions

View File

@@ -482,9 +482,14 @@ async def crawl(
): ):
""" """
Crawl a list of URLs and return the results as JSON. Crawl a list of URLs and return the results as JSON.
For streaming responses, use /crawl/stream endpoint.
""" """
if not crawl_request.urls: if not crawl_request.urls:
raise HTTPException(400, "At least one URL required") raise HTTPException(400, "At least one URL required")
# If the crawler config requests streaming, delegate to the streaming handler
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
if crawler_config.stream:
return await stream_process(crawl_request=crawl_request)
results = await handle_crawl_request( results = await handle_crawl_request(
urls=crawl_request.urls, urls=crawl_request.urls,
browser_config=crawl_request.browser_config, browser_config=crawl_request.browser_config,
@@ -506,12 +511,16 @@ async def crawl_stream(
): ):
if not crawl_request.urls: if not crawl_request.urls:
raise HTTPException(400, "At least one URL required") raise HTTPException(400, "At least one URL required")
return await stream_process(crawl_request=crawl_request)
async def stream_process(crawl_request: CrawlRequest):
crawler, gen = await handle_stream_crawl_request( crawler, gen = await handle_stream_crawl_request(
urls=crawl_request.urls, urls=crawl_request.urls,
browser_config=crawl_request.browser_config, browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config, crawler_config=crawl_request.crawler_config,
config=config, config=config,
) )
return StreamingResponse( return StreamingResponse(
stream_results(crawler, gen), stream_results(crawler, gen),
media_type="application/x-ndjson", media_type="application/x-ndjson",

View File

@@ -182,8 +182,8 @@
<div class="px-4 py-2 border-b border-border flex items-center"> <div class="px-4 py-2 border-b border-border flex items-center">
<h2 class="font-medium">Request Builder</h2> <h2 class="font-medium">Request Builder</h2>
<select id="endpoint" class="ml-auto bg-dark border border-border rounded px-2 py-1 text-sm"> <select id="endpoint" class="ml-auto bg-dark border border-border rounded px-2 py-1 text-sm">
<option value="crawl">/crawl (batch)</option> <option value="crawl">/crawl (supports streaming)</option>
<option value="crawl_stream">/crawl/stream</option> <option value="crawl_stream">/crawl/stream (legacy)</option>
<option value="md">/md</option> <option value="md">/md</option>
<option value="llm">/llm</option> <option value="llm">/llm</option>
</select> </select>
@@ -371,7 +371,7 @@
<div class="flex items-center"> <div class="flex items-center">
<input id="st-stream" type="checkbox" class="mr-2"> <input id="st-stream" type="checkbox" class="mr-2">
<label for="st-stream" class="text-sm">Use /crawl/stream</label> <label for="st-stream" class="text-sm">Enable streaming mode</label>
<button id="st-run" <button id="st-run"
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium"> class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
Run Stress Test Run Stress Test
@@ -596,6 +596,14 @@
forceHighlightElement(curlCodeEl); forceHighlightElement(curlCodeEl);
} }
// Decide whether the payload asks for a streaming response: either via
// crawler_config.params.stream or a top-level stream flag. Accepts the
// boolean true or the string "true" (any casing) as a positive flag.
function shouldUseStream(payload) {
    const isTruthyFlag = function (value) {
        if (value === true) return true;
        return typeof value === 'string' && value.toLowerCase() === 'true';
    };
    if (!payload) return false;
    const params = payload.crawler_config && payload.crawler_config.params;
    const nestedFlag = params ? params.stream : undefined;
    return isTruthyFlag(nestedFlag) || isTruthyFlag(payload.stream);
}
// Main run function // Main run function
async function runCrawl() { async function runCrawl() {
const endpoint = document.getElementById('endpoint').value; const endpoint = document.getElementById('endpoint').value;
@@ -611,16 +619,24 @@
: { browser_config: cfgJson }; : { browser_config: cfgJson };
} }
} catch (err) { } catch (err) {
updateStatus('error'); const codeText = cm.getValue();
document.querySelector('#response-content code').textContent = const streamFlag = /stream\s*=\s*True/i.test(codeText);
JSON.stringify({ error: err.message }, null, 2); const isCrawlEndpoint = document.getElementById('endpoint').value === 'crawl';
forceHighlightElement(document.querySelector('#response-content code')); if (isCrawlEndpoint && streamFlag) {
return; // stop run // Fallback: proceed with minimal config only for stream
advConfig = { crawler_config: { stream: true } };
} else {
updateStatus('error');
document.querySelector('#response-content code').textContent =
JSON.stringify({ error: err.message }, null, 2);
forceHighlightElement(document.querySelector('#response-content code'));
return; // stop run
}
} }
const endpointMap = { const endpointMap = {
crawl: '/crawl', crawl: '/crawl',
// crawl_stream: '/crawl/stream', crawl_stream: '/crawl/stream', // Keep for backward compatibility
md: '/md', md: '/md',
llm: '/llm' llm: '/llm'
}; };
@@ -647,7 +663,7 @@
// This will be handled directly in the fetch below // This will be handled directly in the fetch below
payload = null; payload = null;
} else { } else {
// Default payload for /crawl and /crawl/stream // Default payload for /crawl (supports both streaming and batch modes)
payload = { payload = {
urls, urls,
...advConfig ...advConfig
@@ -659,6 +675,7 @@
try { try {
const startTime = performance.now(); const startTime = performance.now();
let response, responseData; let response, responseData;
const useStreamOverride = (endpoint === 'crawl') && shouldUseStream(payload);
if (endpoint === 'llm') { if (endpoint === 'llm') {
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query} // Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
@@ -681,8 +698,8 @@
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2); document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
document.querySelector('#response-content code').className = 'json hljs'; document.querySelector('#response-content code').className = 'json hljs';
forceHighlightElement(document.querySelector('#response-content code')); forceHighlightElement(document.querySelector('#response-content code'));
} else if (endpoint === 'crawl_stream') { } else if (endpoint === 'crawl_stream' || useStreamOverride) {
// Stream processing // Stream processing - now handled directly by /crawl endpoint
response = await fetch(api, { response = await fetch(api, {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
@@ -757,6 +774,7 @@
const question = document.getElementById('llm-question').value.trim() || "What is this page about?"; const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET'); generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
} else { } else {
// Use the same API endpoint for both streaming and non-streaming
generateSnippets(api, payload); generateSnippets(api, payload);
} }
} catch (error) { } catch (error) {
@@ -786,7 +804,7 @@
document.getElementById('stress-avg-time').textContent = '0'; document.getElementById('stress-avg-time').textContent = '0';
document.getElementById('stress-peak-mem').textContent = '0'; document.getElementById('stress-peak-mem').textContent = '0';
const api = useStream ? '/crawl/stream' : '/crawl'; const api = '/crawl'; // Always use /crawl - backend handles streaming internally
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`); const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
const chunks = []; const chunks = [];

View File

@@ -143,7 +143,40 @@ class TestCrawlEndpoints:
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"] assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field # We don't specify a markdown generator in this test, so don't make assumptions about markdown field
# It might be null, missing, or populated depending on the server's default behavior # It might be null, missing, or populated depending on the server's default behavior
    async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient):
        """Test that /crawl endpoint handles stream=True directly without redirect."""
        payload = {
            "urls": [SIMPLE_HTML_URL],
            "browser_config": {
                "type": "BrowserConfig",
                "params": {
                    "headless": True,
                }
            },
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {
                    "stream": True,  # Set stream to True for direct streaming
                    "screenshot": False,
                    "cache_mode": CacheMode.BYPASS.value
                }
            }
        }
        # Send a request to the /crawl endpoint - should handle streaming directly
        async with async_client.stream("POST", "/crawl", json=payload) as response:
            assert response.status_code == 200
            # NDJSON content type shows the server took the streaming path
            # rather than returning a batch JSON body.
            assert response.headers["content-type"] == "application/x-ndjson"
            # NOTE(review): assumes the server sets this custom header on
            # streaming responses — confirm against the endpoint implementation.
            assert response.headers.get("x-stream-status") == "active"
            results = await process_streaming_response(response)
            # One URL in, exactly one streamed result out.
            assert len(results) == 1
            result = results[0]
            await assert_crawl_result_structure(result)
            assert result["success"] is True
            assert result["url"] == SIMPLE_HTML_URL
            assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient): async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/stream with a single URL and simple config values.""" """Test /crawl/stream with a single URL and simple config values."""
payload = { payload = {