Commit without API
This commit is contained in:
@@ -482,9 +482,14 @@ async def crawl(
|
|||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Crawl a list of URLs and return the results as JSON.
|
Crawl a list of URLs and return the results as JSON.
|
||||||
|
For streaming responses, use /crawl/stream endpoint.
|
||||||
"""
|
"""
|
||||||
if not crawl_request.urls:
|
if not crawl_request.urls:
|
||||||
raise HTTPException(400, "At least one URL required")
|
raise HTTPException(400, "At least one URL required")
|
||||||
|
# Check whether it is a redirection for a streaming request
|
||||||
|
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
|
||||||
|
if crawler_config.stream:
|
||||||
|
return await stream_process(crawl_request=crawl_request)
|
||||||
results = await handle_crawl_request(
|
results = await handle_crawl_request(
|
||||||
urls=crawl_request.urls,
|
urls=crawl_request.urls,
|
||||||
browser_config=crawl_request.browser_config,
|
browser_config=crawl_request.browser_config,
|
||||||
@@ -506,12 +511,16 @@ async def crawl_stream(
|
|||||||
):
|
):
|
||||||
if not crawl_request.urls:
|
if not crawl_request.urls:
|
||||||
raise HTTPException(400, "At least one URL required")
|
raise HTTPException(400, "At least one URL required")
|
||||||
|
|
||||||
|
return await stream_process(crawl_request=crawl_request)
|
||||||
|
|
||||||
|
async def stream_process(crawl_request: CrawlRequest):
|
||||||
crawler, gen = await handle_stream_crawl_request(
|
crawler, gen = await handle_stream_crawl_request(
|
||||||
urls=crawl_request.urls,
|
urls=crawl_request.urls,
|
||||||
browser_config=crawl_request.browser_config,
|
browser_config=crawl_request.browser_config,
|
||||||
crawler_config=crawl_request.crawler_config,
|
crawler_config=crawl_request.crawler_config,
|
||||||
config=config,
|
config=config,
|
||||||
)
|
)
|
||||||
return StreamingResponse(
|
return StreamingResponse(
|
||||||
stream_results(crawler, gen),
|
stream_results(crawler, gen),
|
||||||
media_type="application/x-ndjson",
|
media_type="application/x-ndjson",
|
||||||
|
|||||||
@@ -182,8 +182,8 @@
|
|||||||
<div class="px-4 py-2 border-b border-border flex items-center">
|
<div class="px-4 py-2 border-b border-border flex items-center">
|
||||||
<h2 class="font-medium">Request Builder</h2>
|
<h2 class="font-medium">Request Builder</h2>
|
||||||
<select id="endpoint" class="ml-auto bg-dark border border-border rounded px-2 py-1 text-sm">
|
<select id="endpoint" class="ml-auto bg-dark border border-border rounded px-2 py-1 text-sm">
|
||||||
<option value="crawl">/crawl (batch)</option>
|
<option value="crawl">/crawl (supports streaming)</option>
|
||||||
<option value="crawl_stream">/crawl/stream</option>
|
<option value="crawl_stream">/crawl/stream (legacy)</option>
|
||||||
<option value="md">/md</option>
|
<option value="md">/md</option>
|
||||||
<option value="llm">/llm</option>
|
<option value="llm">/llm</option>
|
||||||
</select>
|
</select>
|
||||||
@@ -371,7 +371,7 @@
|
|||||||
|
|
||||||
<div class="flex items-center">
|
<div class="flex items-center">
|
||||||
<input id="st-stream" type="checkbox" class="mr-2">
|
<input id="st-stream" type="checkbox" class="mr-2">
|
||||||
<label for="st-stream" class="text-sm">Use /crawl/stream</label>
|
<label for="st-stream" class="text-sm">Enable streaming mode</label>
|
||||||
<button id="st-run"
|
<button id="st-run"
|
||||||
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
|
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
|
||||||
Run Stress Test
|
Run Stress Test
|
||||||
@@ -596,6 +596,14 @@
|
|||||||
forceHighlightElement(curlCodeEl);
|
forceHighlightElement(curlCodeEl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Detect whether the request payload asks for a streaming response.
//
// The flag is looked up in three places:
//   - payload.crawler_config.params.stream  (serialized { type, params } form)
//   - payload.crawler_config.stream         (flat form, e.g. the minimal
//                                            `{ crawler_config: { stream: true } }`
//                                            config runCrawl() falls back to)
//   - payload.stream                        (top-level flag)
//
// A value counts as "on" when it is the boolean `true` or the string "true"
// in any letter case. Anything else (including a missing payload) yields false.
function shouldUseStream(payload) {
    const toBool = (v) => v === true || (typeof v === 'string' && v.toLowerCase() === 'true');

    // A null/undefined/non-object payload cannot carry a stream flag.
    if (!payload || typeof payload !== 'object') {
        return false;
    }

    const crawlerCfg = payload.crawler_config;
    const fromParams = crawlerCfg && crawlerCfg.params && crawlerCfg.params.stream;
    // The flat shape was previously ignored, so a config written as
    // `{ crawler_config: { stream: true } }` never selected the streaming branch.
    const fromFlat = crawlerCfg && crawlerCfg.stream;

    return toBool(fromParams) || toBool(fromFlat) || toBool(payload.stream);
}
|
||||||
|
|
||||||
// Main run function
|
// Main run function
|
||||||
async function runCrawl() {
|
async function runCrawl() {
|
||||||
const endpoint = document.getElementById('endpoint').value;
|
const endpoint = document.getElementById('endpoint').value;
|
||||||
@@ -611,16 +619,24 @@
|
|||||||
: { browser_config: cfgJson };
|
: { browser_config: cfgJson };
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
updateStatus('error');
|
const codeText = cm.getValue();
|
||||||
document.querySelector('#response-content code').textContent =
|
const streamFlag = /stream\s*=\s*True/i.test(codeText);
|
||||||
JSON.stringify({ error: err.message }, null, 2);
|
const isCrawlEndpoint = document.getElementById('endpoint').value === 'crawl';
|
||||||
forceHighlightElement(document.querySelector('#response-content code'));
|
if (isCrawlEndpoint && streamFlag) {
|
||||||
return; // stop run
|
// Fallback: proceed with minimal config only for stream
|
||||||
|
advConfig = { crawler_config: { stream: true } };
|
||||||
|
} else {
|
||||||
|
updateStatus('error');
|
||||||
|
document.querySelector('#response-content code').textContent =
|
||||||
|
JSON.stringify({ error: err.message }, null, 2);
|
||||||
|
forceHighlightElement(document.querySelector('#response-content code'));
|
||||||
|
return; // stop run
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const endpointMap = {
|
const endpointMap = {
|
||||||
crawl: '/crawl',
|
crawl: '/crawl',
|
||||||
// crawl_stream: '/crawl/stream',
|
crawl_stream: '/crawl/stream', // Keep for backward compatibility
|
||||||
md: '/md',
|
md: '/md',
|
||||||
llm: '/llm'
|
llm: '/llm'
|
||||||
};
|
};
|
||||||
@@ -647,7 +663,7 @@
|
|||||||
// This will be handled directly in the fetch below
|
// This will be handled directly in the fetch below
|
||||||
payload = null;
|
payload = null;
|
||||||
} else {
|
} else {
|
||||||
// Default payload for /crawl and /crawl/stream
|
// Default payload for /crawl (supports both streaming and batch modes)
|
||||||
payload = {
|
payload = {
|
||||||
urls,
|
urls,
|
||||||
...advConfig
|
...advConfig
|
||||||
@@ -659,6 +675,7 @@
|
|||||||
try {
|
try {
|
||||||
const startTime = performance.now();
|
const startTime = performance.now();
|
||||||
let response, responseData;
|
let response, responseData;
|
||||||
|
const useStreamOverride = (endpoint === 'crawl') && shouldUseStream(payload);
|
||||||
|
|
||||||
if (endpoint === 'llm') {
|
if (endpoint === 'llm') {
|
||||||
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
|
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
|
||||||
@@ -681,8 +698,8 @@
|
|||||||
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
|
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
|
||||||
document.querySelector('#response-content code').className = 'json hljs';
|
document.querySelector('#response-content code').className = 'json hljs';
|
||||||
forceHighlightElement(document.querySelector('#response-content code'));
|
forceHighlightElement(document.querySelector('#response-content code'));
|
||||||
} else if (endpoint === 'crawl_stream') {
|
} else if (endpoint === 'crawl_stream' || useStreamOverride) {
|
||||||
// Stream processing
|
// Stream processing - now handled directly by /crawl endpoint
|
||||||
response = await fetch(api, {
|
response = await fetch(api, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: { 'Content-Type': 'application/json' },
|
headers: { 'Content-Type': 'application/json' },
|
||||||
@@ -757,6 +774,7 @@
|
|||||||
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
|
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
|
||||||
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
|
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
|
||||||
} else {
|
} else {
|
||||||
|
// Use the same API endpoint for both streaming and non-streaming
|
||||||
generateSnippets(api, payload);
|
generateSnippets(api, payload);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -786,7 +804,7 @@
|
|||||||
document.getElementById('stress-avg-time').textContent = '0';
|
document.getElementById('stress-avg-time').textContent = '0';
|
||||||
document.getElementById('stress-peak-mem').textContent = '0';
|
document.getElementById('stress-peak-mem').textContent = '0';
|
||||||
|
|
||||||
const api = useStream ? '/crawl/stream' : '/crawl';
|
const api = '/crawl'; // Always use /crawl - backend handles streaming internally
|
||||||
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
|
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
|
||||||
const chunks = [];
|
const chunks = [];
|
||||||
|
|
||||||
|
|||||||
@@ -143,7 +143,40 @@ class TestCrawlEndpoints:
|
|||||||
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
||||||
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
|
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
|
||||||
# It might be null, missing, or populated depending on the server's default behavior
|
# It might be null, missing, or populated depending on the server's default behavior
|
||||||
|
async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient):
    """Check that POST /crawl answers with an NDJSON stream when stream=True.

    The request targets /crawl (not /crawl/stream) with ``stream`` enabled in
    the crawler config, then asserts on the streaming response headers and on
    the single result parsed from the stream.
    """
    browser_cfg = {
        "type": "BrowserConfig",
        "params": {
            "headless": True,
        },
    }
    run_cfg = {
        "type": "CrawlerRunConfig",
        "params": {
            "stream": True,  # request streaming straight from /crawl
            "screenshot": False,
            "cache_mode": CacheMode.BYPASS.value,
        },
    }
    request_body = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": browser_cfg,
        "crawler_config": run_cfg,
    }

    # The body has to be consumed while the streaming context is still open.
    async with async_client.stream("POST", "/crawl", json=request_body) as response:
        assert response.status_code == 200
        assert response.headers["content-type"] == "application/x-ndjson"
        assert response.headers.get("x-stream-status") == "active"

        results = await process_streaming_response(response)

        assert len(results) == 1
        crawled = results[0]
        await assert_crawl_result_structure(crawled)
        assert crawled["success"] is True
        assert crawled["url"] == SIMPLE_HTML_URL
        assert "<h1>Herman Melville - Moby-Dick</h1>" in crawled["html"]
|
||||||
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
|
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
|
||||||
"""Test /crawl/stream with a single URL and simple config values."""
|
"""Test /crawl/stream with a single URL and simple config values."""
|
||||||
payload = {
|
payload = {
|
||||||
|
|||||||
Reference in New Issue
Block a user