Compare commits
2 Commits
fix/relati
...
fix/reques
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1874a7b8d2 | ||
|
|
6a3b3e9d38 |
@@ -1037,7 +1037,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
downloaded_files=(
|
||||
self._downloaded_files if self._downloaded_files else None
|
||||
),
|
||||
redirected_url=page.url, # Update to current URL in case of JavaScript navigation
|
||||
redirected_url=redirected_url,
|
||||
# Include captured data if enabled
|
||||
network_requests=captured_requests if config.capture_network_requests else None,
|
||||
console_messages=captured_console if config.capture_console_messages else None,
|
||||
|
||||
@@ -480,7 +480,7 @@ class AsyncWebCrawler:
|
||||
# Scraping Strategy Execution #
|
||||
################################
|
||||
result: ScrapingResult = scraping_strategy.scrap(
|
||||
kwargs.get("redirected_url", url), html, **params)
|
||||
url, html, **params)
|
||||
|
||||
if result is None:
|
||||
raise ValueError(
|
||||
|
||||
@@ -2149,10 +2149,8 @@ def normalize_url(
|
||||
*,
|
||||
drop_query_tracking=True,
|
||||
sort_query=True,
|
||||
keep_fragment=True,
|
||||
remove_fragments=None, # alias for keep_fragment=False
|
||||
keep_fragment=False,
|
||||
extra_drop_params=None,
|
||||
params_to_remove=None, # alias for extra_drop_params
|
||||
preserve_https=False,
|
||||
original_scheme=None
|
||||
):
|
||||
@@ -2177,20 +2175,10 @@ def normalize_url(
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
A clean, canonical URL or the base URL if href is empty/None.
|
||||
A clean, canonical URL or None if href is empty/None.
|
||||
"""
|
||||
if not href:
|
||||
# For empty href, return the base URL (matching urljoin behavior)
|
||||
return base_url
|
||||
|
||||
# Validate base URL format
|
||||
parsed_base = urlparse(base_url)
|
||||
if not parsed_base.scheme or not parsed_base.netloc:
|
||||
raise ValueError(f"Invalid base URL format: {base_url}")
|
||||
|
||||
if parsed_base.scheme.lower() not in ["http", "https"]:
|
||||
# Handle special protocols
|
||||
raise ValueError(f"Invalid base URL format: {base_url}")
|
||||
return None
|
||||
|
||||
# Resolve relative paths first
|
||||
full_url = urljoin(base_url, href.strip())
|
||||
@@ -2211,12 +2199,6 @@ def normalize_url(
|
||||
|
||||
# ── netloc ──
|
||||
netloc = parsed.netloc.lower()
|
||||
|
||||
# Remove default ports (80 for http, 443 for https)
|
||||
if ':' in netloc:
|
||||
host, port = netloc.rsplit(':', 1)
|
||||
if (parsed.scheme == 'http' and port == '80') or (parsed.scheme == 'https' and port == '443'):
|
||||
netloc = host
|
||||
|
||||
# ── path ──
|
||||
# Strip duplicate slashes and trailing "/" (except root)
|
||||
@@ -2224,17 +2206,7 @@ def normalize_url(
|
||||
# The path from urlparse is already properly encoded
|
||||
path = parsed.path
|
||||
if path.endswith('/') and path != '/':
|
||||
# Only strip trailing slash if the original href didn't have a trailing slash
|
||||
# and the base_url didn't end with a slash
|
||||
base_parsed = urlparse(base_url)
|
||||
if not href.strip().endswith('/') and not base_parsed.path.endswith('/'):
|
||||
path = path.rstrip('/')
|
||||
# Add trailing slash for URLs without explicit paths (indicates directory)
|
||||
# But skip this for special protocols that don't use standard URL structure
|
||||
elif not path:
|
||||
special_protocols = {"javascript:", "mailto:", "tel:", "file:", "data:"}
|
||||
if not any(href.strip().lower().startswith(p) for p in special_protocols):
|
||||
path = '/'
|
||||
path = path.rstrip('/')
|
||||
|
||||
# ── query ──
|
||||
query = parsed.query
|
||||
@@ -2249,8 +2221,6 @@ def normalize_url(
|
||||
}
|
||||
if extra_drop_params:
|
||||
default_tracking |= {p.lower() for p in extra_drop_params}
|
||||
if params_to_remove:
|
||||
default_tracking |= {p.lower() for p in params_to_remove}
|
||||
params = [(k, v) for k, v in params if k not in default_tracking]
|
||||
|
||||
if sort_query:
|
||||
@@ -2259,10 +2229,7 @@ def normalize_url(
|
||||
query = urlencode(params, doseq=True) if params else ''
|
||||
|
||||
# ── fragment ──
|
||||
if remove_fragments is True:
|
||||
fragment = ''
|
||||
else:
|
||||
fragment = parsed.fragment if keep_fragment else ''
|
||||
fragment = parsed.fragment if keep_fragment else ''
|
||||
|
||||
# Re-assemble
|
||||
normalized = urlunparse((
|
||||
@@ -2486,19 +2453,9 @@ def is_external_url(url: str, base_domain: str) -> bool:
|
||||
if not parsed.netloc: # Relative URL
|
||||
return False
|
||||
|
||||
# Don't strip 'www.' from domains for comparison - treat www.example.com and example.com as different
|
||||
url_domain = parsed.netloc.lower()
|
||||
base = base_domain.lower()
|
||||
|
||||
# Strip user credentials from URL domain
|
||||
if '@' in url_domain:
|
||||
url_domain = url_domain.split('@', 1)[1]
|
||||
|
||||
# Strip ports from both for comparison (any port should be considered same domain)
|
||||
if ':' in url_domain:
|
||||
url_domain = url_domain.rsplit(':', 1)[0]
|
||||
if ':' in base:
|
||||
base = base.rsplit(':', 1)[0]
|
||||
# Strip 'www.' from both domains for comparison
|
||||
url_domain = parsed.netloc.lower().replace("www.", "")
|
||||
base = base_domain.lower().replace("www.", "")
|
||||
|
||||
# Check if URL domain ends with base domain
|
||||
return not url_domain.endswith(base)
|
||||
|
||||
@@ -28,43 +28,25 @@ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -
|
||||
signing_key = get_jwk_from_secret(SECRET_KEY)
|
||||
return instance.encode(to_encode, signing_key, alg='HS256')
|
||||
|
||||
def verify_token(credentials: HTTPAuthorizationCredentials) -> Dict:
|
||||
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
|
||||
"""Verify the JWT token from the Authorization header."""
|
||||
|
||||
if not credentials or not credentials.credentials:
|
||||
raise HTTPException(
|
||||
status_code=401,
|
||||
detail="No token provided",
|
||||
headers={"WWW-Authenticate": "Bearer"}
|
||||
)
|
||||
|
||||
|
||||
if credentials is None:
|
||||
return None
|
||||
token = credentials.credentials
|
||||
verifying_key = get_jwk_from_secret(SECRET_KEY)
|
||||
try:
|
||||
payload = instance.decode(token, verifying_key, do_time_check=True, algorithms='HS256')
|
||||
return payload
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=401,
|
||||
detail=f"Invalid or expired token: {str(e)}",
|
||||
headers={"WWW-Authenticate": "Bearer"}
|
||||
)
|
||||
except Exception:
|
||||
raise HTTPException(status_code=401, detail="Invalid or expired token")
|
||||
|
||||
|
||||
def get_token_dependency(config: Dict):
|
||||
"""Return the token dependency if JWT is enabled, else a function that returns None."""
|
||||
|
||||
|
||||
if config.get("security", {}).get("jwt_enabled", False):
|
||||
def jwt_required(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
|
||||
"""Enforce JWT authentication when enabled."""
|
||||
if credentials is None:
|
||||
raise HTTPException(
|
||||
status_code=401,
|
||||
detail="Authentication required. Please provide a valid Bearer token.",
|
||||
headers={"WWW-Authenticate": "Bearer"}
|
||||
)
|
||||
return verify_token(credentials)
|
||||
return jwt_required
|
||||
return verify_token
|
||||
else:
|
||||
return lambda: None
|
||||
|
||||
|
||||
@@ -38,8 +38,8 @@ rate_limiting:
|
||||
|
||||
# Security Configuration
|
||||
security:
|
||||
enabled: false
|
||||
jwt_enabled: false
|
||||
enabled: false
|
||||
jwt_enabled: false
|
||||
https_redirect: false
|
||||
trusted_hosts: ["*"]
|
||||
headers:
|
||||
|
||||
@@ -482,9 +482,14 @@ async def crawl(
|
||||
):
|
||||
"""
|
||||
Crawl a list of URLs and return the results as JSON.
|
||||
For streaming responses, use /crawl/stream endpoint.
|
||||
"""
|
||||
if not crawl_request.urls:
|
||||
raise HTTPException(400, "At least one URL required")
|
||||
# Check whether it is a redirection for a streaming request
|
||||
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
|
||||
if crawler_config.stream:
|
||||
return await stream_process(crawl_request=crawl_request)
|
||||
results = await handle_crawl_request(
|
||||
urls=crawl_request.urls,
|
||||
browser_config=crawl_request.browser_config,
|
||||
@@ -506,12 +511,16 @@ async def crawl_stream(
|
||||
):
|
||||
if not crawl_request.urls:
|
||||
raise HTTPException(400, "At least one URL required")
|
||||
|
||||
return await stream_process(crawl_request=crawl_request)
|
||||
|
||||
async def stream_process(crawl_request: CrawlRequest):
|
||||
crawler, gen = await handle_stream_crawl_request(
|
||||
urls=crawl_request.urls,
|
||||
browser_config=crawl_request.browser_config,
|
||||
crawler_config=crawl_request.crawler_config,
|
||||
config=config,
|
||||
)
|
||||
)
|
||||
return StreamingResponse(
|
||||
stream_results(crawler, gen),
|
||||
media_type="application/x-ndjson",
|
||||
|
||||
@@ -371,7 +371,7 @@
|
||||
|
||||
<div class="flex items-center">
|
||||
<input id="st-stream" type="checkbox" class="mr-2">
|
||||
<label for="st-stream" class="text-sm">Use /crawl/stream</label>
|
||||
<label for="st-stream" class="text-sm">Enable streaming mode</label>
|
||||
<button id="st-run"
|
||||
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
|
||||
Run Stress Test
|
||||
@@ -596,6 +596,14 @@
|
||||
forceHighlightElement(curlCodeEl);
|
||||
}
|
||||
|
||||
// Detect if stream is requested inside payload
|
||||
function shouldUseStream(payload) {
|
||||
const toBool = (v) => v === true || (typeof v === 'string' && v.toLowerCase() === 'true');
|
||||
const fromCrawler = payload && payload.crawler_config && payload.crawler_config.params && payload.crawler_config.params.stream;
|
||||
const direct = payload && payload.stream;
|
||||
return toBool(fromCrawler) || toBool(direct);
|
||||
}
|
||||
|
||||
// Main run function
|
||||
async function runCrawl() {
|
||||
const endpoint = document.getElementById('endpoint').value;
|
||||
@@ -611,16 +619,24 @@
|
||||
: { browser_config: cfgJson };
|
||||
}
|
||||
} catch (err) {
|
||||
updateStatus('error');
|
||||
document.querySelector('#response-content code').textContent =
|
||||
JSON.stringify({ error: err.message }, null, 2);
|
||||
forceHighlightElement(document.querySelector('#response-content code'));
|
||||
return; // stop run
|
||||
const codeText = cm.getValue();
|
||||
const streamFlag = /stream\s*=\s*True/i.test(codeText);
|
||||
const isCrawlEndpoint = document.getElementById('endpoint').value === 'crawl';
|
||||
if (isCrawlEndpoint && streamFlag) {
|
||||
// Fallback: proceed with minimal config only for stream
|
||||
advConfig = { crawler_config: { stream: true } };
|
||||
} else {
|
||||
updateStatus('error');
|
||||
document.querySelector('#response-content code').textContent =
|
||||
JSON.stringify({ error: err.message }, null, 2);
|
||||
forceHighlightElement(document.querySelector('#response-content code'));
|
||||
return; // stop run
|
||||
}
|
||||
}
|
||||
|
||||
const endpointMap = {
|
||||
crawl: '/crawl',
|
||||
// crawl_stream: '/crawl/stream',
|
||||
crawl_stream: '/crawl/stream', // Keep for backward compatibility
|
||||
md: '/md',
|
||||
llm: '/llm'
|
||||
};
|
||||
@@ -647,7 +663,7 @@
|
||||
// This will be handled directly in the fetch below
|
||||
payload = null;
|
||||
} else {
|
||||
// Default payload for /crawl and /crawl/stream
|
||||
// Default payload for /crawl (supports both streaming and batch modes)
|
||||
payload = {
|
||||
urls,
|
||||
...advConfig
|
||||
@@ -659,6 +675,7 @@
|
||||
try {
|
||||
const startTime = performance.now();
|
||||
let response, responseData;
|
||||
const useStreamOverride = (endpoint === 'crawl') && shouldUseStream(payload);
|
||||
|
||||
if (endpoint === 'llm') {
|
||||
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
|
||||
@@ -681,8 +698,8 @@
|
||||
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
|
||||
document.querySelector('#response-content code').className = 'json hljs';
|
||||
forceHighlightElement(document.querySelector('#response-content code'));
|
||||
} else if (endpoint === 'crawl_stream') {
|
||||
// Stream processing
|
||||
} else if (endpoint === 'crawl_stream' || useStreamOverride) {
|
||||
// Stream processing - now handled directly by /crawl endpoint
|
||||
response = await fetch(api, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
@@ -757,6 +774,7 @@
|
||||
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
|
||||
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
|
||||
} else {
|
||||
// Use the same API endpoint for both streaming and non-streaming
|
||||
generateSnippets(api, payload);
|
||||
}
|
||||
} catch (error) {
|
||||
@@ -786,7 +804,7 @@
|
||||
document.getElementById('stress-avg-time').textContent = '0';
|
||||
document.getElementById('stress-peak-mem').textContent = '0';
|
||||
|
||||
const api = useStream ? '/crawl/stream' : '/crawl';
|
||||
const api = '/crawl'; // Always use /crawl - backend handles streaming internally
|
||||
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
|
||||
const chunks = [];
|
||||
|
||||
|
||||
@@ -143,7 +143,40 @@ class TestCrawlEndpoints:
|
||||
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
||||
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
|
||||
# It might be null, missing, or populated depending on the server's default behavior
|
||||
async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient):
|
||||
"""Test that /crawl endpoint handles stream=True directly without redirect."""
|
||||
payload = {
|
||||
"urls": [SIMPLE_HTML_URL],
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {
|
||||
"headless": True,
|
||||
}
|
||||
},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": True, # Set stream to True for direct streaming
|
||||
"screenshot": False,
|
||||
"cache_mode": CacheMode.BYPASS.value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Send a request to the /crawl endpoint - should handle streaming directly
|
||||
async with async_client.stream("POST", "/crawl", json=payload) as response:
|
||||
assert response.status_code == 200
|
||||
assert response.headers["content-type"] == "application/x-ndjson"
|
||||
assert response.headers.get("x-stream-status") == "active"
|
||||
|
||||
results = await process_streaming_response(response)
|
||||
|
||||
assert len(results) == 1
|
||||
result = results[0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert result["url"] == SIMPLE_HTML_URL
|
||||
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
||||
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl/stream with a single URL and simple config values."""
|
||||
payload = {
|
||||
|
||||
Reference in New Issue
Block a user