fix: update option labels in request builder for clarity

Commit without API
2025-09-05 17:06:25 +08:00 · 2025-09-03 17:02:40 +08:00
8 changed files with 92 additions and 93 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1037,7 +1037,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                downloaded_files=(
                    self._downloaded_files if self._downloaded_files else None
                ),
-                redirected_url=page.url,  # Update to current URL in case of JavaScript navigation
+                redirected_url=redirected_url,
                # Include captured data if enabled
                network_requests=captured_requests if config.capture_network_requests else None,
                console_messages=captured_console if config.capture_console_messages else None,
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -480,7 +480,7 @@ class AsyncWebCrawler:
            # Scraping Strategy Execution  #
            ################################
            result: ScrapingResult = scraping_strategy.scrap(
-                kwargs.get("redirected_url", url), html, **params)
+                url, html, **params)

            if result is None:
                raise ValueError(
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2149,10 +2149,8 @@ def normalize_url(
    *,
    drop_query_tracking=True,
    sort_query=True,
-    keep_fragment=True,
-    remove_fragments=None,  # alias for keep_fragment=False
+    keep_fragment=False,
    extra_drop_params=None,
-    params_to_remove=None,  # alias for extra_drop_params
    preserve_https=False,
    original_scheme=None
 ):
@@ -2177,20 +2175,10 @@ def normalize_url(
    Returns
    -------
    str | None
-        A clean, canonical URL or the base URL if href is empty/None.
+        A clean, canonical URL or None if href is empty/None.
    """
    if not href:
-        # For empty href, return the base URL (matching urljoin behavior)
-        return base_url
-
-    # Validate base URL format
-    parsed_base = urlparse(base_url)
-    if not parsed_base.scheme or not parsed_base.netloc:
-        raise ValueError(f"Invalid base URL format: {base_url}")
-    
-    if parsed_base.scheme.lower() not in ["http", "https"]:
-        # Handle special protocols
-        raise ValueError(f"Invalid base URL format: {base_url}")
+        return None

    # Resolve relative paths first
    full_url = urljoin(base_url, href.strip())
@@ -2211,12 +2199,6 @@ def normalize_url(

    # ── netloc ──
    netloc = parsed.netloc.lower()
-    
-    # Remove default ports (80 for http, 443 for https)
-    if ':' in netloc:
-        host, port = netloc.rsplit(':', 1)
-        if (parsed.scheme == 'http' and port == '80') or (parsed.scheme == 'https' and port == '443'):
-            netloc = host

    # ── path ──
    # Strip duplicate slashes and trailing "/" (except root)
@@ -2224,17 +2206,7 @@ def normalize_url(
    # The path from urlparse is already properly encoded
    path = parsed.path
    if path.endswith('/') and path != '/':
-        # Only strip trailing slash if the original href didn't have a trailing slash
-        # and the base_url didn't end with a slash
-        base_parsed = urlparse(base_url)
-        if not href.strip().endswith('/') and not base_parsed.path.endswith('/'):
-            path = path.rstrip('/')
-    # Add trailing slash for URLs without explicit paths (indicates directory)
-    # But skip this for special protocols that don't use standard URL structure
-    elif not path:
-        special_protocols = {"javascript:", "mailto:", "tel:", "file:", "data:"}
-        if not any(href.strip().lower().startswith(p) for p in special_protocols):
-            path = '/'
+        path = path.rstrip('/')

    # ── query ──
    query = parsed.query
@@ -2249,8 +2221,6 @@ def normalize_url(
            }
            if extra_drop_params:
                default_tracking |= {p.lower() for p in extra_drop_params}
-            if params_to_remove:
-                default_tracking |= {p.lower() for p in params_to_remove}
            params = [(k, v) for k, v in params if k not in default_tracking]

        if sort_query:
@@ -2259,10 +2229,7 @@ def normalize_url(
        query = urlencode(params, doseq=True) if params else ''

    # ── fragment ──
-    if remove_fragments is True:
-        fragment = ''
-    else:
-        fragment = parsed.fragment if keep_fragment else ''
+    fragment = parsed.fragment if keep_fragment else ''

    # Re-assemble
    normalized = urlunparse((
@@ -2486,19 +2453,9 @@ def is_external_url(url: str, base_domain: str) -> bool:
        if not parsed.netloc:  # Relative URL
            return False

-        # Don't strip 'www.' from domains for comparison - treat www.example.com and example.com as different
-        url_domain = parsed.netloc.lower()
-        base = base_domain.lower()
-        
-        # Strip user credentials from URL domain
-        if '@' in url_domain:
-            url_domain = url_domain.split('@', 1)[1]
-        
-        # Strip ports from both for comparison (any port should be considered same domain)
-        if ':' in url_domain:
-            url_domain = url_domain.rsplit(':', 1)[0]
-        if ':' in base:
-            base = base.rsplit(':', 1)[0]
+        # Remove every "www." occurrence (not only a leading prefix) from both domains before comparing
+        url_domain = parsed.netloc.lower().replace("www.", "")
+        base = base_domain.lower().replace("www.", "")

        # Check if URL domain ends with base domain
        return not url_domain.endswith(base)
--- a/deploy/docker/auth.py
+++ b/deploy/docker/auth.py
@@ -28,43 +28,25 @@ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -
    signing_key = get_jwk_from_secret(SECRET_KEY)
    return instance.encode(to_encode, signing_key, alg='HS256')

-def verify_token(credentials: HTTPAuthorizationCredentials) -> Dict:
+def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
    """Verify the JWT token from the Authorization header."""
-    
-    if not credentials or not credentials.credentials:
-        raise HTTPException(
-            status_code=401, 
-            detail="No token provided",
-            headers={"WWW-Authenticate": "Bearer"}
-        )
-    
+
+    if credentials is None:
+        return None
    token = credentials.credentials
    verifying_key = get_jwk_from_secret(SECRET_KEY)
    try:
        payload = instance.decode(token, verifying_key, do_time_check=True, algorithms='HS256')
        return payload
-    except Exception as e:
-        raise HTTPException(
-            status_code=401, 
-            detail=f"Invalid or expired token: {str(e)}",
-            headers={"WWW-Authenticate": "Bearer"}
-        )
+    except Exception:
+        raise HTTPException(status_code=401, detail="Invalid or expired token")


 def get_token_dependency(config: Dict):
    """Return the token dependency if JWT is enabled, else a function that returns None."""
-    
+
    if config.get("security", {}).get("jwt_enabled", False):
-        def jwt_required(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
-            """Enforce JWT authentication when enabled."""
-            if credentials is None:
-                raise HTTPException(
-                    status_code=401, 
-                    detail="Authentication required. Please provide a valid Bearer token.",
-                    headers={"WWW-Authenticate": "Bearer"}
-                )
-            return verify_token(credentials)
-        return jwt_required
+        return verify_token
    else:
        return lambda: None

--- a/deploy/docker/config.yml
+++ b/deploy/docker/config.yml
@@ -38,8 +38,8 @@ rate_limiting:

 # Security Configuration
 security:
-  enabled: false
-  jwt_enabled: false
+  enabled: false 
+  jwt_enabled: false 
  https_redirect: false
  trusted_hosts: ["*"]
  headers:
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -482,9 +482,14 @@ async def crawl(
 ):
    """
    Crawl a list of URLs and return the results as JSON.
+    When crawler_config enables stream, results are streamed exactly as by the /crawl/stream endpoint.
    """
    if not crawl_request.urls:
        raise HTTPException(400, "At least one URL required")
+    # If the crawler config requests streaming, delegate to the shared streaming handler
+    crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
+    if crawler_config.stream:
+        return await stream_process(crawl_request=crawl_request)
    results = await handle_crawl_request(
        urls=crawl_request.urls,
        browser_config=crawl_request.browser_config,
@@ -506,12 +511,16 @@ async def crawl_stream(
 ):
    if not crawl_request.urls:
        raise HTTPException(400, "At least one URL required")
+
+    return await stream_process(crawl_request=crawl_request)
+
+async def stream_process(crawl_request: CrawlRequest):
    crawler, gen = await handle_stream_crawl_request(
        urls=crawl_request.urls,
        browser_config=crawl_request.browser_config,
        crawler_config=crawl_request.crawler_config,
        config=config,
-    )
+)
    return StreamingResponse(
        stream_results(crawler, gen),
        media_type="application/x-ndjson",
--- a/deploy/docker/static/playground/index.html
+++ b/deploy/docker/static/playground/index.html
@@ -371,7 +371,7 @@

                <div class="flex items-center">
                    <input id="st-stream" type="checkbox" class="mr-2">
-                    <label for="st-stream" class="text-sm">Use /crawl/stream</label>
+                    <label for="st-stream" class="text-sm">Enable streaming mode</label>
                    <button id="st-run"
                        class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
                        Run Stress Test
@@ -596,6 +596,14 @@
            forceHighlightElement(curlCodeEl);
        }

+        // Detect if stream is requested inside payload
+        function shouldUseStream(payload) {
+            const toBool = (v) => v === true || (typeof v === 'string' && v.toLowerCase() === 'true');
+            const fromCrawler = payload && payload.crawler_config && payload.crawler_config.params && payload.crawler_config.params.stream;
+            const direct = payload && payload.stream;
+            return toBool(fromCrawler) || toBool(direct);
+        }
+
        // Main run function
        async function runCrawl() {
            const endpoint = document.getElementById('endpoint').value;
@@ -611,16 +619,24 @@
                        : { browser_config: cfgJson };
                }
            } catch (err) {
-                updateStatus('error');
-                document.querySelector('#response-content code').textContent =
-                    JSON.stringify({ error: err.message }, null, 2);
-                forceHighlightElement(document.querySelector('#response-content code'));
-                return; // stop run
+                const codeText = cm.getValue();
+                const streamFlag = /stream\s*=\s*True/i.test(codeText);
+                const isCrawlEndpoint = document.getElementById('endpoint').value === 'crawl';
+                if (isCrawlEndpoint && streamFlag) {
+                    // Fallback: proceed with minimal config only for stream
+                    advConfig = { crawler_config: { stream: true } };
+                } else {
+                    updateStatus('error');
+                    document.querySelector('#response-content code').textContent =
+                        JSON.stringify({ error: err.message }, null, 2);
+                    forceHighlightElement(document.querySelector('#response-content code'));
+                    return; // stop run
+                }
            }

            const endpointMap = {
                crawl: '/crawl',
-                // crawl_stream: '/crawl/stream',
+                crawl_stream: '/crawl/stream', // Keep for backward compatibility
                md: '/md',
                llm: '/llm'
            };
@@ -647,7 +663,7 @@
                // This will be handled directly in the fetch below
                payload = null;
            } else {
-                // Default payload for /crawl and /crawl/stream
+                // Default payload for /crawl (supports both streaming and batch modes)
                payload = {
                    urls,
                    ...advConfig
@@ -659,6 +675,7 @@
            try {
                const startTime = performance.now();
                let response, responseData;
+                const useStreamOverride = (endpoint === 'crawl') && shouldUseStream(payload);

                if (endpoint === 'llm') {
                    // Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
@@ -681,8 +698,8 @@
                    document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
                    document.querySelector('#response-content code').className = 'json hljs';
                    forceHighlightElement(document.querySelector('#response-content code'));
-                } else if (endpoint === 'crawl_stream') {
-                    // Stream processing
+                } else if (endpoint === 'crawl_stream' || useStreamOverride) {
+                    // Stream processing - now handled directly by /crawl endpoint
                    response = await fetch(api, {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
@@ -757,6 +774,7 @@
                    const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
                    generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
                } else {
+                    // Use the same API endpoint for both streaming and non-streaming
                    generateSnippets(api, payload);
                }
            } catch (error) {
@@ -786,7 +804,7 @@
            document.getElementById('stress-avg-time').textContent = '0';
            document.getElementById('stress-peak-mem').textContent = '0';

-            const api = useStream ? '/crawl/stream' : '/crawl';
+            const api = '/crawl'; // Always use /crawl - backend handles streaming internally
            const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
            const chunks = [];

--- a/tests/docker/test_server_requests.py
+++ b/tests/docker/test_server_requests.py
@@ -143,7 +143,40 @@ class TestCrawlEndpoints:
        assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
        # We don't specify a markdown generator in this test, so don't make assumptions about markdown field
        # It might be null, missing, or populated depending on the server's default behavior
+    async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient):
+        """Test that /crawl endpoint handles stream=True directly without redirect."""
+        payload = {
+            "urls": [SIMPLE_HTML_URL],
+            "browser_config": {
+                "type": "BrowserConfig",
+                "params": {
+                    "headless": True,
+                }
+            },
+            "crawler_config": {
+                "type": "CrawlerRunConfig", 
+                "params": {
+                    "stream": True,  # Set stream to True for direct streaming
+                    "screenshot": False,
+                    "cache_mode": CacheMode.BYPASS.value
+                }
+            }
+        }

+        # Send a request to the /crawl endpoint - should handle streaming directly
+        async with async_client.stream("POST", "/crawl", json=payload) as response:
+            assert response.status_code == 200
+            assert response.headers["content-type"] == "application/x-ndjson"
+            assert response.headers.get("x-stream-status") == "active"
+
+            results = await process_streaming_response(response)
+
+            assert len(results) == 1
+            result = results[0]
+            await assert_crawl_result_structure(result)
+            assert result["success"] is True
+            assert result["url"] == SIMPLE_HTML_URL
+            assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
    async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
        """Test /crawl/stream with a single URL and simple config values."""
        payload = {
Author	SHA1	Message	Date
AHMET YILMAZ	1874a7b8d2	fix: update option labels in request builder for clarity	2025-09-05 17:06:25 +08:00
AHMET YILMAZ	6a3b3e9d38	Commit without API	2025-09-03 17:02:40 +08:00