Compare commits


2 Commits

Author  SHA1  Message  Date
AHMET YILMAZ  1874a7b8d2  fix: update option labels in request builder for clarity  2025-09-05 17:06:25 +08:00
AHMET YILMAZ  6a3b3e9d38  Commit without API  2025-09-03 17:02:40 +08:00
9 changed files with 101 additions and 421 deletions

View File

@@ -19,7 +19,7 @@ import re
from pathlib import Path
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig, LLMConfig
from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig
from crawl4ai.models import Link, CrawlResult
import numpy as np
@@ -178,7 +178,7 @@ class AdaptiveConfig:
# Embedding strategy parameters
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
embedding_llm_config: Optional[Union[LLMConfig, Dict]] = None # Separate config for embeddings
embedding_llm_config: Optional[Dict] = None # Separate config for embeddings
n_query_variations: int = 10
coverage_threshold: float = 0.85
alpha_shape_alpha: float = 0.5
@@ -250,30 +250,6 @@ class AdaptiveConfig:
assert 0 <= self.embedding_quality_max_confidence <= 1, "embedding_quality_max_confidence must be between 0 and 1"
assert self.embedding_quality_scale_factor > 0, "embedding_quality_scale_factor must be positive"
assert 0 <= self.embedding_min_confidence_threshold <= 1, "embedding_min_confidence_threshold must be between 0 and 1"
@property
def _embedding_llm_config_dict(self) -> Optional[Dict]:
"""Convert LLMConfig to dict format for backward compatibility."""
if self.embedding_llm_config is None:
return None
if isinstance(self.embedding_llm_config, dict):
# Already a dict - return as-is for backward compatibility
return self.embedding_llm_config
# Convert LLMConfig object to dict format
return {
'provider': self.embedding_llm_config.provider,
'api_token': self.embedding_llm_config.api_token,
'base_url': getattr(self.embedding_llm_config, 'base_url', None),
'temperature': getattr(self.embedding_llm_config, 'temperature', None),
'max_tokens': getattr(self.embedding_llm_config, 'max_tokens', None),
'top_p': getattr(self.embedding_llm_config, 'top_p', None),
'frequency_penalty': getattr(self.embedding_llm_config, 'frequency_penalty', None),
'presence_penalty': getattr(self.embedding_llm_config, 'presence_penalty', None),
'stop': getattr(self.embedding_llm_config, 'stop', None),
'n': getattr(self.embedding_llm_config, 'n', None),
}
class CrawlStrategy(ABC):
@@ -617,7 +593,7 @@ class StatisticalStrategy(CrawlStrategy):
class EmbeddingStrategy(CrawlStrategy):
"""Embedding-based adaptive crawling using semantic space coverage"""
def __init__(self, embedding_model: str = None, llm_config: Union[LLMConfig, Dict] = None):
def __init__(self, embedding_model: str = None, llm_config: Dict = None):
self.embedding_model = embedding_model or "sentence-transformers/all-MiniLM-L6-v2"
self.llm_config = llm_config
self._embedding_cache = {}
@@ -629,24 +605,14 @@ class EmbeddingStrategy(CrawlStrategy):
self._kb_embeddings_hash = None # Track KB changes
self._validation_embeddings_cache = None # Cache validation query embeddings
self._kb_similarity_threshold = 0.95 # Threshold for deduplication
def _get_embedding_llm_config_dict(self) -> Dict:
"""Get embedding LLM config as dict with fallback to default."""
if hasattr(self, 'config') and self.config:
config_dict = self.config._embedding_llm_config_dict
if config_dict:
return config_dict
# Fallback to default if no config provided
return {
'provider': 'openai/text-embedding-3-small',
'api_token': os.getenv('OPENAI_API_KEY')
}
async def _get_embeddings(self, texts: List[str]) -> Any:
"""Get embeddings using configured method"""
from .utils import get_text_embeddings
embedding_llm_config = self._get_embedding_llm_config_dict()
embedding_llm_config = {
'provider': 'openai/text-embedding-3-small',
'api_token': os.getenv('OPENAI_API_KEY')
}
return await get_text_embeddings(
texts,
embedding_llm_config,
@@ -713,20 +679,8 @@ class EmbeddingStrategy(CrawlStrategy):
Return as a JSON array of strings."""
# Use the LLM for query generation
# Convert LLMConfig to dict if needed
llm_config_dict = None
if self.llm_config:
if isinstance(self.llm_config, dict):
llm_config_dict = self.llm_config
else:
# Convert LLMConfig object to dict
llm_config_dict = {
'provider': self.llm_config.provider,
'api_token': self.llm_config.api_token
}
provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
api_token = llm_config_dict.get('api_token') if llm_config_dict else None
provider = self.llm_config.get('provider', 'openai/gpt-4o-mini') if self.llm_config else 'openai/gpt-4o-mini'
api_token = self.llm_config.get('api_token') if self.llm_config else None
# response = perform_completion_with_backoff(
# provider=provider,
@@ -889,7 +843,10 @@ class EmbeddingStrategy(CrawlStrategy):
# Batch embed only uncached links
if texts_to_embed:
embedding_llm_config = self._get_embedding_llm_config_dict()
embedding_llm_config = {
'provider': 'openai/text-embedding-3-small',
'api_token': os.getenv('OPENAI_API_KEY')
}
new_embeddings = await get_text_embeddings(texts_to_embed, embedding_llm_config, self.embedding_model)
# Cache the new embeddings
@@ -1227,7 +1184,10 @@ class EmbeddingStrategy(CrawlStrategy):
return
# Get embeddings for new texts
embedding_llm_config = self._get_embedding_llm_config_dict()
embedding_llm_config = {
'provider': 'openai/text-embedding-3-small',
'api_token': os.getenv('OPENAI_API_KEY')
}
new_embeddings = await get_text_embeddings(new_texts, embedding_llm_config, self.embedding_model)
# Deduplicate embeddings before adding to KB
@@ -1296,12 +1256,10 @@ class AdaptiveCrawler:
if strategy_name == "statistical":
return StatisticalStrategy()
elif strategy_name == "embedding":
strategy = EmbeddingStrategy(
return EmbeddingStrategy(
embedding_model=self.config.embedding_model,
llm_config=self.config.embedding_llm_config
)
strategy.config = self.config # Pass config to strategy
return strategy
else:
raise ValueError(f"Unknown strategy: {strategy_name}")
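The net effect of this file's changes is that embedding configuration is accepted only as a plain dictionary again: the LLMConfig path and its conversion helpers are removed, and the strategy falls back to a hardcoded OpenAI embedding provider. A minimal usage sketch under that assumption — import names are taken from the example files elsewhere in this compare, and reading the API key from OPENAI_API_KEY is an assumption:

import asyncio
import os
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig

config = AdaptiveConfig(
    strategy="embedding",
    embedding_llm_config={                            # plain dict only after this change
        'provider': 'openai/text-embedding-3-small',
        'api_token': os.getenv('OPENAI_API_KEY'),     # assumed env var, as in the deleted example
    },
)

async def main():
    async with AsyncWebCrawler() as crawler:
        adaptive = AdaptiveCrawler(crawler, config)
        await adaptive.digest(
            start_url="https://docs.python.org/3/library/asyncio.html",
            query="async await context managers coroutines",
        )
        adaptive.print_stats(detailed=False)

if __name__ == "__main__":
    asyncio.run(main())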

View File

@@ -28,43 +28,25 @@ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -
signing_key = get_jwk_from_secret(SECRET_KEY)
return instance.encode(to_encode, signing_key, alg='HS256')
def verify_token(credentials: HTTPAuthorizationCredentials) -> Dict:
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
"""Verify the JWT token from the Authorization header."""
if not credentials or not credentials.credentials:
raise HTTPException(
status_code=401,
detail="No token provided",
headers={"WWW-Authenticate": "Bearer"}
)
if credentials is None:
return None
token = credentials.credentials
verifying_key = get_jwk_from_secret(SECRET_KEY)
try:
payload = instance.decode(token, verifying_key, do_time_check=True, algorithms='HS256')
return payload
except Exception as e:
raise HTTPException(
status_code=401,
detail=f"Invalid or expired token: {str(e)}",
headers={"WWW-Authenticate": "Bearer"}
)
except Exception:
raise HTTPException(status_code=401, detail="Invalid or expired token")
def get_token_dependency(config: Dict):
"""Return the token dependency if JWT is enabled, else a function that returns None."""
if config.get("security", {}).get("jwt_enabled", False):
def jwt_required(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
"""Enforce JWT authentication when enabled."""
if credentials is None:
raise HTTPException(
status_code=401,
detail="Authentication required. Please provide a valid Bearer token.",
headers={"WWW-Authenticate": "Bearer"}
)
return verify_token(credentials)
return jwt_required
return verify_token
else:
return lambda: None
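For context, a hedged sketch of how the dependency returned by get_token_dependency is typically attached to a FastAPI route. Only get_token_dependency and the "security"/"jwt_enabled" config keys come from this diff; the module path, app object, route, and config literal below are assumptions:

from fastapi import Depends, FastAPI
from auth import get_token_dependency          # module path is an assumption

app = FastAPI()
config = {"security": {"jwt_enabled": True}}   # assumed shape, mirroring the config.yml block below
token_dependency = get_token_dependency(config)

@app.get("/protected")
async def protected_route(token_payload=Depends(token_dependency)):
    # With jwt_enabled false, the dependency is a no-op returning None and the route stays open.
    return {"claims": token_payload}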

View File

@@ -38,8 +38,8 @@ rate_limiting:
# Security Configuration
security:
enabled: false
jwt_enabled: false
enabled: false
jwt_enabled: false
https_redirect: false
trusted_hosts: ["*"]
headers:

View File

@@ -482,9 +482,14 @@ async def crawl(
):
"""
Crawl a list of URLs and return the results as JSON.
For streaming responses, use /crawl/stream endpoint.
"""
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
# Delegate to the streaming handler when the crawler config requests streaming
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
if crawler_config.stream:
return await stream_process(crawl_request=crawl_request)
results = await handle_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
@@ -506,12 +511,16 @@ async def crawl_stream(
):
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
return await stream_process(crawl_request=crawl_request)
async def stream_process(crawl_request: CrawlRequest):
crawler, gen = await handle_stream_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config,
)
)
return StreamingResponse(
stream_results(crawler, gen),
media_type="application/x-ndjson",
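After this change the /crawl endpoint serves both modes itself, so a client only has to set stream in crawler_config. A rough client-side sketch — the host and port are assumptions; the payload shape and NDJSON media type follow the code and tests in this compare:

import asyncio
import json
import httpx

payload = {
    "urls": ["https://example.com"],
    "crawler_config": {"type": "CrawlerRunConfig", "params": {"stream": True}},
}

async def consume():
    async with httpx.AsyncClient(base_url="http://localhost:11235") as client:  # assumed host/port
        async with client.stream("POST", "/crawl", json=payload) as response:
            async for line in response.aiter_lines():
                if line:
                    print(json.loads(line))  # one crawl result per NDJSON line

asyncio.run(consume())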

View File

@@ -371,7 +371,7 @@
<div class="flex items-center">
<input id="st-stream" type="checkbox" class="mr-2">
<label for="st-stream" class="text-sm">Use /crawl/stream</label>
<label for="st-stream" class="text-sm">Enable streaming mode</label>
<button id="st-run"
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
Run Stress Test
@@ -596,6 +596,14 @@
forceHighlightElement(curlCodeEl);
}
// Detect if stream is requested inside payload
function shouldUseStream(payload) {
const toBool = (v) => v === true || (typeof v === 'string' && v.toLowerCase() === 'true');
const fromCrawler = payload && payload.crawler_config && payload.crawler_config.params && payload.crawler_config.params.stream;
const direct = payload && payload.stream;
return toBool(fromCrawler) || toBool(direct);
}
// Main run function
async function runCrawl() {
const endpoint = document.getElementById('endpoint').value;
@@ -611,16 +619,24 @@
: { browser_config: cfgJson };
}
} catch (err) {
updateStatus('error');
document.querySelector('#response-content code').textContent =
JSON.stringify({ error: err.message }, null, 2);
forceHighlightElement(document.querySelector('#response-content code'));
return; // stop run
const codeText = cm.getValue();
const streamFlag = /stream\s*=\s*True/i.test(codeText);
const isCrawlEndpoint = document.getElementById('endpoint').value === 'crawl';
if (isCrawlEndpoint && streamFlag) {
// Fallback: proceed with minimal config only for stream
advConfig = { crawler_config: { stream: true } };
} else {
updateStatus('error');
document.querySelector('#response-content code').textContent =
JSON.stringify({ error: err.message }, null, 2);
forceHighlightElement(document.querySelector('#response-content code'));
return; // stop run
}
}
const endpointMap = {
crawl: '/crawl',
// crawl_stream: '/crawl/stream',
crawl_stream: '/crawl/stream', // Keep for backward compatibility
md: '/md',
llm: '/llm'
};
@@ -647,7 +663,7 @@
// This will be handled directly in the fetch below
payload = null;
} else {
// Default payload for /crawl and /crawl/stream
// Default payload for /crawl (supports both streaming and batch modes)
payload = {
urls,
...advConfig
@@ -659,6 +675,7 @@
try {
const startTime = performance.now();
let response, responseData;
const useStreamOverride = (endpoint === 'crawl') && shouldUseStream(payload);
if (endpoint === 'llm') {
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
@@ -681,8 +698,8 @@
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
document.querySelector('#response-content code').className = 'json hljs';
forceHighlightElement(document.querySelector('#response-content code'));
} else if (endpoint === 'crawl_stream') {
// Stream processing
} else if (endpoint === 'crawl_stream' || useStreamOverride) {
// Stream processing - now handled directly by /crawl endpoint
response = await fetch(api, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
@@ -757,6 +774,7 @@
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
} else {
// Use the same API endpoint for both streaming and non-streaming
generateSnippets(api, payload);
}
} catch (error) {
@@ -786,7 +804,7 @@
document.getElementById('stress-avg-time').textContent = '0';
document.getElementById('stress-peak-mem').textContent = '0';
const api = useStream ? '/crawl/stream' : '/crawl';
const api = '/crawl'; // Always use /crawl - backend handles streaming internally
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
const chunks = [];

View File

@@ -1,154 +0,0 @@
import asyncio
import os
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, LLMConfig
async def test_configuration(name: str, config: AdaptiveConfig, url: str, query: str):
"""Test a specific configuration"""
print(f"\n{'='*60}")
print(f"Configuration: {name}")
print(f"{'='*60}")
async with AsyncWebCrawler(verbose=False) as crawler:
adaptive = AdaptiveCrawler(crawler, config)
result = await adaptive.digest(start_url=url, query=query)
print("\n" + "="*50)
print("CRAWL STATISTICS")
print("="*50)
adaptive.print_stats(detailed=False)
# Get the most relevant content found
print("\n" + "="*50)
print("MOST RELEVANT PAGES")
print("="*50)
relevant_pages = adaptive.get_relevant_content(top_k=5)
for i, page in enumerate(relevant_pages, 1):
print(f"\n{i}. {page['url']}")
print(f" Relevance Score: {page['score']:.2%}")
# Show a snippet of the content
content = page['content'] or ""
if content:
snippet = content[:200].replace('\n', ' ')
if len(content) > 200:
snippet += "..."
print(f" Preview: {snippet}")
print(f"\n{'='*50}")
print(f"Pages crawled: {len(result.crawled_urls)}")
print(f"Final confidence: {adaptive.confidence:.1%}")
print(f"Stopped reason: {result.metrics.get('stopped_reason', 'max_pages')}")
if result.metrics.get('is_irrelevant', False):
print("⚠️ Query detected as irrelevant!")
return result
async def llm_embedding():
"""Demonstrate various embedding configurations"""
print("EMBEDDING STRATEGY CONFIGURATION EXAMPLES")
print("=" * 60)
# Base URL and query for testing
test_url = "https://docs.python.org/3/library/asyncio.html"
openai_llm_config = LLMConfig(
provider='openai/text-embedding-3-small',
api_token=os.getenv('OPENAI_API_KEY'),
temperature=0.7,
max_tokens=2000
)
config_openai = AdaptiveConfig(
strategy="embedding",
max_pages=10,
# Use OpenAI embeddings
embedding_llm_config=openai_llm_config,
# embedding_llm_config={
# 'provider': 'openai/text-embedding-3-small',
# 'api_token': os.getenv('OPENAI_API_KEY')
# },
# OpenAI embeddings are high quality, can be stricter
embedding_k_exp=4.0,
n_query_variations=12
)
await test_configuration(
"OpenAI Embeddings",
config_openai,
test_url,
# "event-driven architecture patterns"
"async await context managers coroutines"
)
return
async def basic_adaptive_crawling():
"""Basic adaptive crawling example"""
# Initialize the crawler
async with AsyncWebCrawler(verbose=True) as crawler:
# Create an adaptive crawler with default settings (statistical strategy)
adaptive = AdaptiveCrawler(crawler)
# Note: You can also use embedding strategy for semantic understanding:
# from crawl4ai import AdaptiveConfig
# config = AdaptiveConfig(strategy="embedding")
# adaptive = AdaptiveCrawler(crawler, config)
# Start adaptive crawling
print("Starting adaptive crawl for Python async programming information...")
result = await adaptive.digest(
start_url="https://docs.python.org/3/library/asyncio.html",
query="async await context managers coroutines"
)
# Display crawl statistics
print("\n" + "="*50)
print("CRAWL STATISTICS")
print("="*50)
adaptive.print_stats(detailed=False)
# Get the most relevant content found
print("\n" + "="*50)
print("MOST RELEVANT PAGES")
print("="*50)
relevant_pages = adaptive.get_relevant_content(top_k=5)
for i, page in enumerate(relevant_pages, 1):
print(f"\n{i}. {page['url']}")
print(f" Relevance Score: {page['score']:.2%}")
# Show a snippet of the content
content = page['content'] or ""
if content:
snippet = content[:200].replace('\n', ' ')
if len(content) > 200:
snippet += "..."
print(f" Preview: {snippet}")
# Show final confidence
print(f"\n{'='*50}")
print(f"Final Confidence: {adaptive.confidence:.2%}")
print(f"Total Pages Crawled: {len(result.crawled_urls)}")
print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")
if adaptive.confidence >= 0.8:
print("✓ High confidence - can answer detailed questions about async Python")
elif adaptive.confidence >= 0.6:
print("~ Moderate confidence - can answer basic questions")
else:
print("✗ Low confidence - need more information")
if __name__ == "__main__":
asyncio.run(llm_embedding())
# asyncio.run(basic_adaptive_crawling())

View File

@@ -108,19 +108,7 @@ config = AdaptiveConfig(
embedding_min_confidence_threshold=0.1 # Stop if completely irrelevant
)
# With custom LLM provider for query expansion (recommended)
from crawl4ai import LLMConfig
config = AdaptiveConfig(
strategy="embedding",
embedding_llm_config=LLMConfig(
provider='openai/text-embedding-3-small',
api_token='your-api-key',
temperature=0.7
)
)
# Alternative: Dictionary format (backward compatible)
# With custom embedding provider (e.g., OpenAI)
config = AdaptiveConfig(
strategy="embedding",
embedding_llm_config={

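The documentation snippet above is cut off by the page view; a hedged completion using the dict keys that appear elsewhere in this compare (the import follows the other examples here, and the API key value is a placeholder):

from crawl4ai import AdaptiveConfig

config = AdaptiveConfig(
    strategy="embedding",
    embedding_llm_config={
        'provider': 'openai/text-embedding-3-small',  # dict format, per the updated docs
        'api_token': 'your-api-key',                  # placeholder
    }
)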
View File

@@ -1,154 +0,0 @@
import asyncio
import os
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, LLMConfig
async def test_configuration(name: str, config: AdaptiveConfig, url: str, query: str):
"""Test a specific configuration"""
print(f"\n{'='*60}")
print(f"Configuration: {name}")
print(f"{'='*60}")
async with AsyncWebCrawler(verbose=False) as crawler:
adaptive = AdaptiveCrawler(crawler, config)
result = await adaptive.digest(start_url=url, query=query)
print("\n" + "="*50)
print("CRAWL STATISTICS")
print("="*50)
adaptive.print_stats(detailed=False)
# Get the most relevant content found
print("\n" + "="*50)
print("MOST RELEVANT PAGES")
print("="*50)
relevant_pages = adaptive.get_relevant_content(top_k=5)
for i, page in enumerate(relevant_pages, 1):
print(f"\n{i}. {page['url']}")
print(f" Relevance Score: {page['score']:.2%}")
# Show a snippet of the content
content = page['content'] or ""
if content:
snippet = content[:200].replace('\n', ' ')
if len(content) > 200:
snippet += "..."
print(f" Preview: {snippet}")
print(f"\n{'='*50}")
print(f"Pages crawled: {len(result.crawled_urls)}")
print(f"Final confidence: {adaptive.confidence:.1%}")
print(f"Stopped reason: {result.metrics.get('stopped_reason', 'max_pages')}")
if result.metrics.get('is_irrelevant', False):
print("⚠️ Query detected as irrelevant!")
return result
async def llm_embedding():
"""Demonstrate various embedding configurations"""
print("EMBEDDING STRATEGY CONFIGURATION EXAMPLES")
print("=" * 60)
# Base URL and query for testing
test_url = "https://docs.python.org/3/library/asyncio.html"
openai_llm_config = LLMConfig(
provider='openai/text-embedding-3-small',
api_token=os.getenv('OPENAI_API_KEY'),
temperature=0.7,
max_tokens=2000
)
config_openai = AdaptiveConfig(
strategy="embedding",
max_pages=10,
# Use OpenAI embeddings
embedding_llm_config=openai_llm_config,
# embedding_llm_config={
# 'provider': 'openai/text-embedding-3-small',
# 'api_token': os.getenv('OPENAI_API_KEY')
# },
# OpenAI embeddings are high quality, can be stricter
embedding_k_exp=4.0,
n_query_variations=12
)
await test_configuration(
"OpenAI Embeddings",
config_openai,
test_url,
# "event-driven architecture patterns"
"async await context managers coroutines"
)
return
async def basic_adaptive_crawling():
"""Basic adaptive crawling example"""
# Initialize the crawler
async with AsyncWebCrawler(verbose=True) as crawler:
# Create an adaptive crawler with default settings (statistical strategy)
adaptive = AdaptiveCrawler(crawler)
# Note: You can also use embedding strategy for semantic understanding:
# from crawl4ai import AdaptiveConfig
# config = AdaptiveConfig(strategy="embedding")
# adaptive = AdaptiveCrawler(crawler, config)
# Start adaptive crawling
print("Starting adaptive crawl for Python async programming information...")
result = await adaptive.digest(
start_url="https://docs.python.org/3/library/asyncio.html",
query="async await context managers coroutines"
)
# Display crawl statistics
print("\n" + "="*50)
print("CRAWL STATISTICS")
print("="*50)
adaptive.print_stats(detailed=False)
# Get the most relevant content found
print("\n" + "="*50)
print("MOST RELEVANT PAGES")
print("="*50)
relevant_pages = adaptive.get_relevant_content(top_k=5)
for i, page in enumerate(relevant_pages, 1):
print(f"\n{i}. {page['url']}")
print(f" Relevance Score: {page['score']:.2%}")
# Show a snippet of the content
content = page['content'] or ""
if content:
snippet = content[:200].replace('\n', ' ')
if len(content) > 200:
snippet += "..."
print(f" Preview: {snippet}")
# Show final confidence
print(f"\n{'='*50}")
print(f"Final Confidence: {adaptive.confidence:.2%}")
print(f"Total Pages Crawled: {len(result.crawled_urls)}")
print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")
if adaptive.confidence >= 0.8:
print("✓ High confidence - can answer detailed questions about async Python")
elif adaptive.confidence >= 0.6:
print("~ Moderate confidence - can answer basic questions")
else:
print("✗ Low confidence - need more information")
if __name__ == "__main__":
asyncio.run(llm_embedding())
# asyncio.run(basic_adaptive_crawling())

View File

@@ -143,7 +143,40 @@ class TestCrawlEndpoints:
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
# It might be null, missing, or populated depending on the server's default behavior
async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient):
"""Test that /crawl endpoint handles stream=True directly without redirect."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True, # Set stream to True for direct streaming
"screenshot": False,
"cache_mode": CacheMode.BYPASS.value
}
}
}
# Send a request to the /crawl endpoint - should handle streaming directly
async with async_client.stream("POST", "/crawl", json=payload) as response:
assert response.status_code == 200
assert response.headers["content-type"] == "application/x-ndjson"
assert response.headers.get("x-stream-status") == "active"
results = await process_streaming_response(response)
assert len(results) == 1
result = results[0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] == SIMPLE_HTML_URL
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/stream with a single URL and simple config values."""
payload = {