Compare commits
3 Commits
codex/fix-
...
next-2-bat
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
72d8e679ad | ||
|
|
67a790b4a6 | ||
|
|
d95b2dc9f2 |
@@ -4,6 +4,12 @@ import warnings
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig
|
||||
|
||||
from .pipeline.pipeline import (
|
||||
Pipeline,
|
||||
create_pipeline,
|
||||
)
|
||||
from .pipeline.crawler import Crawler
|
||||
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy,
|
||||
WebScrapingStrategy,
|
||||
@@ -65,7 +71,14 @@ from .deep_crawling import (
|
||||
DeepCrawlDecorator,
|
||||
)
|
||||
|
||||
from .async_crawler_strategy import AsyncPlaywrightCrawlerStrategy, AsyncHTTPCrawlerStrategy
|
||||
|
||||
__all__ = [
|
||||
"Pipeline",
|
||||
"AsyncPlaywrightCrawlerStrategy",
|
||||
"AsyncHTTPCrawlerStrategy",
|
||||
"create_pipeline",
|
||||
"Crawler",
|
||||
"AsyncLoggerBase",
|
||||
"AsyncLogger",
|
||||
"AsyncWebCrawler",
|
||||
|
||||
@@ -270,7 +270,7 @@ class BrowserConfig:
|
||||
host: str = "localhost",
|
||||
):
|
||||
self.browser_type = browser_type
|
||||
self.headless = headless or True
|
||||
self.headless = headless
|
||||
self.browser_mode = browser_mode
|
||||
self.use_managed_browser = use_managed_browser
|
||||
self.cdp_url = cdp_url
|
||||
|
||||
@@ -625,7 +625,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
except Error:
|
||||
visibility_info = await self.check_visibility(page)
|
||||
|
||||
if self.config.verbose:
|
||||
if config.verbose:
|
||||
self.logger.debug(
|
||||
message="Body visibility info: {info}",
|
||||
tag="DEBUG",
|
||||
|
||||
@@ -9,6 +9,7 @@ from .profiles import BrowserProfileManager
|
||||
from .models import DockerConfig
|
||||
from .docker_registry import DockerRegistry
|
||||
from .docker_utils import DockerUtils
|
||||
from .browser_hub import BrowserHub
|
||||
from .strategies import (
|
||||
BaseBrowserStrategy,
|
||||
PlaywrightBrowserStrategy,
|
||||
@@ -19,4 +20,4 @@ from .strategies import (
|
||||
|
||||
__all__ = ['BrowserManager', 'BrowserProfileManager', 'DockerConfig', 'DockerRegistry', 'DockerUtils', 'BaseBrowserStrategy',
|
||||
'PlaywrightBrowserStrategy', 'CDPBrowserStrategy', 'BuiltinBrowserStrategy',
|
||||
'DockerBrowserStrategy']
|
||||
'DockerBrowserStrategy', 'BrowserHub']
|
||||
@@ -672,7 +672,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
block["error"] = False
|
||||
except Exception:
|
||||
parsed, unparsed = split_and_parse_json_objects(
|
||||
response.choices[0].message.content
|
||||
response
|
||||
)
|
||||
blocks = parsed
|
||||
if unparsed:
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from pydantic import BaseModel, HttpUrl, PrivateAttr
|
||||
from pydantic import BaseModel, HttpUrl, PrivateAttr, ConfigDict
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||
from typing import AsyncGenerator
|
||||
from typing import Generic, TypeVar
|
||||
@@ -146,8 +146,9 @@ class CrawlResult(BaseModel):
|
||||
dispatch_result: Optional[DispatchResult] = None
|
||||
redirected_url: Optional[str] = None
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
# class Config:
|
||||
# arbitrary_types_allowed = True
|
||||
|
||||
# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters,
|
||||
# and model_dump override all exist to support a smooth transition from markdown as a string
|
||||
@@ -312,8 +313,9 @@ class AsyncCrawlResponse(BaseModel):
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
redirected_url: Optional[str] = None
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
# class Config:
|
||||
# arbitrary_types_allowed = True
|
||||
|
||||
###############################
|
||||
# Scraping Models
|
||||
|
||||
6
crawl4ai/pipeline/__init__.py
Normal file
6
crawl4ai/pipeline/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""Pipeline module providing high-level crawling functionality."""
|
||||
|
||||
from .pipeline import Pipeline, create_pipeline
|
||||
from .crawler import Crawler
|
||||
|
||||
__all__ = ["Pipeline", "create_pipeline", "Crawler"]
|
||||
406
crawl4ai/pipeline/crawler.py
Normal file
406
crawl4ai/pipeline/crawler.py
Normal file
@@ -0,0 +1,406 @@
|
||||
"""Crawler utility class for simplified crawling operations.
|
||||
|
||||
This module provides a high-level utility class for crawling web pages
|
||||
with support for both single and multiple URL processing.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import Dict, List, Optional, Tuple, Union, Callable
|
||||
|
||||
from crawl4ai.models import CrawlResultContainer, CrawlResult
|
||||
from crawl4ai.pipeline.pipeline import create_pipeline
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from crawl4ai.browser.browser_hub import BrowserHub
|
||||
|
||||
# Type definitions for the batch formats accepted by Crawler.parallel_crawl.
UrlList = List[str]
UrlBatch = Tuple[List[str], CrawlerRunConfig]
UrlFullBatch = Tuple[List[str], BrowserConfig, CrawlerRunConfig]
BatchType = Union[UrlList, UrlBatch, UrlFullBatch]
ProgressCallback = Callable[[str, str, Optional[CrawlResultContainer]], None]
RetryStrategy = Callable[[str, int, Exception], Tuple[bool, float]]


class Crawler:
    """High-level utility class for crawling web pages.

    This class provides simplified methods for crawling both single URLs
    and batches of URLs, with parallel processing capabilities.
    """

    @classmethod
    async def crawl(
        cls,
        urls: Union[str, List[str]],
        browser_config: Optional[BrowserConfig] = None,
        crawler_config: Optional[CrawlerRunConfig] = None,
        browser_hub: Optional[BrowserHub] = None,
        logger: Optional[AsyncLogger] = None,
        max_retries: int = 0,
        retry_delay: float = 1.0,
        use_new_loop: bool = True  # By default use a new loop for safety
    ) -> Union[CrawlResultContainer, Dict[str, CrawlResultContainer]]:
        """Crawl one or more URLs with the specified configurations.

        Args:
            urls: Single URL or list of URLs to crawl
            browser_config: Optional browser configuration
            crawler_config: Optional crawler run configuration
            browser_hub: Optional shared browser hub
            logger: Optional logger instance
            max_retries: Maximum number of retries for failed requests
            retry_delay: Delay between retries in seconds
            use_new_loop: Whether to swap in a fresh event loop per crawl

        Returns:
            For a single URL: CrawlResultContainer with crawl results
            For multiple URLs: Dict mapping URLs to their CrawlResultContainer results
        """
        # Single URL: delegate directly.
        if isinstance(urls, str):
            return await cls._crawl_single_url(
                urls,
                browser_config,
                crawler_config,
                browser_hub,
                logger,
                max_retries,
                retry_delay,
                use_new_loop
            )

        # Multiple URLs: crawl sequentially, collecting one result per URL.
        results: Dict[str, CrawlResultContainer] = {}
        for url in urls:
            results[url] = await cls._crawl_single_url(
                url,
                browser_config,
                crawler_config,
                browser_hub,
                logger,
                max_retries,
                retry_delay,
                use_new_loop
            )

        return results

    @classmethod
    async def _crawl_single_url(
        cls,
        url: str,
        browser_config: Optional[BrowserConfig] = None,
        crawler_config: Optional[CrawlerRunConfig] = None,
        browser_hub: Optional[BrowserHub] = None,
        logger: Optional[AsyncLogger] = None,
        max_retries: int = 0,
        retry_delay: float = 1.0,
        use_new_loop: bool = False
    ) -> CrawlResultContainer:
        """Internal method to crawl a single URL with retry logic.

        Returns the successful CrawlResultContainer, or a failure container
        describing the last error after all attempts are exhausted.
        """
        # Create a logger if none provided
        if logger is None:
            logger = AsyncLogger(verbose=True)

        # Create or use the provided crawler config
        if crawler_config is None:
            crawler_config = CrawlerRunConfig()

        attempts = 0
        last_error = None

        # For testing purposes, each crawler gets a new event loop to avoid conflicts.
        # NOTE(review): calling asyncio.set_event_loop inside an already-running
        # coroutine does not migrate execution onto the new loop — the awaits
        # below still run on the caller's loop. Kept for compatibility; confirm
        # whether this machinery is still needed.
        if use_new_loop:
            old_loop = asyncio.get_event_loop()
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        while attempts <= max_retries:
            try:
                # Build pipeline arguments from the optional pieces we were given.
                pipeline_args = {}
                if browser_config:
                    pipeline_args["browser_config"] = browser_config
                if browser_hub:
                    pipeline_args["browser_hub"] = browser_hub
                if logger:
                    pipeline_args["logger"] = logger

                pipeline = await create_pipeline(**pipeline_args)
                try:
                    # Perform the crawl
                    result = await pipeline.crawl(url=url, config=crawler_config)
                finally:
                    # Close the pipeline we created even when the crawl raised,
                    # so retries do not leak browser resources. A shared hub is
                    # owned by the caller and is left open.
                    if not browser_hub:
                        await pipeline.close()

                # Restore the original event loop if we created a new one
                if use_new_loop:
                    asyncio.set_event_loop(old_loop)
                    loop.close()

                return result

            except Exception as e:
                last_error = e
                attempts += 1

                if attempts <= max_retries:
                    logger.warning(
                        message="Crawl attempt {attempt} failed for {url}: {error}. Retrying in {delay}s...",
                        tag="RETRY",
                        params={
                            "attempt": attempts,
                            "url": url,
                            "error": str(e),
                            "delay": retry_delay
                        }
                    )
                    await asyncio.sleep(retry_delay)
                else:
                    logger.error(
                        message="All {attempts} crawl attempts failed for {url}: {error}",
                        tag="FAILED",
                        params={
                            "attempts": attempts,
                            "url": url,
                            "error": str(e)
                        }
                    )

        # If we get here, all attempts failed — build a failure container.
        result = CrawlResultContainer(
            CrawlResult(
                url=url,
                html="",
                success=False,
                error_message=f"All {attempts} crawl attempts failed: {str(last_error)}"
            )
        )

        # Restore the original event loop if we created a new one
        if use_new_loop:
            asyncio.set_event_loop(old_loop)
            loop.close()

        return result

    @classmethod
    async def parallel_crawl(
        cls,
        url_batches: Union[List[str], List[Union[UrlBatch, UrlFullBatch]]],
        browser_config: Optional[BrowserConfig] = None,
        crawler_config: Optional[CrawlerRunConfig] = None,
        browser_hub: Optional[BrowserHub] = None,
        logger: Optional[AsyncLogger] = None,
        concurrency: int = 5,
        max_retries: int = 0,
        retry_delay: float = 1.0,
        retry_strategy: Optional[RetryStrategy] = None,
        progress_callback: Optional[ProgressCallback] = None,
        use_new_loop: bool = True  # By default use a new loop for safety
    ) -> Dict[str, CrawlResultContainer]:
        """Crawl multiple URLs in parallel with concurrency control.

        Args:
            url_batches: List of URLs or list of URL batches with configurations
            browser_config: Default browser configuration (used if not in batch)
            crawler_config: Default crawler configuration (used if not in batch)
            browser_hub: Optional shared browser hub for resource efficiency
            logger: Optional logger instance
            concurrency: Maximum number of concurrent crawls
            max_retries: Maximum number of retries for failed requests
            retry_delay: Delay between retries in seconds
            retry_strategy: Optional custom retry strategy function
            progress_callback: Optional callback for progress reporting
            use_new_loop: Whether to swap in a fresh event loop for the run

        Returns:
            Dict mapping URLs to their CrawlResultContainer results
        """
        # Create a logger if none provided
        if logger is None:
            logger = AsyncLogger(verbose=True)

        # NOTE(review): see _crawl_single_url — this loop swap does not affect
        # where the awaits below execute; kept for compatibility.
        if use_new_loop:
            old_loop = asyncio.get_event_loop()
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        # Normalize all accepted input shapes to (urls, browser_cfg, crawler_cfg).
        processed_batches = cls._process_url_batches(
            url_batches, browser_config, crawler_config
        )

        results: Dict[str, CrawlResultContainer] = {}

        # Semaphore bounds how many crawls run at once.
        semaphore = asyncio.Semaphore(concurrency)

        # Create shared browser hub if not provided
        shared_hub = browser_hub
        if not shared_hub:
            shared_hub = await BrowserHub.get_browser_manager(
                config=browser_config or BrowserConfig(),
                logger=logger,
                max_browsers_per_config=concurrency,
                max_pages_per_browser=1,
                initial_pool_size=min(concurrency, 3)  # Start with a reasonable number
            )

        try:
            async def process_url(url, b_config, c_config):
                """Crawl one URL through the shared hub, with retry handling."""
                async with semaphore:
                    # Report start if callback provided
                    if progress_callback:
                        await progress_callback("started", url)

                    attempts = 0
                    last_error = None

                    while attempts <= max_retries:
                        try:
                            # Create a pipeline using the shared hub. Pipelines
                            # that use the shared hub are intentionally not
                            # closed per-URL; the hub is torn down in `finally`.
                            pipeline = await create_pipeline(
                                browser_config=b_config,
                                browser_hub=shared_hub,
                                logger=logger
                            )

                            result = await pipeline.crawl(url=url, config=c_config)

                            # Report completion if callback provided
                            if progress_callback:
                                await progress_callback("completed", url, result)

                            return url, result

                        except Exception as e:
                            last_error = e
                            attempts += 1

                            # Determine if we should retry and with what delay
                            should_retry = attempts <= max_retries
                            delay = retry_delay

                            # Use custom retry strategy if provided.
                            # NOTE(review): the RetryStrategy alias is typed as a
                            # sync callable but is awaited here — confirm intent.
                            if retry_strategy and should_retry:
                                try:
                                    should_retry, delay = await retry_strategy(url, attempts, e)
                                except Exception as strategy_error:
                                    logger.error(
                                        message="Error in retry strategy: {error}",
                                        tag="RETRY",
                                        params={"error": str(strategy_error)}
                                    )

                            if should_retry:
                                logger.warning(
                                    message="Crawl attempt {attempt} failed for {url}: {error}. Retrying in {delay}s...",
                                    tag="RETRY",
                                    params={
                                        "attempt": attempts,
                                        "url": url,
                                        "error": str(e),
                                        "delay": delay
                                    }
                                )
                                await asyncio.sleep(delay)
                            else:
                                logger.error(
                                    message="All {attempts} crawl attempts failed for {url}: {error}",
                                    tag="FAILED",
                                    params={
                                        "attempts": attempts,
                                        "url": url,
                                        "error": str(e)
                                    }
                                )
                                break

                    # All attempts failed — build a failure container.
                    error_result = CrawlResultContainer(
                        CrawlResult(
                            url=url,
                            html="",
                            success=False,
                            error_message=f"All {attempts} crawl attempts failed: {str(last_error)}"
                        )
                    )

                    # Report completion with error if callback provided
                    if progress_callback:
                        await progress_callback("completed", url, error_result)

                    return url, error_result

            # Fan out one task per URL across all batches.
            tasks = []
            for urls, b_config, c_config in processed_batches:
                for url in urls:
                    tasks.append(process_url(url, b_config, c_config))

            # Collect results as they complete.
            for completed_task in asyncio.as_completed(tasks):
                url, result = await completed_task
                results[url] = result

            return results

        finally:
            # Clean up the hub only if we created it
            if not browser_hub and shared_hub:
                await shared_hub.close()

            # Restore the original event loop if we created a new one
            if use_new_loop:
                asyncio.set_event_loop(old_loop)
                loop.close()

    @classmethod
    def _process_url_batches(
        cls,
        url_batches: Union[List[str], List[Union[UrlBatch, UrlFullBatch]]],
        default_browser_config: Optional[BrowserConfig],
        default_crawler_config: Optional[CrawlerRunConfig]
    ) -> List[Tuple[List[str], BrowserConfig, CrawlerRunConfig]]:
        """Process URL batches into a consistent format.

        Converts various input formats into a consistent list of
        (urls, browser_config, crawler_config) tuples.
        """
        processed_batches = []

        # Handle case where input is just a list of URLs
        if all(isinstance(item, str) for item in url_batches):
            urls = url_batches
            browser_config = default_browser_config or BrowserConfig()
            crawler_config = default_crawler_config or CrawlerRunConfig()
            processed_batches.append((urls, browser_config, crawler_config))
            return processed_batches

        # Process each batch
        for batch in url_batches:
            # A bare URL string mixed in with tuple batches: wrap it as its own
            # batch (previously it fell through to len()/char-iteration bugs).
            if isinstance(batch, str):
                processed_batches.append((
                    [batch],
                    default_browser_config or BrowserConfig(),
                    default_crawler_config or CrawlerRunConfig()
                ))

            # Handle case: (urls, crawler_config)
            elif len(batch) == 2 and isinstance(batch[1], CrawlerRunConfig):
                urls, c_config = batch
                b_config = default_browser_config or BrowserConfig()
                processed_batches.append((urls, b_config, c_config))

            # Handle case: (urls, browser_config, crawler_config)
            elif len(batch) == 3 and isinstance(batch[1], BrowserConfig) and isinstance(batch[2], CrawlerRunConfig):
                processed_batches.append(batch)

            # Fallback for unknown formats - assume it's just a list of URLs
            else:
                urls = batch
                browser_config = default_browser_config or BrowserConfig()
                crawler_config = default_crawler_config or CrawlerRunConfig()
                processed_batches.append((urls, browser_config, crawler_config))

        return processed_batches
|
||||
702
crawl4ai/pipeline/middlewares.py
Normal file
702
crawl4ai/pipeline/middlewares.py
Normal file
@@ -0,0 +1,702 @@
|
||||
import time
|
||||
import sys
|
||||
from typing import Dict, Any, List
|
||||
import json
|
||||
|
||||
from crawl4ai.models import (
|
||||
CrawlResult,
|
||||
MarkdownGenerationResult,
|
||||
ScrapingResult,
|
||||
CrawlResultContainer,
|
||||
)
|
||||
from crawl4ai.async_database import async_db_manager
|
||||
from crawl4ai.cache_context import CacheMode, CacheContext
|
||||
from crawl4ai.utils import (
|
||||
sanitize_input_encode,
|
||||
InvalidCSSSelectorError,
|
||||
fast_format_html,
|
||||
create_box_message,
|
||||
get_error_context,
|
||||
)
|
||||
|
||||
|
||||
async def initialize_context_middleware(context: Dict[str, Any]) -> int:
    """Initialize the context with basic configuration and validation.

    Validates the target URL, defaults the cache mode to ENABLED, and seeds
    the context with a CacheContext and a start timestamp.

    Args:
        context: The pipeline context dictionary (expects "url" and "config").

    Returns:
        int: 1 for success, 0 for failure ("error_message" is set on failure).
    """
    url = context.get("url")
    config = context.get("config")

    if not isinstance(url, str) or not url:
        context["error_message"] = "Invalid URL, make sure the URL is a non-empty string"
        return 0

    # Guard against a missing config instead of raising AttributeError below.
    if config is None:
        context["error_message"] = "Missing crawler run config in context"
        return 0

    # Default to ENABLED if no cache mode specified
    if config.cache_mode is None:
        config.cache_mode = CacheMode.ENABLED

    # Create cache context
    context["cache_context"] = CacheContext(url, config.cache_mode, False)
    context["start_time"] = time.perf_counter()

    return 1
|
||||
|
||||
# middlewares.py additions
|
||||
|
||||
async def browser_hub_middleware(context: Dict[str, Any]) -> int:
    """
    Initialize or connect to a Browser-Hub and add it to the pipeline context.

    Covers all three scenarios: default configuration when nothing is
    specified, custom configuration via "browser_config", and connection to
    an existing hub via "browser_hub_connection".

    Args:
        context: The pipeline context dictionary

    Returns:
        int: 1 for success, 0 for failure
    """
    from crawl4ai.browser.browser_hub import BrowserHub

    try:
        # A hub placed in context by an earlier middleware wins outright.
        if context.get("browser_hub"):
            return 1

        # Resolve or build the hub from whatever configuration is present.
        hub = await BrowserHub.get_browser_manager(
            config=context.get("browser_config"),
            hub_id=context.get("browser_hub_id"),
            connection_info=context.get("browser_hub_connection"),
            logger=context.get("logger"),
        )

        context["browser_hub"] = hub
        return 1
    except Exception as e:
        context["error_message"] = f"Failed to initialize browser hub: {str(e)}"
        return 0
|
||||
|
||||
|
||||
async def fetch_content_middleware(context: Dict[str, Any]) -> int:
    """
    Fetch content from the web using the browser hub.

    Builds a Playwright crawler strategy around the shared browser hub so
    pages are borrowed from (and returned to) the pool, then stores the
    fetched artifacts in the context.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: 1 for success, 0 for failure
    """
    # A cached page needs no network fetch.
    if context.get("cached_result") and context.get("html"):
        return 1

    url = context.get("url")
    config = context.get("config")
    browser_hub = context.get("browser_hub")
    logger = context.get("logger")

    try:
        # Create crawler strategy without initializing its browser manager
        from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

        strategy = AsyncPlaywrightCrawlerStrategy(
            browser_config=browser_hub.config if browser_hub else None,
            logger=logger,
        )

        # Swap in the shared hub so no private browser manager is spun up.
        strategy.browser_manager = browser_hub

        # The strategy borrows pages from the hub during the crawl.
        response = await strategy.crawl(url, config=config)

        # Publish fetch artifacts for downstream middlewares.
        context["html"] = response.html
        context["screenshot_data"] = response.screenshot
        context["pdf_data"] = response.pdf_data
        context["js_execution_result"] = response.js_execution_result
        context["async_response"] = response

        return 1
    except Exception as e:
        context["error_message"] = f"Error fetching content: {str(e)}"
        return 0
|
||||
|
||||
|
||||
async def check_cache_middleware(context: Dict[str, Any]) -> int:
    """Check if there's a cached result and load it if available.

    Always resets the cache-derived context keys first; on a cache hit,
    populates them and logs the fetch. A hit missing a requested screenshot
    or PDF is invalidated (cached_result set back to None) so the fetch
    middleware runs, while the cached fields remain available.
    """
    url = context.get("url")
    config = context.get("config")
    cache_context = context.get("cache_context")
    logger = context.get("logger")

    # Start every run from a clean slate for the cache-derived fields.
    for key in ("cached_result", "html", "extracted_content",
                "screenshot_data", "pdf_data"):
        context[key] = None

    # Nothing to do unless this cache mode permits reads.
    if not cache_context.should_read():
        return 1

    cached = await async_db_manager.aget_cached_url(url)
    context["cached_result"] = cached

    if cached:
        html = sanitize_input_encode(cached.html)
        extracted = sanitize_input_encode(cached.extracted_content or "")
        extracted = None if not extracted or extracted == "[]" else extracted

        screenshot_data = cached.screenshot
        pdf_data = cached.pdf

        # Invalidate the hit when a requested artifact is absent from cache.
        if config.screenshot and not screenshot_data:
            context["cached_result"] = None
        if config.pdf and not pdf_data:
            context["cached_result"] = None

        context["html"] = html
        context["extracted_content"] = extracted
        context["screenshot_data"] = screenshot_data
        context["pdf_data"] = pdf_data

        logger.url_status(
            url=cache_context.display_url,
            success=bool(html),
            timing=time.perf_counter() - context["start_time"],
            tag="FETCH",
        )

    return 1
|
||||
|
||||
|
||||
async def configure_proxy_middleware(context: Dict[str, Any]) -> int:
    """Configure proxy if a proxy rotation strategy is available.

    Asks the config's rotation strategy for the next proxy and, when one is
    returned, logs the switch and stores it on the config.
    """
    # Cached responses never hit the network, so no proxy is required.
    if context.get("cached_result") and context.get("html"):
        return 1

    config = context.get("config")
    logger = context.get("logger")

    rotation = config.proxy_rotation_strategy if config else None
    if rotation:
        next_proxy = await rotation.get_next_proxy()
        if next_proxy:
            logger.info(
                message="Switch proxy: {proxy}",
                tag="PROXY",
                params={"proxy": next_proxy.server},
            )
            config.proxy_config = next_proxy

    return 1
|
||||
|
||||
|
||||
async def check_robots_txt_middleware(context: Dict[str, Any]) -> int:
    """Check if the URL is allowed by robots.txt if enabled.

    On a disallowed URL, stores a 403 failure CrawlResult in the context
    and aborts the pipeline (returns 0); otherwise passes through.
    """
    # A cached result was fetched previously — robots was already honored.
    if context.get("cached_result") and context.get("html"):
        return 1

    config = context.get("config")
    if not (config and config.check_robots_txt):
        return 1

    url = context.get("url")
    browser_config = context.get("browser_config")
    robots_parser = context.get("robots_parser")

    if not await robots_parser.can_fetch(url, browser_config.user_agent):
        context["crawl_result"] = CrawlResult(
            url=url,
            html="",
            success=False,
            status_code=403,
            error_message="Access denied by robots.txt",
            response_headers={"X-Robots-Status": "Blocked by robots.txt"}
        )
        return 0

    return 1
|
||||
|
||||
|
||||
async def fetch_content_middleware_(context: Dict[str, Any]) -> int:
    """Fetch content from the web using the crawler strategy.

    Legacy variant that drives the context-provided crawler strategy
    directly (no browser hub); stores the fetched artifacts in the context.
    """
    # Nothing to fetch when a cached result is already loaded.
    if context.get("cached_result") and context.get("html"):
        return 1

    url = context.get("url")
    config = context.get("config")
    crawler_strategy = context.get("crawler_strategy")
    logger = context.get("logger")

    try:
        started = time.perf_counter()

        # Honor a per-run user agent override.
        if config.user_agent:
            crawler_strategy.update_user_agent(config.user_agent)

        # Call CrawlerStrategy.crawl
        response = await crawler_strategy.crawl(url, config=config)

        html = sanitize_input_encode(response.html)

        logger.url_status(
            url=context["cache_context"].display_url,
            success=bool(html),
            timing=time.perf_counter() - started,
            tag="FETCH",
        )

        # Publish fetch artifacts for downstream middlewares.
        context["html"] = html
        context["screenshot_data"] = response.screenshot
        context["pdf_data"] = response.pdf_data
        context["js_execution_result"] = response.js_execution_result
        context["async_response"] = response

        return 1
    except Exception as e:
        context["error_message"] = f"Error fetching content: {str(e)}"
        return 0
|
||||
|
||||
|
||||
async def scrape_content_middleware(context: Dict[str, Any]) -> int:
    """Apply scraping strategy to extract content.

    Runs the config's scraping strategy over the fetched HTML and stores
    cleaned_html, media, links, and metadata in the context.

    Args:
        context: The pipeline context dictionary.

    Returns:
        int: 1 for success, 0 for failure ("error_message" is set on failure).
    """
    url = context.get("url")
    html = context.get("html")
    config = context.get("config")
    logger = context.get("logger")

    # Skip if already have a crawl result
    if context.get("crawl_result"):
        return 1

    try:
        _url = url if not context.get("is_raw_html", False) else "Raw HTML"
        t1 = time.perf_counter()

        # Get scraping strategy and ensure it has a logger
        scraping_strategy = config.scraping_strategy
        if not scraping_strategy.logger:
            scraping_strategy.logger = logger

        # Build the parameter set from the config, letting extra kwargs fill
        # in only keys the config does not already define.
        params = config.__dict__.copy()
        params.pop("url", None)
        kwargs = context.get("kwargs", {})
        params.update({k: v for k, v in kwargs.items() if k not in params})

        # Scraping Strategy Execution
        result: ScrapingResult = scraping_strategy.scrap(url, html, **params)

        if result is None:
            raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")

        # Extract results - handle both dict and ScrapingResult
        if isinstance(result, dict):
            cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
            media = result.get("media", {})
            links = result.get("links", {})
            metadata = result.get("metadata", {})
        else:
            cleaned_html = sanitize_input_encode(result.cleaned_html)
            media = result.media.model_dump()
            links = result.links.model_dump()
            metadata = result.metadata

        context["cleaned_html"] = cleaned_html
        context["media"] = media
        context["links"] = links
        context["metadata"] = metadata

        # Log processing completion
        logger.info(
            message="{url:.50}... | Time: {timing}s",
            tag="SCRAPE",
            params={
                "url": _url,
                "timing": int((time.perf_counter() - t1) * 1000) / 1000,
            },
        )

        return 1
    except InvalidCSSSelectorError as e:
        context["error_message"] = str(e)
        return 0
    except Exception as e:
        context["error_message"] = f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}"
        return 0
|
||||
|
||||
|
||||
async def generate_markdown_middleware(context: Dict[str, Any]) -> int:
    """Generate markdown from cleaned HTML.

    Runs the config's markdown generator over the cleaned HTML and stores
    the MarkdownGenerationResult in the context.
    """
    # Skip if already have a crawl result
    if context.get("crawl_result"):
        return 1

    config = context.get("config")
    generator = config.markdown_generator

    markdown_result: MarkdownGenerationResult = generator.generate_markdown(
        cleaned_html=context.get("cleaned_html"),
        base_url=context.get("url"),
    )

    context["markdown_result"] = markdown_result
    return 1
|
||||
|
||||
|
||||
async def extract_structured_content_middleware(context: Dict[str, Any]) -> int:
    """Extract structured content using extraction strategy.

    When the config carries a real extraction strategy, selects the content
    format, chunks it, runs the strategy, and stores the JSON-encoded result
    in the context. No-op when a result or extracted content already exists.
    """
    url = context.get("url")
    extracted_content = context.get("extracted_content")
    config = context.get("config")
    markdown_result = context.get("markdown_result")
    cleaned_html = context.get("cleaned_html")
    logger = context.get("logger")

    # Skip if already have a crawl result or extracted content
    if context.get("crawl_result") or bool(extracted_content):
        return 1

    from crawl4ai.chunking_strategy import IdentityChunking
    from crawl4ai.extraction_strategy import NoExtractionStrategy

    strategy = config.extraction_strategy
    if strategy and not isinstance(strategy, NoExtractionStrategy):
        t1 = time.perf_counter()
        _url = url if not context.get("is_raw_html", False) else "Raw HTML"

        # Choose content based on input_format, falling back to raw markdown
        # when fit_markdown was requested but never produced.
        content_format = strategy.input_format
        if content_format == "fit_markdown" and not markdown_result.fit_markdown:
            logger.warning(
                message="Fit markdown requested but not available. Falling back to raw markdown.",
                tag="EXTRACT",
                params={"url": _url},
            )
            content_format = "markdown"

        content = {
            "markdown": markdown_result.raw_markdown,
            "html": context.get("html"),
            "cleaned_html": cleaned_html,
            "fit_markdown": markdown_result.fit_markdown,
        }.get(content_format, markdown_result.raw_markdown)

        # HTML inputs are chunked as-is; otherwise use the configured chunker.
        chunking = (
            IdentityChunking()
            if content_format in ["html", "cleaned_html"]
            else config.chunking_strategy
        )
        sections = chunking.chunk(content)
        extracted = strategy.run(url, sections)
        extracted = json.dumps(
            extracted, indent=4, default=str, ensure_ascii=False
        )

        context["extracted_content"] = extracted

        # Log extraction completion
        logger.info(
            message="Completed for {url:.50}... | Time: {timing}s",
            tag="EXTRACT",
            params={"url": _url, "timing": time.perf_counter() - t1},
        )

    return 1
|
||||
|
||||
|
||||
async def format_html_middleware(context: Dict[str, Any]) -> int:
    """Pretty-print the cleaned HTML when the run config asks for it.

    Always returns 1; skips when a finished crawl result already exists.
    """
    run_config = context.get("config")
    cleaned = context.get("cleaned_html")

    # Earlier stage already produced the final result -- nothing to format.
    if context.get("crawl_result"):
        return 1

    # NOTE: 'prettiify' (double i) matches the attribute name on the run
    # config -- do not "fix" the spelling here.
    if run_config.prettiify and cleaned:
        context["cleaned_html"] = fast_format_html(cleaned)

    return 1
|
||||
|
||||
|
||||
async def write_cache_middleware(context: Dict[str, Any]) -> int:
    """Placeholder stage for cache writes.

    The CrawlResult is actually created *and* cached in
    build_result_middleware so it is only built once; this stage exists to
    keep the write decision visible in the pipeline. Always returns 1.
    """
    cache_ctx = context.get("cache_context")
    cached = context.get("cached_result")

    # Skip when the result is already final, caching is disabled, or the
    # page came from the cache in the first place.
    if context.get("crawl_result") or not cache_ctx.should_write() or bool(cached):
        return 1

    return 1
|
||||
|
||||
|
||||
async def build_result_middleware(context: Dict[str, Any]) -> int:
    """Assemble the final CrawlResult for the pipeline.

    Three paths:
      1. an earlier middleware already produced ``crawl_result`` -> wrap it;
      2. a cache hit (``cached_result``) -> refresh bookkeeping and wrap it;
      3. otherwise build a fresh CrawlResult from the scraped pieces, log
         completion and write it to the cache when the policy allows.

    The wrapped result lands in ``context["final_result"]``. Always returns
    1 -- build failures are converted into a failed CrawlResult instead of
    aborting the pipeline.
    """
    url = context.get("url")
    html = context.get("html", "")
    cache_context = context.get("cache_context")
    cached_result = context.get("cached_result")
    config = context.get("config")
    logger = context.get("logger")

    # Path 1: result produced earlier (e.g. a robots.txt denial).
    if context.get("crawl_result"):
        context["final_result"] = CrawlResultContainer(context["crawl_result"])
        return 1

    # Path 2: serve from cache.
    if cached_result and html:
        logger.success(
            message="{url:.50}... | Status: {status} | Total: {timing}",
            tag="COMPLETE",
            params={
                "url": cache_context.display_url,
                "status": True,
                "timing": f"{time.perf_counter() - context['start_time']:.2f}s",
            },
            colors={"status": "green", "timing": "yellow"},
        )

        cached_result.success = bool(html)
        cached_result.session_id = getattr(config, "session_id", None)
        cached_result.redirected_url = cached_result.redirected_url or url
        context["final_result"] = CrawlResultContainer(cached_result)
        return 1

    # Path 3: build a fresh result from everything earlier stages collected.
    try:
        cleaned_html = context.get("cleaned_html", "")
        markdown_result = context.get("markdown_result")
        media = context.get("media", {})
        links = context.get("links", {})
        metadata = context.get("metadata", {})
        screenshot_data = context.get("screenshot_data")
        pdf_data = context.get("pdf_data")
        extracted_content = context.get("extracted_content")
        async_response = context.get("async_response")

        crawl_result = CrawlResult(
            url=url,
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown_result,
            media=media,
            links=links,
            metadata=metadata,
            screenshot=screenshot_data,
            pdf=pdf_data,
            extracted_content=extracted_content,
            success=bool(html),
            error_message="",
        )

        # Copy transport-level details when the fetch produced a response.
        if async_response:
            crawl_result.status_code = async_response.status_code
            crawl_result.redirected_url = async_response.redirected_url or url
            crawl_result.response_headers = async_response.response_headers
            crawl_result.downloaded_files = async_response.downloaded_files
            crawl_result.js_execution_result = context.get("js_execution_result")
            crawl_result.ssl_certificate = async_response.ssl_certificate

        crawl_result.session_id = getattr(config, "session_id", None)

        logger.success(
            message="{url:.50}... | Status: {status} | Total: {timing}",
            tag="COMPLETE",
            params={
                "url": cache_context.display_url,
                "status": crawl_result.success,
                "timing": f"{time.perf_counter() - context['start_time']:.2f}s",
            },
            colors={
                "status": "green" if crawl_result.success else "red",
                "timing": "yellow",
            },
        )

        # Persist only when allowed and when this was not already a cache hit.
        if cache_context.should_write() and not bool(cached_result):
            await async_db_manager.acache_url(crawl_result)

        context["final_result"] = CrawlResultContainer(crawl_result)
        return 1
    except Exception as e:
        error_context = get_error_context(sys.exc_info())

        error_message = (
            f"Unexpected error in build_result at line {error_context['line_no']} "
            f"in {error_context['function']} ({error_context['filename']}):\n"
            f"Error: {str(e)}\n\n"
            f"Code context:\n{error_context['code_context']}"
        )

        logger.error_status(
            url=url,
            error=create_box_message(error_message, type="error"),
            tag="ERROR",
        )

        context["final_result"] = CrawlResultContainer(
            CrawlResult(
                url=url, html="", success=False, error_message=error_message
            )
        )
        return 1
|
||||
|
||||
|
||||
async def handle_error_middleware(context: Dict[str, Any]) -> Dict[str, Any]:
    """Pipeline error handler: log the failure and install a failed result.

    Returns the (mutated) context so the pipeline can hand it straight back
    to the caller.
    """
    failed_url = context.get("url", "")
    message = context.get("error_message", "Unknown error")
    log = context.get("logger")

    # Logging is best-effort: early failures may happen before a logger
    # has been placed in the context.
    if log:
        log.error_status(
            url=failed_url,
            error=create_box_message(message, type="error"),
            tag="ERROR",
        )

    # Wrap a failure CrawlResult so callers always receive a container.
    context["final_result"] = CrawlResultContainer(
        CrawlResult(
            url=failed_url, html="", success=False, error_message=message
        )
    )

    return context
|
||||
|
||||
|
||||
# Custom middlewares as requested
|
||||
|
||||
async def sentiment_analysis_middleware(context: Dict[str, Any]) -> int:
    """Attach a TextBlob sentiment summary for the generated markdown.

    Stores polarity, subjectivity and a coarse classification under
    ``context["sentiment_analysis"]``. Failures -- including a missing
    ``textblob`` package -- never fail the pipeline; the error text is
    recorded in ``context["sentiment_analysis_error"]`` instead.
    Always returns 1.
    """
    markdown_result = context.get("markdown_result")

    # Skip if no markdown or the pipeline already failed.
    if not markdown_result or not context.get("success", True):
        return 1

    try:
        # FIX: imported lazily inside the try. textblob is an optional
        # third-party dependency; the original top-level import made this
        # middleware raise ImportError even on the skip path when the
        # package was absent. Now a missing package degrades to the
        # recorded-error path below.
        from textblob import TextBlob

        # Analyze the raw markdown text.
        blob = TextBlob(markdown_result.raw_markdown)
        sentiment = blob.sentiment

        context["sentiment_analysis"] = {
            "polarity": sentiment.polarity,  # -1.0 to 1.0 (negative to positive)
            "subjectivity": sentiment.subjectivity,  # 0.0 to 1.0 (objective to subjective)
            "classification": "positive" if sentiment.polarity > 0.1 else
                              "negative" if sentiment.polarity < -0.1 else "neutral"
        }

        return 1
    except Exception as e:
        # Sentiment analysis is best-effort: record the error and move on.
        context["sentiment_analysis_error"] = str(e)
        return 1
|
||||
|
||||
|
||||
async def log_timing_middleware(context: Dict[str, Any], name: str) -> int:
    """Record a timing mark for *name* and, when a matching start mark
    exists, compute and log the elapsed duration.

    Timing is purely diagnostic; always returns 1.
    """
    context[f"_timing_mark_{name}"] = time.perf_counter()

    # FIX: `duration` only exists when a start mark was recorded earlier,
    # so every use of it (including the log call) stays inside this guard
    # to avoid a NameError on the no-start path.
    start_key = f"_timing_start_{name}"
    if start_key in context:
        duration = context[f"_timing_mark_{name}"] - context[start_key]
        context[f"_timing_duration_{name}"] = duration

        # Log the timing if we have a logger.
        logger = context.get("logger")
        if logger:
            logger.info(
                message="{name} completed in {duration:.2f}s",
                tag="TIMING",
                params={"name": name, "duration": duration},
            )

    return 1
|
||||
|
||||
|
||||
async def validate_url_middleware(context: Dict[str, Any], patterns: List[str]) -> int:
    """Allow the crawl only when the URL matches one of *patterns*.

    Patterns are shell-style globs (fnmatch). An empty pattern list means
    "no restriction". Returns 1 when allowed, 0 (with an error message in
    the context) when the URL matches nothing.
    """
    import fnmatch

    target = context.get("url", "")

    # No patterns -> everything is allowed.
    if not patterns:
        return 1

    if any(fnmatch.fnmatch(target, glob) for glob in patterns):
        return 1

    context["error_message"] = f"URL '{target}' does not match any allowed patterns"
    return 0
|
||||
|
||||
|
||||
def create_default_middleware_list():
    """Return the default ordered middleware chain for the crawl pipeline."""
    return [
        initialize_context_middleware,
        check_cache_middleware,
        # The browser hub must be ready before any content is fetched.
        browser_hub_middleware,
        configure_proxy_middleware,
        check_robots_txt_middleware,
        fetch_content_middleware,
        scrape_content_middleware,
        generate_markdown_middleware,
        extract_structured_content_middleware,
        format_html_middleware,
        build_result_middleware,
    ]
|
||||
297
crawl4ai/pipeline/pipeline.py
Normal file
297
crawl4ai/pipeline/pipeline.py
Normal file
@@ -0,0 +1,297 @@
|
||||
|
||||
import time
|
||||
import asyncio
|
||||
from typing import Callable, Dict, List, Any, Optional, Awaitable, Union, TypedDict, Tuple, Coroutine
|
||||
|
||||
from .middlewares import create_default_middleware_list, handle_error_middleware
|
||||
from crawl4ai.models import CrawlResultContainer, CrawlResult
|
||||
from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
|
||||
class CrawlSpec(TypedDict, total=False):
    """Specification for a single crawl operation in batch_crawl.

    All keys are optional (``total=False``).
    """
    url: str                                 # target address to crawl
    config: Optional[CrawlerRunConfig]       # per-crawl run settings
    browser_config: Optional[BrowserConfig]  # browser override for this crawl
|
||||
|
||||
class BatchStatus(TypedDict, total=False):
    """Status information for batch crawl operations.

    All keys are optional (``total=False``).
    """
    total: int        # crawls submitted in the batch
    processed: int    # crawls finished (succeeded + failed)
    succeeded: int    # crawls that completed successfully
    failed: int       # crawls that failed
    in_progress: int  # crawls currently running
    duration: float   # elapsed seconds for the batch
|
||||
|
||||
class Pipeline:
    """
    A pipeline processor that executes a series of async middleware functions.

    Each middleware function receives a shared context dictionary, updates
    it in place, and returns 1 for success or 0 for failure. On failure (or
    an exception) the configured error handler produces the final context.
    """

    def __init__(
        self,
        middleware: List[Callable[[Dict[str, Any]], Awaitable[int]]] = None,
        error_handler: Optional[Callable[[Dict[str, Any]], Awaitable[Dict[str, Any]]]] = None,
        after_middleware_callback: Optional[Callable[[str, Dict[str, Any]], Awaitable[None]]] = None,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        browser_config: Optional[BrowserConfig] = None,
        logger: Optional[AsyncLogger] = None,
        _initial_context: Optional[Dict[str, Any]] = None
    ):
        """Build a pipeline.

        Args:
            middleware: Ordered middleware chain; defaults to the standard list.
            error_handler: Async handler invoked when a middleware fails or raises.
            after_middleware_callback: Optional hook called after each stage.
            crawler_strategy: Strategy used to fetch pages; a Playwright
                strategy built from ``browser_config`` is created when omitted.
            browser_config: Browser settings; defaults to ``BrowserConfig()``.
            logger: Logger; created from ``browser_config.verbose`` when omitted.
            _initial_context: Base context merged into every ``process()`` call.
        """
        self.middleware = middleware or create_default_middleware_list()
        self.error_handler = error_handler or handle_error_middleware
        self.after_middleware_callback = after_middleware_callback
        self.browser_config = browser_config or BrowserConfig()
        self.logger = logger or AsyncLogger(verbose=self.browser_config.verbose)
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
            browser_config=self.browser_config,
            logger=self.logger
        )
        # FIX: default to an empty dict so process() and
        # _initialize_strategy() never trip over a None base context
        # (the original raised TypeError when _initial_context was None).
        self._initial_context = _initial_context or {}
        self._strategy_initialized = False

    async def _initialize_strategy__(self):
        """Legacy initializer: eagerly create and enter the Playwright
        strategy. Kept for compatibility; superseded by
        ``_initialize_strategy``."""
        if not self.crawler_strategy:
            self.crawler_strategy = AsyncPlaywrightCrawlerStrategy(
                browser_config=self.browser_config,
                logger=self.logger
            )

        if not self._strategy_initialized:
            await self.crawler_strategy.__aenter__()
            self._strategy_initialized = True

    async def _initialize_strategy(self):
        """Initialize the crawler strategy if not already initialized.

        With the middleware-based design the strategy is created on demand
        in fetch_content_middleware, and the browser hub (when needed) is
        set up by browser_hub_middleware -- so there is nothing to
        construct here.
        """
        # A browser_config without a browser_hub is fine: the
        # browser_hub_middleware will create one when required.
        if "browser_hub" not in self._initial_context:
            pass

        # Mark as initialized to prevent repeated initialization attempts.
        self._strategy_initialized = True

    async def start(self):
        """Start the crawler strategy and prepare it for use."""
        if not self._strategy_initialized:
            await self._initialize_strategy()
            self._strategy_initialized = True
            if self.crawler_strategy:
                await self.crawler_strategy.__aenter__()
                self._strategy_initialized = True
            else:
                raise ValueError("Crawler strategy is not initialized.")

    async def close(self):
        """Close the crawler strategy and clean up resources."""
        await self.stop()

    async def stop(self):
        """Close the crawler strategy and clean up resources."""
        if self._strategy_initialized and self.crawler_strategy:
            await self.crawler_strategy.__aexit__(None, None, None)
            self._strategy_initialized = False

    async def __aenter__(self):
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def crawl(self, url: str, config: Optional[CrawlerRunConfig] = None, **kwargs) -> CrawlResultContainer:
        """
        Crawl a URL and process it through the pipeline.

        Args:
            url: The URL to crawl
            config: Optional configuration for the crawl
            **kwargs: Additional arguments to pass to the middleware

        Returns:
            CrawlResultContainer: The result of the crawl
        """
        # Initialize strategy if needed
        await self._initialize_strategy()

        # Seed the per-crawl context consumed by the middleware chain.
        context = {
            "url": url,
            "config": config or CrawlerRunConfig(),
            "browser_config": self.browser_config,
            "logger": self.logger,
            "crawler_strategy": self.crawler_strategy,
            "kwargs": kwargs
        }

        result_context = await self.process(context)

        # build_result_middleware stores the wrapped result here.
        return result_context.get("final_result")

    async def process(self, initial_context: Dict[str, Any] = None) -> Dict[str, Any]:
        """
        Process all middleware functions with the given context.

        Args:
            initial_context: Initial context dictionary, defaults to empty dict

        Returns:
            Updated context dictionary after all middleware have been processed
        """
        # FIX: copy defensively and tolerate a None base context instead of
        # raising TypeError on ``{**None}``.
        context = dict(self._initial_context or {})
        if initial_context:
            context.update(initial_context)

        # Record pipeline start time
        context["_pipeline_start_time"] = time.perf_counter()

        for middleware_fn in self.middleware:
            # Get middleware name for logging
            middleware_name = getattr(middleware_fn, '__name__', str(middleware_fn))

            # Record start time for this middleware
            start_time = time.perf_counter()
            context[f"_timing_start_{middleware_name}"] = start_time

            try:
                # Execute middleware (all middleware functions are async)
                result = await middleware_fn(context)

                # Record completion time
                end_time = time.perf_counter()
                context[f"_timing_end_{middleware_name}"] = end_time
                context[f"_timing_duration_{middleware_name}"] = end_time - start_time

                # Execute after-middleware callback if provided
                if self.after_middleware_callback:
                    await self.after_middleware_callback(middleware_name, context)

                # Convert boolean returns to int (True->1, False->0)
                if isinstance(result, bool):
                    result = 1 if result else 0

                # Handle failure
                if result == 0:
                    if self.error_handler:
                        context["_error_in"] = middleware_name
                        context["_error_at"] = time.perf_counter()
                        return await self._handle_error(context)
                    else:
                        context["success"] = False
                        context["error_message"] = f"Pipeline failed at {middleware_name}"
                        break
            except Exception as e:
                # Record error information
                context["_error_in"] = middleware_name
                context["_error_at"] = time.perf_counter()
                context["_exception"] = e
                context["success"] = False
                context["error_message"] = f"Exception in {middleware_name}: {str(e)}"

                # Call error handler if available
                if self.error_handler:
                    return await self._handle_error(context)
                break

        # Record pipeline completion time
        pipeline_end_time = time.perf_counter()
        context["_pipeline_end_time"] = pipeline_end_time
        context["_pipeline_duration"] = pipeline_end_time - context["_pipeline_start_time"]

        # Set success to True if not already set (no failures)
        if "success" not in context:
            context["success"] = True

        return context

    async def _handle_error(self, context: Dict[str, Any]) -> Dict[str, Any]:
        """Handle errors by calling the error handler."""
        try:
            return await self.error_handler(context)
        except Exception as e:
            # If the error handler itself fails, record the new error.
            context["_error_handler_exception"] = e
            context["error_message"] = f"Error handler failed: {str(e)}"
            return context
|
||||
|
||||
|
||||
|
||||
async def create_pipeline(
    middleware_list=None,
    error_handler=None,
    after_middleware_callback=None,
    browser_config=None,
    browser_hub_id=None,
    browser_hub_connection=None,
    browser_hub=None,
    logger=None
) -> Pipeline:
    """
    Factory function to create a pipeline with Browser-Hub integration.

    The browser-related arguments are not handed to the Pipeline
    constructor; they are placed in the pipeline's initial context so the
    browser_hub_middleware can pick them up on the first crawl.

    NOTE: declared async for call-site compatibility (callers await it)
    even though nothing is awaited here.

    Args:
        middleware_list: List of middleware functions
        error_handler: Error handler middleware
        after_middleware_callback: Callback after middleware execution
        browser_config: Configuration for the browser
        browser_hub_id: ID for browser hub instance
        browser_hub_connection: Connection string for existing browser hub
        browser_hub: Existing browser hub instance to use
        logger: Logger instance

    Returns:
        Pipeline: Configured pipeline instance
    """
    pipeline = Pipeline(
        middleware=middleware_list or create_default_middleware_list(),
        error_handler=error_handler,
        after_middleware_callback=after_middleware_callback,
        logger=logger,
    )

    # Seed the base context consumed by every process()/crawl() call.
    pipeline._initial_context = {
        "browser_config": browser_config,
        "browser_hub_id": browser_hub_id,
        "browser_hub_connection": browser_hub_connection,
        "browser_hub": browser_hub,
        "logger": logger,
    }

    return pipeline
|
||||
|
||||
|
||||
|
||||
|
||||
# async def create_pipeline(
|
||||
# middleware_list: Optional[List[Callable[[Dict[str, Any]], Awaitable[int]]]] = None,
|
||||
# error_handler: Optional[Callable[[Dict[str, Any]], Awaitable[Dict[str, Any]]]] = None,
|
||||
# after_middleware_callback: Optional[Callable[[str, Dict[str, Any]], Awaitable[None]]] = None,
|
||||
# crawler_strategy = None,
|
||||
# browser_config = None,
|
||||
# logger = None
|
||||
# ) -> Pipeline:
|
||||
# """Factory function to create a pipeline with the given middleware"""
|
||||
# return Pipeline(
|
||||
# middleware=middleware_list,
|
||||
# error_handler=error_handler,
|
||||
# after_middleware_callback=after_middleware_callback,
|
||||
# crawler_strategy=crawler_strategy,
|
||||
# browser_config=browser_config,
|
||||
# logger=logger
|
||||
# )
|
||||
109
crawl4ai/pipeline/test_pipeline.py
Normal file
109
crawl4ai/pipeline/test_pipeline.py
Normal file
@@ -0,0 +1,109 @@
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode,
|
||||
DefaultMarkdownGenerator,
|
||||
PruningContentFilter
|
||||
)
|
||||
from pipeline import Pipeline
|
||||
|
||||
async def main():
    """Demo: crawl example.com through the Pipeline context manager."""
    # Create configuration objects.
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            )
        ),
    )

    # The context manager starts and stops the underlying crawler strategy.
    async with Pipeline(browser_config=browser_config) as pipeline:
        result = await pipeline.crawl(
            url="https://www.example.com",
            config=crawler_config
        )

        # Print a short summary of the crawl.
        print(f"URL: {result.url}")
        print(f"Success: {result.success}")

        if result.success:
            print("\nMarkdown excerpt:")
            print(result.markdown.raw_markdown[:500] + "...")
        else:
            print(f"Error: {result.error_message}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
|
||||
|
||||
class CrawlTarget:
    """Pair of URL(s) and an optional run configuration for batch crawls."""

    def __init__(self, urls, config=None):
        # urls may be a single URL string or a list of URLs; config is
        # presumably a CrawlerRunConfig -- None means "use defaults".
        self.urls = urls
        self.config = config

    def __repr__(self):
        return f"CrawlTarget(urls={self.urls}, config={self.config})"
|
||||
|
||||
|
||||
|
||||
|
||||
# async def main():
|
||||
# # Create configuration objects
|
||||
# browser_config = BrowserConfig(headless=True, verbose=True)
|
||||
|
||||
# # Define different configurations
|
||||
# config1 = CrawlerRunConfig(
|
||||
# cache_mode=CacheMode.BYPASS,
|
||||
# markdown_generator=DefaultMarkdownGenerator(
|
||||
# content_filter=PruningContentFilter(threshold=0.48)
|
||||
# ),
|
||||
# )
|
||||
|
||||
# config2 = CrawlerRunConfig(
|
||||
# cache_mode=CacheMode.ENABLED,
|
||||
# screenshot=True,
|
||||
# pdf=True
|
||||
# )
|
||||
|
||||
# # Create crawl targets
|
||||
# targets = [
|
||||
# CrawlTarget(
|
||||
# urls=["https://www.example.com", "https://www.wikipedia.org"],
|
||||
# config=config1
|
||||
# ),
|
||||
# CrawlTarget(
|
||||
# urls="https://news.ycombinator.com",
|
||||
# config=config2
|
||||
# ),
|
||||
# CrawlTarget(
|
||||
# urls=["https://github.com", "https://stackoverflow.com", "https://python.org"],
|
||||
# config=None
|
||||
# )
|
||||
# ]
|
||||
|
||||
# # Create and use pipeline with context manager
|
||||
# async with Pipeline(browser_config=browser_config) as pipeline:
|
||||
# all_results = await pipeline.crawl_batch(targets)
|
||||
|
||||
# for target_key, results in all_results.items():
|
||||
# print(f"\n===== Results for {target_key} =====")
|
||||
# print(f"Number of URLs crawled: {len(results)}")
|
||||
|
||||
# for i, result in enumerate(results):
|
||||
# print(f"\nURL {i+1}: {result.url}")
|
||||
# print(f"Success: {result.success}")
|
||||
|
||||
# if result.success:
|
||||
# print(f"Content length: {len(result.markdown.raw_markdown)} chars")
|
||||
# else:
|
||||
# print(f"Error: {result.error_message}")
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# asyncio.run(main())
|
||||
222
tests/pipeline/demo_browser_hub_pipeline.py
Normal file
222
tests/pipeline/demo_browser_hub_pipeline.py
Normal file
@@ -0,0 +1,222 @@
|
||||
# demo_browser_hub.py
|
||||
|
||||
import asyncio
|
||||
from typing import List
|
||||
|
||||
from crawl4ai.browser.browser_hub import BrowserHub
|
||||
from pipeline import create_pipeline
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from crawl4ai.models import CrawlResultContainer
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
from crawl4ai import PruningContentFilter
|
||||
|
||||
async def create_prewarmed_browser_hub(urls_to_crawl: List[str]):
    """Create a pre-warmed browser hub (demo: one browser, one page)."""
    # Set up logging.
    logger = AsyncLogger(verbose=True)
    logger.info("Setting up pre-warmed browser hub", tag="DEMO")

    # Browser configuration shared by every pre-warmed page.
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,  # Set to False to see the browsers in action
        viewport_width=1280,
        viewport_height=800,
        light_mode=True,  # Optimize for performance
        java_script_enabled=True
    )

    # Pre-warm pages under different user agents so pages are ready for
    # different scenarios (only one config is enabled in this demo).
    crawler_configs = [
        CrawlerRunConfig(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            wait_until="networkidle"
        ),
    ]

    # Number of browsers and pages per browser.
    num_browsers = 1
    pages_per_browser = 1

    # Spread the page budget evenly across the crawler configs; the last
    # config absorbs any remainder from the integer division.
    page_configs = []
    total_pages = num_browsers * pages_per_browser
    pages_per_config = total_pages // len(crawler_configs)

    for idx, cfg in enumerate(crawler_configs):
        if idx == len(crawler_configs) - 1:
            remaining = total_pages - (pages_per_config * (len(crawler_configs) - 1))
            page_configs.append((browser_config, cfg, remaining))
        else:
            page_configs.append((browser_config, cfg, pages_per_config))

    # Create the browser hub with pre-warmed pages and time the startup.
    start_time = asyncio.get_event_loop().time()
    logger.info("Initializing browser hub with pre-warmed pages...", tag="DEMO")

    hub = await BrowserHub.get_browser_manager(
        config=browser_config,
        hub_id="demo_hub",
        logger=logger,
        max_browsers_per_config=num_browsers,
        max_pages_per_browser=pages_per_browser,
        initial_pool_size=num_browsers,
        page_configs=page_configs
    )

    end_time = asyncio.get_event_loop().time()
    logger.success(
        message="Browser hub initialized with {total_pages} pre-warmed pages in {duration:.2f} seconds",
        tag="DEMO",
        params={
            "total_pages": total_pages,
            "duration": end_time - start_time
        }
    )

    # Get and display pool status.
    status = await hub.get_pool_status()
    logger.info(
        message="Browser pool status: {status}",
        tag="DEMO",
        params={"status": status}
    )

    return hub
|
||||
|
||||
async def crawl_urls_with_hub(hub, urls: List[str]) -> List[CrawlResultContainer]:
    """Crawl *urls* concurrently through a pipeline backed by *hub*."""
    logger = AsyncLogger(verbose=True)

    # One run configuration shared by every URL in the batch.
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            )
        ),
        wait_until="networkidle",
        screenshot=True
    )

    # Build a pipeline wired to the pre-warmed browser hub.
    pipeline = await create_pipeline(
        browser_hub=hub,
        logger=logger
    )

    async def crawl_url(url):
        """Crawl a single URL and return its result container."""
        logger.info(f"Crawling {url}...", tag="CRAWL")
        result = await pipeline.crawl(url=url, config=crawler_config)
        logger.success(f"Completed crawl of {url}", tag="CRAWL")
        return result

    # Fan out one task per URL and gather results in submission order.
    tasks = [crawl_url(url) for url in urls]
    results = await asyncio.gather(*tasks)

    return results
|
||||
|
||||
async def main():
    """Demo entry point.

    Pre-warms a browser hub, crawls the demo URLs in parallel, reports
    timing and per-URL results, and always shuts the hub down afterwards.
    """
    # List of URLs to crawl (extras left commented out for quick experiments).
    urls_to_crawl = [
        "https://example.com",
        # "https://www.python.org",
        # "https://httpbin.org/html",
        # "https://news.ycombinator.com",
        # "https://github.com",
        # "https://pypi.org",
        # "https://docs.python.org/3/",
        # "https://opensource.org",
        # "https://whatismyipaddress.com",
        # "https://en.wikipedia.org/wiki/Web_scraping"
    ]

    # Set up logging
    logger = AsyncLogger(verbose=True)
    logger.info("Starting browser hub demo", tag="DEMO")

    try:
        # Create pre-warmed browser hub
        hub = await create_prewarmed_browser_hub(urls_to_crawl)

        # Use hub to crawl URLs
        logger.info("Crawling URLs in parallel...", tag="DEMO")
        start_time = asyncio.get_event_loop().time()

        results = await crawl_urls_with_hub(hub, urls_to_crawl)

        end_time = asyncio.get_event_loop().time()

        duration = end_time - start_time
        count = len(results)
        logger.success(
            message="Crawled {count} URLs in {duration:.2f} seconds (average: {avg:.2f} seconds per URL)",
            tag="DEMO",
            params={
                "count": count,
                "duration": duration,
                # Guard against ZeroDivisionError when the URL list is empty.
                "avg": duration / count if count else 0.0,
            }
        )

        # Print summary of results
        logger.info("Crawl results summary:", tag="DEMO")
        for i, result in enumerate(results):
            logger.info(
                message="{idx}. {url}: Success={success}, Content length={length}",
                tag="RESULT",
                params={
                    "idx": i + 1,
                    "url": result.url,
                    "success": result.success,
                    "length": len(result.html) if result.html else 0
                }
            )

            if result.success and result.markdown and result.markdown.raw_markdown:
                # Print a snippet of the markdown; add an ellipsis only
                # when the text was actually truncated.
                raw = result.markdown.raw_markdown
                markdown_snippet = raw[:150] + ("..." if len(raw) > 150 else "")
                logger.info(
                    message="   Markdown: {snippet}",
                    tag="RESULT",
                    params={"snippet": markdown_snippet}
                )

        # Display final browser pool status
        status = await hub.get_pool_status()
        logger.info(
            message="Final browser pool status: {status}",
            tag="DEMO",
            params={"status": status}
        )

    finally:
        # Clean up every hub, even if the demo failed part-way through.
        logger.info("Shutting down browser hub...", tag="DEMO")
        await BrowserHub.shutdown_all()
        logger.success("Demo completed", tag="DEMO")
|
||||
|
||||
# Script entry point: run the async demo inside a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
=== New file: tests/pipeline/extended_browser_hub_tests.py (505 lines added) ===
|
||||
# extended_browser_hub_tests.py
|
||||
|
||||
import asyncio
|
||||
|
||||
from crawl4ai.browser.browser_hub import BrowserHub
|
||||
from pipeline import create_pipeline
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
|
||||
# Common test URLs
|
||||
TEST_URLS = [
|
||||
"https://example.com",
|
||||
"https://example.com/page1",
|
||||
"https://httpbin.org/html",
|
||||
"https://httpbin.org/headers",
|
||||
"https://httpbin.org/ip",
|
||||
"https://httpstat.us/200"
|
||||
]
|
||||
|
||||
class TestResults:
    """Accumulates the outcome of one test scenario: timing, results, errors."""

    def __init__(self, name: str):
        self.name = name          # scenario label used in summaries
        self.results = []         # result objects exposing a `.success` flag
        self.start_time = None    # event-loop timestamps (seconds)
        self.end_time = None
        self.errors = []          # exceptions captured while the scenario ran

    @property
    def duration(self) -> float:
        """Elapsed seconds, or 0 when timing was never recorded."""
        return (self.end_time - self.start_time) if self.start_time and self.end_time else 0

    @property
    def success_rate(self) -> float:
        """Percentage of successful results (0 when there are none)."""
        if not self.results:
            return 0
        successes = sum(1 for r in self.results if r.success)
        return successes / len(self.results) * 100

    def log_summary(self, logger: AsyncLogger):
        """Emit a one-shot summary (duration, success rate, errors) to *logger*."""
        logger.info(f"=== Test: {self.name} ===", tag="SUMMARY")
        logger.info(
            message="Duration: {duration:.2f}s, Success rate: {success_rate:.1f}%, Results: {count}",
            tag="SUMMARY",
            params={
                "duration": self.duration,
                "success_rate": self.success_rate,
                "count": len(self.results),
            },
        )

        if not self.errors:
            return
        logger.error(
            message="Errors ({count}): {errors}",
            tag="SUMMARY",
            params={
                "count": len(self.errors),
                "errors": "; ".join(str(e) for e in self.errors),
            },
        )
|
||||
|
||||
# ======== TEST SCENARIO 1: Simple default configuration ========
|
||||
async def test_default_configuration():
    """
    Test Scenario 1: Simple default configuration

    The user supplies no browser configuration at all and relies on the
    pipeline's default auto-setup.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Default Configuration")

    try:
        # No browser config: the pipeline must bootstrap everything itself.
        pipeline = await create_pipeline(logger=logger)

        results.start_time = asyncio.get_event_loop().time()

        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="domcontentloaded",
        )

        # URLs are crawled one after another; per-URL failures are recorded
        # but do not abort the scenario.
        for url in TEST_URLS:
            try:
                logger.info(f"Crawling {url} with default configuration", tag="TEST")
                outcome = await pipeline.crawl(url=url, config=run_config)
                results.results.append(outcome)

                logger.success(
                    message="Result: url={url}, success={success}, content_length={length}",
                    tag="TEST",
                    params={
                        "url": url,
                        "success": outcome.success,
                        "length": len(outcome.html) if outcome.html else 0,
                    },
                )
            except Exception as exc:
                logger.error(f"Error crawling {url}: {str(exc)}", tag="TEST")
                results.errors.append(exc)

        results.end_time = asyncio.get_event_loop().time()

    except Exception as exc:
        logger.error(f"Test failed with error: {str(exc)}", tag="TEST")
        results.errors.append(exc)

    results.log_summary(logger)
    return results
|
||||
|
||||
# ======== TEST SCENARIO 2: Detailed custom configuration ========
|
||||
async def test_custom_configuration():
    """
    Test Scenario 2: Detailed custom configuration

    Exercises a pipeline built from an explicit BrowserConfig plus a rich
    CrawlerRunConfig (screenshots, full-page scan, iframe processing).
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Custom Configuration")

    try:
        # Explicit browser settings instead of the auto-detected defaults.
        browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            viewport_width=1920,
            viewport_height=1080,
            user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
            light_mode=True,
            ignore_https_errors=True,
            extra_args=["--disable-extensions"],
        )

        # Rich per-run settings: screenshots, full-page scan, iframes, etc.
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle",
            page_timeout=30000,
            screenshot=True,
            pdf=False,
            screenshot_wait_for=0.5,
            wait_for_images=True,
            scan_full_page=True,
            scroll_delay=0.2,
            process_iframes=True,
            remove_overlay_elements=True,
        )

        pipeline = await create_pipeline(
            browser_config=browser_config,
            logger=logger,
        )

        results.start_time = asyncio.get_event_loop().time()

        # Sequential crawl; individual failures are collected, not fatal.
        for url in TEST_URLS:
            try:
                logger.info(f"Crawling {url} with custom configuration", tag="TEST")
                outcome = await pipeline.crawl(url=url, config=run_config)
                results.results.append(outcome)

                logger.success(
                    message="Result: url={url}, success={success}, screenshot={screenshot}, content_length={length}",
                    tag="TEST",
                    params={
                        "url": url,
                        "success": outcome.success,
                        "screenshot": outcome.screenshot is not None,
                        "length": len(outcome.html) if outcome.html else 0,
                    },
                )
            except Exception as exc:
                logger.error(f"Error crawling {url}: {str(exc)}", tag="TEST")
                results.errors.append(exc)

        results.end_time = asyncio.get_event_loop().time()

        # Peek at the browser hub the pipeline created internally by running
        # a throwaway crawl of about:blank and reading the returned context.
        try:
            context = await pipeline.process({"url": "about:blank", "config": run_config})
            browser_hub = context.get("browser_hub")
            if browser_hub:
                status = await browser_hub.get_pool_status()
                logger.info(
                    message="Browser hub status: {status}",
                    tag="TEST",
                    params={"status": status},
                )
        except Exception as exc:
            logger.error(f"Failed to get browser hub status: {str(exc)}", tag="TEST")

    except Exception as exc:
        logger.error(f"Test failed with error: {str(exc)}", tag="TEST")
        results.errors.append(exc)

    results.log_summary(logger)
    return results
|
||||
|
||||
# ======== TEST SCENARIO 3: Using pre-initialized browser hub ========
|
||||
async def test_preinitalized_browser_hub():
    """
    Test Scenario 3: Using pre-initialized browser hub

    The hub is created up-front via BrowserHub.get_browser_manager and then
    handed to the pipeline; URLs are crawled in parallel against it.

    Returns:
        A ``(TestResults, hub)`` tuple so later scenarios can reuse the hub
        (the hub may be None if initialization itself failed).
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Pre-initialized Browser Hub")

    browser_hub = None
    try:
        logger.info("Initializing browser hub separately", tag="TEST")

        hub_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            verbose=True,
        )

        browser_hub = await BrowserHub.get_browser_manager(
            config=hub_config,
            hub_id="test_preinitalized",
            logger=logger,
            max_browsers_per_config=2,
            max_pages_per_browser=3,
            initial_pool_size=2,
        )

        status = await browser_hub.get_pool_status()
        logger.info(
            message="Initial browser hub status: {status}",
            tag="TEST",
            params={"status": status},
        )

        # The pipeline consumes the externally managed hub.
        pipeline = await create_pipeline(
            browser_hub=browser_hub,
            logger=logger,
        )

        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle",
            screenshot=True,
        )

        results.start_time = asyncio.get_event_loop().time()

        async def crawl_one(url):
            # Errors are recorded and mapped to None so gather() never raises.
            try:
                logger.info(f"Crawling {url} with pre-initialized hub", tag="TEST")
                outcome = await pipeline.crawl(url=url, config=run_config)
                logger.success(f"Completed crawl of {url}", tag="TEST")
                return outcome
            except Exception as exc:
                logger.error(f"Error crawling {url}: {str(exc)}", tag="TEST")
                results.errors.append(exc)
                return None

        gathered = await asyncio.gather(*(crawl_one(url) for url in TEST_URLS))
        results.results = [r for r in gathered if r is not None]

        results.end_time = asyncio.get_event_loop().time()

        status = await browser_hub.get_pool_status()
        logger.info(
            message="Final browser hub status: {status}",
            tag="TEST",
            params={"status": status},
        )

    except Exception as exc:
        logger.error(f"Test failed with error: {str(exc)}", tag="TEST")
        results.errors.append(exc)

    results.log_summary(logger)
    return results, browser_hub
|
||||
|
||||
# ======== TEST SCENARIO 4: Parallel pipelines sharing browser hub ========
|
||||
async def test_parallel_pipelines():
    """
    Test Scenario 4: Multiple parallel pipelines sharing browser hub

    Three pipelines share one browser hub and crawl disjoint URL batches
    concurrently, demonstrating resource sharing.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Parallel Pipelines")

    # Reuse the hub created by scenario 3.
    _, browser_hub = await test_preinitalized_browser_hub()

    try:
        # Three pipelines, one shared hub.
        pipelines = [
            await create_pipeline(browser_hub=browser_hub, logger=logger)
            for _ in range(3)
        ]

        logger.info(f"Created {len(pipelines)} pipelines sharing the same browser hub", tag="TEST")

        # Each pipeline gets its own crawl settings.
        configs = [
            CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False),
            CrawlerRunConfig(wait_until="networkidle", screenshot=True),
            CrawlerRunConfig(wait_until="load", scan_full_page=True),
        ]

        results.start_time = asyncio.get_event_loop().time()

        async def run_pipeline(idx, urls):
            # Crawl the batch sequentially within this pipeline; failures
            # are logged and collected but do not stop the batch.
            collected = []
            for url in urls:
                try:
                    logger.info(f"Pipeline {idx} crawling {url}", tag="TEST")
                    outcome = await pipelines[idx].crawl(
                        url=url,
                        config=configs[idx],
                    )
                    collected.append(outcome)
                    logger.success(
                        message="Pipeline {idx} completed: url={url}, success={success}",
                        tag="TEST",
                        params={
                            "idx": idx,
                            "url": url,
                            "success": outcome.success,
                        },
                    )
                except Exception as exc:
                    logger.error(
                        message="Pipeline {idx} error: {error}",
                        tag="TEST",
                        params={
                            "idx": idx,
                            "error": str(exc),
                        },
                    )
                    results.errors.append(exc)
            return collected

        # Split the URL list across the three pipelines.
        url_batches = [
            TEST_URLS[:2],
            TEST_URLS[2:4],
            TEST_URLS[4:5] * 2,  # duplicate the last URL so pipeline 3 gets two
        ]

        # All three pipelines run concurrently.
        per_pipeline = await asyncio.gather(
            *(run_pipeline(i, batch) for i, batch in enumerate(url_batches))
        )
        for chunk in per_pipeline:
            results.results.extend(chunk)

        results.end_time = asyncio.get_event_loop().time()

        status = await browser_hub.get_pool_status()
        logger.info(
            message="Browser hub status after parallel pipelines: {status}",
            tag="TEST",
            params={"status": status},
        )

    except Exception as exc:
        logger.error(f"Test failed with error: {str(exc)}", tag="TEST")
        results.errors.append(exc)

    results.log_summary(logger)
    return results
|
||||
|
||||
# ======== TEST SCENARIO 5: Browser hub with connection string ========
|
||||
async def test_connection_string():
    """
    Test Scenario 5: Browser hub with connection string

    Initializes the hub from a connection string, simulating attachment to
    an already-running browser hub service.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Connection String")

    try:
        # In a real deployment this would point at a live service; here it
        # is a simulated endpoint.
        connection_string = "localhost:9222"  # Simulated connection string

        pipeline = await create_pipeline(
            browser_hub_connection=connection_string,
            logger=logger,
        )

        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle",
        )

        results.start_time = asyncio.get_event_loop().time()

        # A single URL is enough to prove the connection path works.
        url = TEST_URLS[0]
        try:
            logger.info(f"Crawling {url} with connection string hub", tag="TEST")
            outcome = await pipeline.crawl(url=url, config=run_config)
            results.results.append(outcome)

            logger.success(
                message="Result: url={url}, success={success}, content_length={length}",
                tag="TEST",
                params={
                    "url": url,
                    "success": outcome.success,
                    "length": len(outcome.html) if outcome.html else 0,
                },
            )
        except Exception as exc:
            logger.error(f"Error crawling {url}: {str(exc)}", tag="TEST")
            results.errors.append(exc)

        results.end_time = asyncio.get_event_loop().time()

    except Exception as exc:
        logger.error(f"Test failed with error: {str(exc)}", tag="TEST")
        results.errors.append(exc)

    results.log_summary(logger)
    return results
|
||||
|
||||
# ======== RUN ALL TESTS ========
|
||||
async def run_all_tests():
    """Run the test scenarios and always shut down shared browser hubs."""
    logger = AsyncLogger(verbose=True)
    logger.info("=== STARTING BROWSER HUB TESTS ===", tag="MAIN")

    try:
        await test_default_configuration()
        # The remaining scenarios are opt-in; uncomment to run them.
        # await test_custom_configuration()
        # await test_preinitalized_browser_hub()
        # await test_parallel_pipelines()
        # await test_connection_string()
    except Exception as exc:
        logger.error(f"Test suite failed: {str(exc)}", tag="MAIN")
    finally:
        # Hubs are process-global, so tear them all down regardless of outcome.
        logger.info("Shutting down all browser hubs...", tag="MAIN")
        await BrowserHub.shutdown_all()
        logger.success("All tests completed", tag="MAIN")
|
||||
|
||||
# Script entry point: run the whole suite inside a fresh event loop.
if __name__ == "__main__":
    asyncio.run(run_all_tests())
|
||||
=== New file: tests/pipeline/test_batch_crawl.py (163 lines added) ===
|
||||
"""Test the Crawler class for batch crawling capabilities."""
|
||||
|
||||
import asyncio
|
||||
import pytest
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
|
||||
from crawl4ai import Crawler
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from crawl4ai.models import CrawlResult, CrawlResultContainer
|
||||
from crawl4ai.browser import BrowserHub
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
|
||||
# Test URLs for crawling
|
||||
SAFE_URLS = [
|
||||
"https://example.com",
|
||||
"https://httpbin.org/html",
|
||||
"https://httpbin.org/headers",
|
||||
"https://httpbin.org/ip",
|
||||
"https://httpbin.org/user-agent",
|
||||
"https://httpstat.us/200",
|
||||
"https://jsonplaceholder.typicode.com/posts/1",
|
||||
"https://jsonplaceholder.typicode.com/comments/1",
|
||||
"https://iana.org",
|
||||
"https://www.python.org"
|
||||
]
|
||||
|
||||
# Simple test for batch crawling
|
||||
@pytest.mark.asyncio
async def test_batch_crawl_simple():
    """Crawler.crawl with a URL list returns a url -> result mapping."""
    urls = SAFE_URLS[:3]

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded",
    )

    results = await Crawler.crawl(
        urls,
        crawler_config=run_config,
    )

    # One entry per input URL, each successful with non-empty HTML.
    assert isinstance(results, dict)
    assert len(results) == len(urls)

    for url in urls:
        assert url in results
        assert results[url].success
        assert results[url].html is not None
|
||||
|
||||
# Test parallel batch crawling
|
||||
@pytest.mark.asyncio
async def test_parallel_batch_crawl():
    """Crawler.parallel_crawl handles several URLs concurrently."""
    urls = SAFE_URLS[:5]

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded",
    )

    # Time the whole parallel batch.
    start_time = asyncio.get_event_loop().time()
    results = await Crawler.parallel_crawl(
        urls,
        crawler_config=run_config,
    )
    end_time = asyncio.get_event_loop().time()

    assert len(results) == len(urls)
    successful = sum(1 for r in results.values() if r.success)

    print(f"Parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s")
    print(f"Success rate: {successful}/{len(urls)}")

    # At least 80% should succeed
    assert successful / len(urls) >= 0.8
|
||||
|
||||
# Test batch crawling with different configurations
|
||||
@pytest.mark.asyncio
async def test_batch_crawl_mixed_configs():
    """Different (urls, config) batches keep their per-batch behavior."""
    batch1 = (SAFE_URLS[:2], CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False))
    batch2 = (SAFE_URLS[2:4], CrawlerRunConfig(wait_until="networkidle", screenshot=True))

    start_time = asyncio.get_event_loop().time()
    results = await Crawler.parallel_crawl([batch1, batch2])
    end_time = asyncio.get_event_loop().time()

    all_urls = batch1[0] + batch2[0]
    assert len(results) == len(all_urls)

    # Screenshots must appear only where the batch config requested them.
    for url in batch1[0]:
        assert results[url].screenshot is None
    for url in batch2[0]:
        assert results[url].screenshot is not None

    print(f"Mixed-config parallel crawl of {len(all_urls)} URLs completed in {end_time - start_time:.2f}s")
|
||||
|
||||
# Test shared browser hub
|
||||
@pytest.mark.asyncio
async def test_batch_crawl_shared_hub():
    """Parallel crawl through an externally managed BrowserHub."""
    hub_config = BrowserConfig(
        browser_type="chromium",
        headless=True,
    )

    browser_hub = await BrowserHub.get_browser_manager(
        config=hub_config,
        max_browsers_per_config=3,
        max_pages_per_browser=4,
        initial_pool_size=1,
    )

    try:
        urls = SAFE_URLS[:3]

        start_time = asyncio.get_event_loop().time()
        results = await Crawler.parallel_crawl(
            urls,
            browser_hub=browser_hub,
            crawler_config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                wait_until="domcontentloaded",
            ),
        )
        end_time = asyncio.get_event_loop().time()

        assert len(results) == len(urls)
        successful = sum(1 for r in results.values() if r.success)

        print(f"Shared hub parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s")
        print(f"Success rate: {successful}/{len(urls)}")

        hub_stats = await browser_hub.get_pool_status()
        print(f"Browser hub stats: {hub_stats}")

        # At least 80% should succeed
        assert successful / len(urls) >= 0.8

    finally:
        # The hub is test-owned, so its browsers must be released here.
        await browser_hub.close()
|
||||
=== New file: tests/pipeline/test_crawler.py (447 lines added) ===
|
||||
# test_crawler.py
|
||||
import asyncio
|
||||
import warnings
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from typing import Optional, Tuple
|
||||
|
||||
# Define test fixtures
|
||||
@pytest_asyncio.fixture
async def clean_browser_hub():
    """Fixture to ensure clean browser hub state between tests."""
    # Run the test body first ...
    yield

    # ... then tear down every hub the test may have created. The import is
    # deferred so merely collecting the fixture never touches the browser stack.
    from crawl4ai.browser import BrowserHub

    try:
        await BrowserHub.shutdown_all()
    except Exception as e:
        print(f"Error during browser cleanup: {e}")
|
||||
|
||||
from crawl4ai import Crawler
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from crawl4ai.models import CrawlResultContainer
|
||||
from crawl4ai.browser import BrowserHub
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
|
||||
import warnings
|
||||
from pydantic import PydanticDeprecatedSince20
|
||||
|
||||
|
||||
|
||||
# Test URLs for crawling
|
||||
SAFE_URLS = [
|
||||
"https://example.com",
|
||||
"https://httpbin.org/html",
|
||||
"https://httpbin.org/headers",
|
||||
"https://httpbin.org/ip",
|
||||
"https://httpbin.org/user-agent",
|
||||
"https://httpstat.us/200",
|
||||
"https://jsonplaceholder.typicode.com/posts/1",
|
||||
"https://jsonplaceholder.typicode.com/comments/1",
|
||||
"https://iana.org",
|
||||
"https://www.python.org",
|
||||
]
|
||||
|
||||
|
||||
class TestCrawlerBasic:
    """Basic tests for the Crawler utility class"""

    @pytest.mark.asyncio
    async def test_simple_crawl_single_url(self, clean_browser_hub):
        """Test crawling a single URL with default configuration"""
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=Warning)
            # Basic logger (constructed to mirror real usage, even though the
            # default crawl path does not need it explicitly).
            logger = AsyncLogger(verbose=True)

            target = "https://example.com"
            result = await Crawler.crawl(target)

            # A single-URL crawl yields one container, not a dict.
            assert isinstance(result, CrawlResultContainer)
            assert result.success
            assert result.url == target
            assert result.html is not None
            assert len(result.html) > 0

    @pytest.mark.asyncio
    async def test_crawl_with_custom_config(self, clean_browser_hub):
        """Test crawling with custom browser and crawler configuration"""
        target = "https://httpbin.org/html"

        # Inline both configs at the call site.
        result = await Crawler.crawl(
            target,
            browser_config=BrowserConfig(
                browser_type="chromium",
                headless=True,
                viewport_width=1280,
                viewport_height=800,
            ),
            crawler_config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                wait_until="networkidle",
                screenshot=True,
            ),
        )

        assert result.success
        assert result.url == target
        assert result.screenshot is not None

    @pytest.mark.asyncio
    async def test_crawl_multiple_urls_sequential(self, clean_browser_hub):
        """Test crawling multiple URLs sequentially"""
        urls = SAFE_URLS[:3]

        results = await Crawler.crawl(
            urls,
            crawler_config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                wait_until="domcontentloaded",
            ),
        )

        # A list input yields a url -> result mapping.
        assert isinstance(results, dict)
        assert len(results) == len(urls)
        for url in urls:
            assert url in results
            assert results[url].success
            assert results[url].html is not None

    @pytest.mark.asyncio
    async def test_crawl_with_error_handling(self, clean_browser_hub):
        """Test error handling during crawling"""
        good = "https://example.com"
        bad = "https://non-existent-domain-123456789.com"

        results = await Crawler.crawl([good, bad], max_retries=2, retry_delay=1.0)

        # Both URLs appear in the mapping, even the unreachable one.
        assert len(results) == 2
        assert results[good].success
        assert bad in results
        assert not results[bad].success
        assert results[bad].error_message is not None
|
||||
|
||||
|
||||
class TestCrawlerParallel:
|
||||
"""Tests for the parallel crawling capabilities of Crawler"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parallel_crawl_simple(self, clean_browser_hub):
|
||||
"""Test basic parallel crawling with same configuration"""
|
||||
# Use several URLs for parallel crawling
|
||||
urls = SAFE_URLS[:5]
|
||||
|
||||
# Basic crawler config
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
|
||||
)
|
||||
|
||||
# Crawl in parallel with default concurrency
|
||||
start_time = asyncio.get_event_loop().time()
|
||||
results = await Crawler.parallel_crawl(urls, crawler_config=crawler_config)
|
||||
end_time = asyncio.get_event_loop().time()
|
||||
|
||||
# Verify results
|
||||
assert len(results) == len(urls)
|
||||
successful = sum(1 for r in results.values() if r.success)
|
||||
|
||||
print(
|
||||
f"Parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s"
|
||||
)
|
||||
print(f"Success rate: {successful}/{len(urls)}")
|
||||
|
||||
# At least 80% should succeed
|
||||
assert successful / len(urls) >= 0.8
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parallel_crawl_with_concurrency_limit(self, clean_browser_hub):
|
||||
"""Test parallel crawling with concurrency limit"""
|
||||
# Use more URLs to test concurrency control
|
||||
urls = SAFE_URLS[:8]
|
||||
|
||||
# Custom crawler config
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
|
||||
)
|
||||
|
||||
# Limited concurrency
|
||||
concurrency = 2
|
||||
|
||||
# Time the crawl
|
||||
start_time = asyncio.get_event_loop().time()
|
||||
results = await Crawler.parallel_crawl(
|
||||
urls, crawler_config=crawler_config, concurrency=concurrency
|
||||
)
|
||||
end_time = asyncio.get_event_loop().time()
|
||||
|
||||
# Verify results
|
||||
assert len(results) == len(urls)
|
||||
successful = sum(1 for r in results.values() if r.success)
|
||||
|
||||
print(
|
||||
f"Parallel crawl with concurrency={concurrency} of {len(urls)} URLs completed in {end_time - start_time:.2f}s"
|
||||
)
|
||||
print(f"Success rate: {successful}/{len(urls)}")
|
||||
|
||||
# At least 80% should succeed
|
||||
assert successful / len(urls) >= 0.8
|
||||
|
||||
@pytest.mark.asyncio
async def test_parallel_crawl_with_different_configs(self, clean_browser_hub):
    """Test parallel crawling with different configurations for different URLs.

    Each batch is a ``(urls, config)`` tuple; batch2 is the only one that
    requests screenshots, so their presence is asserted per batch.
    """
    # Create URL batches with different configurations
    batch1 = (
        SAFE_URLS[:2],
        CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False),
    )
    batch2 = (
        SAFE_URLS[2:4],
        CrawlerRunConfig(wait_until="networkidle", screenshot=True),
    )
    batch3 = (
        SAFE_URLS[4:6],
        CrawlerRunConfig(wait_until="load", scan_full_page=True),
    )

    # Crawl with mixed configurations. get_running_loop() is the supported
    # call from inside a coroutine; asyncio.get_event_loop() is deprecated
    # in that context.
    loop = asyncio.get_running_loop()
    start_time = loop.time()
    results = await Crawler.parallel_crawl([batch1, batch2, batch3])
    end_time = loop.time()

    # Extract all URLs
    all_urls = batch1[0] + batch2[0] + batch3[0]

    # Every URL from every batch must be present in the results.
    assert len(results) == len(all_urls)

    # Check that screenshots are present only for batch2
    for url in batch1[0]:
        assert not results[url].screenshot

    for url in batch2[0]:
        assert results[url].screenshot

    print(
        f"Mixed-config parallel crawl of {len(all_urls)} URLs completed in {end_time - start_time:.2f}s"
    )
|
||||
|
||||
@pytest.mark.asyncio
async def test_parallel_crawl_with_shared_browser_hub(self, clean_browser_hub):
    """Test parallel crawling with a shared browser hub.

    A BrowserHub is created up front and handed to ``Crawler.parallel_crawl``
    so all workers share its pooled browsers; the hub is always closed in
    the ``finally`` block, even if the crawl raises.
    """
    # Create and initialize a browser hub
    browser_config = BrowserConfig(browser_type="chromium", headless=True)

    browser_hub = await BrowserHub.get_browser_manager(
        config=browser_config,
        max_browsers_per_config=3,
        max_pages_per_browser=4,
        initial_pool_size=1,
    )

    try:
        # Use the hub for parallel crawling
        urls = SAFE_URLS[:6]

        # get_running_loop() is the supported call inside a coroutine;
        # asyncio.get_event_loop() is deprecated in that context.
        loop = asyncio.get_running_loop()
        start_time = loop.time()
        results = await Crawler.parallel_crawl(
            urls,
            browser_hub=browser_hub,
            crawler_config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS, wait_until="domcontentloaded"
            ),
        )
        end_time = loop.time()

        # Every requested URL must have a corresponding result entry.
        assert len(results) == len(urls)
        successful = sum(1 for r in results.values() if r.success)

        print(
            f"Shared hub parallel crawl of {len(urls)} URLs completed in {end_time - start_time:.2f}s"
        )
        print(f"Success rate: {successful}/{len(urls)}")

        # Get browser hub statistics
        hub_stats = await browser_hub.get_pool_status()
        print(f"Browser hub stats: {hub_stats}")

        # At least 80% should succeed
        assert successful / len(urls) >= 0.8

    finally:
        # Clean up the browser hub
        await browser_hub.close()
|
||||
|
||||
|
||||
class TestCrawlerAdvanced:
    """Advanced tests for the Crawler utility class."""

    @pytest.mark.asyncio
    async def test_crawl_with_customized_batch_config(self, clean_browser_hub):
        """Test crawling with fully customized batch configuration.

        Each batch is a ``(urls, browser_config, crawler_config)`` triple;
        only batch2 requests screenshots, which is asserted per batch.
        """
        # Create URL batches with different browser and crawler configurations
        browser_config1 = BrowserConfig(browser_type="chromium", headless=True)
        browser_config2 = BrowserConfig(
            browser_type="chromium", headless=False, viewport_width=1920
        )

        crawler_config1 = CrawlerRunConfig(wait_until="domcontentloaded")
        crawler_config2 = CrawlerRunConfig(wait_until="networkidle", screenshot=True)

        batch1 = (SAFE_URLS[:2], browser_config1, crawler_config1)
        batch2 = (SAFE_URLS[2:4], browser_config2, crawler_config2)

        # Crawl with mixed configurations
        results = await Crawler.parallel_crawl([batch1, batch2])

        # Extract all URLs
        all_urls = batch1[0] + batch2[0]

        # Every URL from every batch must be present in the results.
        assert len(results) == len(all_urls)

        # Verify batch-specific processing
        for url in batch1[0]:
            assert results[url].screenshot is None  # No screenshots for batch1

        for url in batch2[0]:
            assert results[url].screenshot is not None  # Should have screenshots for batch2

    @pytest.mark.asyncio
    async def test_crawl_with_progress_callback(self, clean_browser_hub):
        """Test crawling with progress callback.

        Counts 'started'/'completed' events delivered by the crawler and
        verifies exactly two events (start + complete) arrive per URL.
        """
        # Use several URLs
        urls = SAFE_URLS[:5]

        # Track progress
        progress_data = {"started": 0, "completed": 0, "failed": 0, "updates": []}

        # Progress callback invoked by the crawler for each status change.
        async def on_progress(
            status: str, url: str, result: Optional[CrawlResultContainer] = None
        ):
            if status == "started":
                progress_data["started"] += 1
            elif status == "completed":
                progress_data["completed"] += 1
                # result is declared Optional, so guard before reading .success
                if result is not None and not result.success:
                    progress_data["failed"] += 1

            progress_data["updates"].append((status, url))
            print(f"Progress: {status} - {url}")

        # Crawl with progress tracking
        results = await Crawler.parallel_crawl(
            urls,
            crawler_config=CrawlerRunConfig(wait_until="domcontentloaded"),
            progress_callback=on_progress,
        )

        # Verify progress tracking
        assert progress_data["started"] == len(urls)
        assert progress_data["completed"] == len(urls)
        assert len(progress_data["updates"]) == len(urls) * 2  # start + complete events

    @pytest.mark.asyncio
    async def test_crawl_with_dynamic_retry_strategy(self, clean_browser_hub):
        """Test crawling with a dynamic retry strategy.

        The strategy retries only the 500 URL; 404 and success URLs are
        never retried. Status codes are asserted per URL.
        """
        # Include URLs that might fail
        urls = [
            "https://example.com",
            "https://httpstat.us/500",
            "https://httpstat.us/404",
        ]

        # Custom retry strategy: returns (should_retry, delay_seconds).
        async def retry_strategy(
            url: str, attempt: int, error: Exception
        ) -> Tuple[bool, float]:
            # Only retry 500 errors, not 404s
            if "500" in url:
                return True, 1.0  # Retry with 1 second delay
            return False, 0.0  # Don't retry other errors

        # Crawl with custom retry strategy
        results = await Crawler.parallel_crawl(
            urls,
            crawler_config=CrawlerRunConfig(wait_until="domcontentloaded"),
            retry_strategy=retry_strategy,
            max_retries=3,
        )

        # Verify results
        assert len(results) == len(urls)

        # Example.com should succeed
        assert results[urls[0]].success

        # httpstat.us pages return content even for error status codes
        # so our crawler marks them as successful since it got HTML content
        # Verify that we got the expected status code
        assert results[urls[1]].status_code == 500

        # 404 should have the correct status code
        assert results[urls[2]].status_code == 404

    @pytest.mark.asyncio
    async def test_crawl_with_very_large_batch(self, clean_browser_hub):
        """Test crawling with a larger batch of URLs under one concurrency cap."""
        # Create a batch by repeating our safe URLs. dict.fromkeys() dedupes
        # while preserving order, so this yields 5 unique URLs.
        large_batch = list(dict.fromkeys(SAFE_URLS[:5] * 2))

        # Set a reasonable concurrency limit
        concurrency = 10

        # Time the crawl. get_running_loop() is the supported call inside a
        # coroutine; asyncio.get_event_loop() is deprecated in that context.
        loop = asyncio.get_running_loop()
        start_time = loop.time()
        results = await Crawler.parallel_crawl(
            large_batch,
            crawler_config=CrawlerRunConfig(
                wait_until="domcontentloaded",
                page_timeout=10000,  # Shorter timeout for large batch
            ),
            concurrency=concurrency,
        )
        end_time = loop.time()

        # Verify results
        assert len(results) == len(large_batch)
        successful = sum(1 for r in results.values() if r.success)

        print(
            f"Large batch crawl of {len(large_batch)} URLs completed in {end_time - start_time:.2f}s"
        )
        print(f"Success rate: {successful}/{len(large_batch)}")
        print(
            f"Average time per URL: {(end_time - start_time) / len(large_batch):.2f}s"
        )

        # At least 80% should succeed (from our unique URLs)
        assert successful / len(results) >= 0.8
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Delegate to pytest so the async test classes above run under its runner.
    pytest.main(["-xvs", __file__])
|
||||
109
tests/pipeline/test_pipeline.py
Normal file
109
tests/pipeline/test_pipeline.py
Normal file
@@ -0,0 +1,109 @@
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode,
|
||||
DefaultMarkdownGenerator,
|
||||
PruningContentFilter
|
||||
)
|
||||
from pipeline import Pipeline
|
||||
|
||||
async def main():
    """Crawl a single page through the Pipeline API and print a summary."""
    # Configuration objects: one for the browser session, one for the crawl.
    browser_config = BrowserConfig(headless=True, verbose=True)
    md_generator = DefaultMarkdownGenerator(
        content_filter=PruningContentFilter(
            threshold=0.48,
            threshold_type="fixed",
            min_word_threshold=0,
        )
    )
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=md_generator,
    )

    # The pipeline is an async context manager; its resources are released
    # automatically when the block exits.
    async with Pipeline(browser_config=browser_config) as pipeline:
        result = await pipeline.crawl(
            url="https://www.example.com",
            config=crawler_config,
        )

        # Report the outcome of the crawl.
        print(f"URL: {result.url}")
        print(f"Success: {result.success}")

        if result.success:
            print("\nMarkdown excerpt:")
            print(result.markdown.raw_markdown[:500] + "...")
        else:
            print(f"Error: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: drive the async demo via asyncio.run().
    asyncio.run(main())
|
||||
|
||||
|
||||
class CrawlTarget:
    """Groups a set of URLs with an optional crawl configuration.

    ``config`` may be left as ``None`` to signal that the consumer should
    fall back to its default crawl settings.
    """

    def __init__(self, urls, config=None):
        self.urls = urls
        self.config = config

    def __repr__(self):
        return "CrawlTarget(urls={}, config={})".format(self.urls, self.config)
|
||||
|
||||
|
||||
|
||||
|
||||
# async def main():
|
||||
# # Create configuration objects
|
||||
# browser_config = BrowserConfig(headless=True, verbose=True)
|
||||
|
||||
# # Define different configurations
|
||||
# config1 = CrawlerRunConfig(
|
||||
# cache_mode=CacheMode.BYPASS,
|
||||
# markdown_generator=DefaultMarkdownGenerator(
|
||||
# content_filter=PruningContentFilter(threshold=0.48)
|
||||
# ),
|
||||
# )
|
||||
|
||||
# config2 = CrawlerRunConfig(
|
||||
# cache_mode=CacheMode.ENABLED,
|
||||
# screenshot=True,
|
||||
# pdf=True
|
||||
# )
|
||||
|
||||
# # Create crawl targets
|
||||
# targets = [
|
||||
# CrawlTarget(
|
||||
# urls=["https://www.example.com", "https://www.wikipedia.org"],
|
||||
# config=config1
|
||||
# ),
|
||||
# CrawlTarget(
|
||||
# urls="https://news.ycombinator.com",
|
||||
# config=config2
|
||||
# ),
|
||||
# CrawlTarget(
|
||||
# urls=["https://github.com", "https://stackoverflow.com", "https://python.org"],
|
||||
# config=None
|
||||
# )
|
||||
# ]
|
||||
|
||||
# # Create and use pipeline with context manager
|
||||
# async with Pipeline(browser_config=browser_config) as pipeline:
|
||||
# all_results = await pipeline.crawl_batch(targets)
|
||||
|
||||
# for target_key, results in all_results.items():
|
||||
# print(f"\n===== Results for {target_key} =====")
|
||||
# print(f"Number of URLs crawled: {len(results)}")
|
||||
|
||||
# for i, result in enumerate(results):
|
||||
# print(f"\nURL {i+1}: {result.url}")
|
||||
# print(f"Success: {result.success}")
|
||||
|
||||
# if result.success:
|
||||
# print(f"Content length: {len(result.markdown.raw_markdown)} chars")
|
||||
# else:
|
||||
# print(f"Error: {result.error_message}")
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# asyncio.run(main())
|
||||
Reference in New Issue
Block a user