Some refactoring: moved the pipeline submodule folder into the main package.
This commit is contained in:
222
crawl4ai/pipeline/demo_browser_hub_pipeline.py
Normal file
222
crawl4ai/pipeline/demo_browser_hub_pipeline.py
Normal file
@@ -0,0 +1,222 @@
|
|||||||
|
# demo_browser_hub.py
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from crawl4ai.browser.browser_hub import BrowserHub
|
||||||
|
from pipeline import create_pipeline
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
from crawl4ai.models import CrawlResultContainer
|
||||||
|
from crawl4ai.cache_context import CacheMode
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
from crawl4ai import PruningContentFilter
|
||||||
|
|
||||||
|
async def create_prewarmed_browser_hub(urls_to_crawl: List[str]):
    """Create a pre-warmed browser hub with pooled browsers and pages.

    Args:
        urls_to_crawl: URLs the caller intends to crawl. Currently unused;
            kept for interface stability and future pool-sizing heuristics.

    Returns:
        An initialized BrowserHub with pre-warmed pages ready for crawling.
    """
    # Set up logging
    logger = AsyncLogger(verbose=True)
    logger.info("Setting up pre-warmed browser hub", tag="DEMO")

    # Browser configuration shared by all pooled browsers.
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,  # Set to False to see the browsers in action
        viewport_width=1280,
        viewport_height=800,
        light_mode=True,  # Optimize for performance
        java_script_enabled=True
    )

    # Crawler configurations for pre-warming; each entry can carry a
    # different user agent so pages are ready for different scenarios.
    crawler_configs = [
        CrawlerRunConfig(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            wait_until="networkidle"
        ),
    ]

    # Pool sizing: this demo uses a minimal 1 browser x 1 page pool.
    num_browsers = 1
    pages_per_browser = 1

    # Distribute the total page budget across the crawler configurations;
    # the last configuration absorbs any remainder from integer division.
    page_configs = []
    total_pages = num_browsers * pages_per_browser
    pages_per_config = total_pages // len(crawler_configs)

    for i, config in enumerate(crawler_configs):
        if i == len(crawler_configs) - 1:
            remaining = total_pages - (pages_per_config * (len(crawler_configs) - 1))
            page_configs.append((browser_config, config, remaining))
        else:
            page_configs.append((browser_config, config, pages_per_config))

    # Create browser hub with pre-warmed pages, timing the warm-up.
    start_time = asyncio.get_event_loop().time()
    logger.info("Initializing browser hub with pre-warmed pages...", tag="DEMO")

    hub = await BrowserHub.get_browser_manager(
        config=browser_config,
        hub_id="demo_hub",
        logger=logger,
        max_browsers_per_config=num_browsers,
        max_pages_per_browser=pages_per_browser,
        initial_pool_size=num_browsers,
        page_configs=page_configs
    )

    end_time = asyncio.get_event_loop().time()
    logger.success(
        message="Browser hub initialized with {total_pages} pre-warmed pages in {duration:.2f} seconds",
        tag="DEMO",
        params={
            "total_pages": total_pages,
            "duration": end_time - start_time
        }
    )

    # Get and display pool status
    status = await hub.get_pool_status()
    logger.info(
        message="Browser pool status: {status}",
        tag="DEMO",
        params={"status": status}
    )

    return hub
async def crawl_urls_with_hub(hub, urls: List[str]) -> List[CrawlResultContainer]:
    """Crawl a list of URLs in parallel using a pre-warmed browser hub.

    Args:
        hub: An initialized BrowserHub shared by all crawls.
        urls: The URLs to crawl concurrently.

    Returns:
        The crawl results, in the same order as ``urls``.
    """
    logger = AsyncLogger(verbose=True)

    # Crawler configuration shared by every URL.
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            )
        ),
        wait_until="networkidle",
        screenshot=True
    )

    # Create pipeline with the browser hub
    pipeline = await create_pipeline(
        browser_hub=hub,
        logger=logger
    )

    async def crawl_url(url):
        logger.info(f"Crawling {url}...", tag="CRAWL")
        result = await pipeline.crawl(url=url, config=crawler_config)
        logger.success(f"Completed crawl of {url}", tag="CRAWL")
        return result

    # Fan out all crawls at once; gather preserves input order.
    # (The previous dead `results = []` assignment was removed: gather
    # builds and returns the full list itself.)
    tasks = [crawl_url(url) for url in urls]
    results = await asyncio.gather(*tasks)

    return results
async def main():
    """Main demo function: build a pre-warmed hub, crawl, and report."""
    # List of URLs to crawl (uncomment entries to stress the pool harder).
    urls_to_crawl = [
        "https://example.com",
        # "https://www.python.org",
        # "https://httpbin.org/html",
        # "https://news.ycombinator.com",
        # "https://github.com",
        # "https://pypi.org",
        # "https://docs.python.org/3/",
        # "https://opensource.org",
        # "https://whatismyipaddress.com",
        # "https://en.wikipedia.org/wiki/Web_scraping"
    ]

    # Set up logging
    logger = AsyncLogger(verbose=True)
    logger.info("Starting browser hub demo", tag="DEMO")

    try:
        # Create pre-warmed browser hub
        hub = await create_prewarmed_browser_hub(urls_to_crawl)

        # Use hub to crawl URLs
        logger.info("Crawling URLs in parallel...", tag="DEMO")
        start_time = asyncio.get_event_loop().time()

        results = await crawl_urls_with_hub(hub, urls_to_crawl)

        end_time = asyncio.get_event_loop().time()

        # Guard against an empty result list to avoid ZeroDivisionError.
        duration = end_time - start_time
        avg = duration / len(results) if results else 0.0
        logger.success(
            message="Crawled {count} URLs in {duration:.2f} seconds (average: {avg:.2f} seconds per URL)",
            tag="DEMO",
            params={
                "count": len(results),
                "duration": duration,
                "avg": avg
            }
        )

        # Print summary of results
        logger.info("Crawl results summary:", tag="DEMO")
        for i, result in enumerate(results):
            logger.info(
                message="{idx}. {url}: Success={success}, Content length={length}",
                tag="RESULT",
                params={
                    "idx": i + 1,
                    "url": result.url,
                    "success": result.success,
                    "length": len(result.html) if result.html else 0
                }
            )

            if result.success and result.markdown and result.markdown.raw_markdown:
                # Print a snippet of the markdown
                markdown_snippet = result.markdown.raw_markdown[:150] + "..."
                logger.info(
                    message="    Markdown: {snippet}",
                    tag="RESULT",
                    params={"snippet": markdown_snippet}
                )

        # Display final browser pool status
        status = await hub.get_pool_status()
        logger.info(
            message="Final browser pool status: {status}",
            tag="DEMO",
            params={"status": status}
        )

    finally:
        # Always release pooled browsers, even when the demo fails midway.
        logger.info("Shutting down browser hub...", tag="DEMO")
        await BrowserHub.shutdown_all()
        logger.success("Demo completed", tag="DEMO")
if __name__ == "__main__":
    # Entry point: run the demo inside a fresh event loop.
    asyncio.run(main())
505
crawl4ai/pipeline/extended_browser_hub_tests.py
Normal file
505
crawl4ai/pipeline/extended_browser_hub_tests.py
Normal file
@@ -0,0 +1,505 @@
|
|||||||
|
# extended_browser_hub_tests.py
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
from crawl4ai.browser.browser_hub import BrowserHub
|
||||||
|
from pipeline import create_pipeline
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
from crawl4ai.cache_context import CacheMode
|
||||||
|
|
||||||
|
# Common test URLs
|
||||||
|
TEST_URLS = [
|
||||||
|
"https://example.com",
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://httpbin.org/html",
|
||||||
|
"https://httpbin.org/headers",
|
||||||
|
"https://httpbin.org/ip",
|
||||||
|
"https://httpstat.us/200"
|
||||||
|
]
|
||||||
|
|
||||||
|
class TestResults:
    """Container that accumulates results, timing, and errors for one test run."""

    def __init__(self, name: str):
        self.name = name       # Human-readable test name
        self.results = []      # Crawl result objects exposing a `.success` attribute
        self.start_time = None # Event-loop timestamp when the test began
        self.end_time = None   # Event-loop timestamp when the test finished
        self.errors = []       # Exceptions captured during the run

    @property
    def duration(self) -> float:
        """Elapsed seconds between start and end, or 0.0 if timing is incomplete."""
        # Compare against None explicitly: a timestamp of exactly 0.0 is
        # falsy but perfectly valid, so a truthiness check would be wrong.
        if self.start_time is not None and self.end_time is not None:
            return self.end_time - self.start_time
        return 0.0

    @property
    def success_rate(self) -> float:
        """Percentage of results flagged successful (0.0 when there are none)."""
        if not self.results:
            return 0.0
        return sum(1 for r in self.results if r.success) / len(self.results) * 100

    def log_summary(self, logger: "AsyncLogger"):
        """Emit a one-shot summary (and any captured errors) via the logger."""
        logger.info(f"=== Test: {self.name} ===", tag="SUMMARY")
        logger.info(
            message="Duration: {duration:.2f}s, Success rate: {success_rate:.1f}%, Results: {count}",
            tag="SUMMARY",
            params={
                "duration": self.duration,
                "success_rate": self.success_rate,
                "count": len(self.results)
            }
        )

        if self.errors:
            logger.error(
                message="Errors ({count}): {errors}",
                tag="SUMMARY",
                params={
                    "count": len(self.errors),
                    "errors": "; ".join(str(e) for e in self.errors)
                }
            )
# ======== TEST SCENARIO 1: Simple default configuration ========
async def test_default_configuration():
    """
    Test Scenario 1: Simple default configuration

    This tests the basic case where the user does not provide any specific
    browser configuration, relying on default auto-setup.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Default Configuration")

    try:
        # Build a pipeline without any explicit browser configuration.
        pipeline = await create_pipeline(logger=logger)

        results.start_time = asyncio.get_event_loop().time()

        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="domcontentloaded"
        )

        # Visit every test URL one after another.
        for url in TEST_URLS:
            try:
                logger.info(f"Crawling {url} with default configuration", tag="TEST")
                outcome = await pipeline.crawl(url=url, config=run_config)
                results.results.append(outcome)

                logger.success(
                    message="Result: url={url}, success={success}, content_length={length}",
                    tag="TEST",
                    params={
                        "url": url,
                        "success": outcome.success,
                        "length": len(outcome.html) if outcome.html else 0
                    }
                )
            except Exception as e:
                logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
                results.errors.append(e)

        results.end_time = asyncio.get_event_loop().time()

    except Exception as e:
        logger.error(f"Test failed with error: {str(e)}", tag="TEST")
        results.errors.append(e)

    # Always report, even when the run aborted early.
    results.log_summary(logger)

    return results
# ======== TEST SCENARIO 2: Detailed custom configuration ========
async def test_custom_configuration():
    """
    Test Scenario 2: Detailed custom configuration

    This tests the case where the user provides detailed browser configuration
    to customize the browser behavior.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Custom Configuration")

    try:
        # Fully-specified browser settings.
        custom_browser = BrowserConfig(
            browser_type="chromium",
            headless=True,
            viewport_width=1920,
            viewport_height=1080,
            user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
            light_mode=True,
            ignore_https_errors=True,
            extra_args=["--disable-extensions"]
        )

        # Fully-specified crawl settings.
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle",
            page_timeout=30000,
            screenshot=True,
            pdf=False,
            screenshot_wait_for=0.5,
            wait_for_images=True,
            scan_full_page=True,
            scroll_delay=0.2,
            process_iframes=True,
            remove_overlay_elements=True
        )

        pipeline = await create_pipeline(
            browser_config=custom_browser,
            logger=logger
        )

        results.start_time = asyncio.get_event_loop().time()

        # Visit every test URL one after another.
        for url in TEST_URLS:
            try:
                logger.info(f"Crawling {url} with custom configuration", tag="TEST")
                outcome = await pipeline.crawl(url=url, config=run_config)
                results.results.append(outcome)

                took_screenshot = outcome.screenshot is not None

                logger.success(
                    message="Result: url={url}, success={success}, screenshot={screenshot}, content_length={length}",
                    tag="TEST",
                    params={
                        "url": url,
                        "success": outcome.success,
                        "screenshot": took_screenshot,
                        "length": len(outcome.html) if outcome.html else 0
                    }
                )
            except Exception as e:
                logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
                results.errors.append(e)

        results.end_time = asyncio.get_event_loop().time()

        # Inspect the shared browser hub via a throwaway crawl context.
        try:
            context = await pipeline.process({"url": "about:blank", "config": run_config})
            browser_hub = context.get("browser_hub")
            if browser_hub:
                status = await browser_hub.get_pool_status()
                logger.info(
                    message="Browser hub status: {status}",
                    tag="TEST",
                    params={"status": status}
                )
        except Exception as e:
            logger.error(f"Failed to get browser hub status: {str(e)}", tag="TEST")

    except Exception as e:
        logger.error(f"Test failed with error: {str(e)}", tag="TEST")
        results.errors.append(e)

    # Always report, even when the run aborted early.
    results.log_summary(logger)

    return results
# ======== TEST SCENARIO 3: Using pre-initialized browser hub ========
async def test_preinitalized_browser_hub():
    """
    Test Scenario 3: Using pre-initialized browser hub

    This tests the case where a browser hub is initialized separately
    and then passed to the pipeline.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Pre-initialized Browser Hub")

    browser_hub = None
    try:
        # Stand up the hub on its own before any pipeline exists.
        logger.info("Initializing browser hub separately", tag="TEST")

        hub_browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            verbose=True
        )

        browser_hub = await BrowserHub.get_browser_manager(
            config=hub_browser_config,
            hub_id="test_preinitalized",
            logger=logger,
            max_browsers_per_config=2,
            max_pages_per_browser=3,
            initial_pool_size=2
        )

        # Show the pool state right after warm-up.
        status = await browser_hub.get_pool_status()
        logger.info(
            message="Initial browser hub status: {status}",
            tag="TEST",
            params={"status": status}
        )

        # Hand the ready-made hub to the pipeline.
        pipeline = await create_pipeline(
            browser_hub=browser_hub,
            logger=logger
        )

        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle",
            screenshot=True
        )

        results.start_time = asyncio.get_event_loop().time()

        async def crawl_url(url):
            try:
                logger.info(f"Crawling {url} with pre-initialized hub", tag="TEST")
                outcome = await pipeline.crawl(url=url, config=run_config)
                logger.success(f"Completed crawl of {url}", tag="TEST")
                return outcome
            except Exception as e:
                logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
                results.errors.append(e)
                return None

        # Fan out every URL at once and keep only successful returns.
        all_results = await asyncio.gather(*(crawl_url(url) for url in TEST_URLS))
        results.results = [r for r in all_results if r is not None]

        results.end_time = asyncio.get_event_loop().time()

        # Show the pool state after the crawls completed.
        status = await browser_hub.get_pool_status()
        logger.info(
            message="Final browser hub status: {status}",
            tag="TEST",
            params={"status": status}
        )

    except Exception as e:
        logger.error(f"Test failed with error: {str(e)}", tag="TEST")
        results.errors.append(e)

    # Always report, even when the run aborted early.
    results.log_summary(logger)

    return results, browser_hub
# ======== TEST SCENARIO 4: Parallel pipelines sharing browser hub ========
async def test_parallel_pipelines():
    """
    Test Scenario 4: Multiple parallel pipelines sharing browser hub

    This tests the case where multiple pipelines share the same browser hub,
    demonstrating resource sharing and parallel operation.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Parallel Pipelines")

    # Reuse the browser hub created by the previous scenario.
    _, browser_hub = await test_preinitalized_browser_hub()

    try:
        # Three pipelines, all backed by the same hub.
        pipelines = []
        for _ in range(3):
            shared_pipeline = await create_pipeline(
                browser_hub=browser_hub,
                logger=logger
            )
            pipelines.append(shared_pipeline)

        logger.info(f"Created {len(pipelines)} pipelines sharing the same browser hub", tag="TEST")

        # One distinct crawl configuration per pipeline.
        configs = [
            CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False),
            CrawlerRunConfig(wait_until="networkidle", screenshot=True),
            CrawlerRunConfig(wait_until="load", scan_full_page=True)
        ]

        results.start_time = asyncio.get_event_loop().time()

        async def process_with_pipeline(pipeline_idx, urls):
            """Crawl the given URLs sequentially on one specific pipeline."""
            collected = []
            for url in urls:
                try:
                    logger.info(f"Pipeline {pipeline_idx} crawling {url}", tag="TEST")
                    outcome = await pipelines[pipeline_idx].crawl(
                        url=url,
                        config=configs[pipeline_idx]
                    )
                    collected.append(outcome)
                    logger.success(
                        message="Pipeline {idx} completed: url={url}, success={success}",
                        tag="TEST",
                        params={
                            "idx": pipeline_idx,
                            "url": url,
                            "success": outcome.success
                        }
                    )
                except Exception as e:
                    logger.error(
                        message="Pipeline {idx} error: {error}",
                        tag="TEST",
                        params={
                            "idx": pipeline_idx,
                            "error": str(e)
                        }
                    )
                    results.errors.append(e)
            return collected

        # Split the URL list across the three pipelines.
        pipeline_urls = [
            TEST_URLS[:2],
            TEST_URLS[2:4],
            TEST_URLS[4:5] * 2  # Duplicate the last URL to have 2 for pipeline 3
        ]

        # Run all three pipelines concurrently.
        batches = await asyncio.gather(*(
            process_with_pipeline(i, urls)
            for i, urls in enumerate(pipeline_urls)
        ))

        # Flatten the per-pipeline batches into one result list.
        for batch in batches:
            results.results.extend(batch)

        results.end_time = asyncio.get_event_loop().time()

        # Show the shared pool state after all pipelines finished.
        status = await browser_hub.get_pool_status()
        logger.info(
            message="Browser hub status after parallel pipelines: {status}",
            tag="TEST",
            params={"status": status}
        )

    except Exception as e:
        logger.error(f"Test failed with error: {str(e)}", tag="TEST")
        results.errors.append(e)

    # Always report, even when the run aborted early.
    results.log_summary(logger)

    return results
# ======== TEST SCENARIO 5: Browser hub with connection string ========
async def test_connection_string():
    """
    Test Scenario 5: Browser hub with connection string

    This tests the case where a browser hub is initialized from a connection string,
    simulating connecting to a running browser hub service.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Connection String")

    try:
        # In a real deployment this would point at a running hub service;
        # here it is only a simulated endpoint.
        connection_string = "localhost:9222"  # Simulated connection string

        pipeline = await create_pipeline(
            browser_hub_connection=connection_string,
            logger=logger
        )

        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle"
        )

        results.start_time = asyncio.get_event_loop().time()

        # A single URL is enough to validate the connection path.
        url = TEST_URLS[0]
        try:
            logger.info(f"Crawling {url} with connection string hub", tag="TEST")
            outcome = await pipeline.crawl(url=url, config=run_config)
            results.results.append(outcome)

            logger.success(
                message="Result: url={url}, success={success}, content_length={length}",
                tag="TEST",
                params={
                    "url": url,
                    "success": outcome.success,
                    "length": len(outcome.html) if outcome.html else 0
                }
            )
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
            results.errors.append(e)

        results.end_time = asyncio.get_event_loop().time()

    except Exception as e:
        logger.error(f"Test failed with error: {str(e)}", tag="TEST")
        results.errors.append(e)

    # Always report, even when the run aborted early.
    results.log_summary(logger)

    return results
# ======== RUN ALL TESTS ========
async def run_all_tests():
    """Run all test scenarios"""
    logger = AsyncLogger(verbose=True)
    logger.info("=== STARTING BROWSER HUB TESTS ===", tag="MAIN")

    try:
        # Only the default-configuration scenario is enabled right now;
        # uncomment the others to run the full matrix.
        await test_default_configuration()
        # await test_custom_configuration()
        # await test_preinitalized_browser_hub()
        # await test_parallel_pipelines()
        # await test_connection_string()

    except Exception as e:
        logger.error(f"Test suite failed: {str(e)}", tag="MAIN")
    finally:
        # Clean up all browser hubs, regardless of test outcome.
        logger.info("Shutting down all browser hubs...", tag="MAIN")
        await BrowserHub.shutdown_all()
        logger.success("All tests completed", tag="MAIN")
if __name__ == "__main__":
    # Entry point: run the whole suite inside a fresh event loop.
    asyncio.run(run_all_tests())
702
crawl4ai/pipeline/middlewares.py
Normal file
702
crawl4ai/pipeline/middlewares.py
Normal file
@@ -0,0 +1,702 @@
|
|||||||
|
import time
|
||||||
|
import sys
|
||||||
|
from typing import Dict, Any, List
|
||||||
|
import json
|
||||||
|
|
||||||
|
from crawl4ai.models import (
|
||||||
|
CrawlResult,
|
||||||
|
MarkdownGenerationResult,
|
||||||
|
ScrapingResult,
|
||||||
|
CrawlResultContainer,
|
||||||
|
)
|
||||||
|
from crawl4ai.async_database import async_db_manager
|
||||||
|
from crawl4ai.cache_context import CacheMode, CacheContext
|
||||||
|
from crawl4ai.utils import (
|
||||||
|
sanitize_input_encode,
|
||||||
|
InvalidCSSSelectorError,
|
||||||
|
fast_format_html,
|
||||||
|
create_box_message,
|
||||||
|
get_error_context,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def initialize_context_middleware(context: Dict[str, Any]) -> int:
    """Initialize the context with basic configuration and validation.

    Validates the target URL, defaults the cache mode, and attaches a
    CacheContext plus a start timestamp to the pipeline context.

    Args:
        context: The pipeline context dictionary (expects "url" and "config").

    Returns:
        int: 1 for success, 0 for failure (with context["error_message"] set).
    """
    url = context.get("url")
    config = context.get("config")

    if not isinstance(url, str) or not url:
        context["error_message"] = "Invalid URL, make sure the URL is a non-empty string"
        return 0

    # Guard against a missing config instead of raising AttributeError below;
    # this keeps the middleware's 0/error_message failure contract uniform.
    if config is None:
        context["error_message"] = "Missing crawler config in pipeline context"
        return 0

    # Default to ENABLED if no cache mode specified
    if config.cache_mode is None:
        config.cache_mode = CacheMode.ENABLED

    # Create cache context
    context["cache_context"] = CacheContext(url, config.cache_mode, False)
    context["start_time"] = time.perf_counter()

    return 1
# middlewares.py additions

async def browser_hub_middleware(context: Dict[str, Any]) -> int:
    """
    Initialize or connect to a Browser-Hub and add it to the pipeline context.

    This middleware handles browser hub initialization for all three scenarios:
    1. Default configuration when nothing is specified
    2. Custom configuration when browser_config is provided
    3. Connection to existing hub when browser_hub_connection is provided

    Args:
        context: The pipeline context dictionary

    Returns:
        int: 1 for success, 0 for failure
    """
    from crawl4ai.browser.browser_hub import BrowserHub

    try:
        # Reuse a hub that an earlier middleware (or the caller) already attached.
        if context.get("browser_hub"):
            return 1

        # Get or create a Browser-Hub from the settings carried in the context.
        hub = await BrowserHub.get_browser_manager(
            config=context.get("browser_config"),
            hub_id=context.get("browser_hub_id"),
            connection_info=context.get("browser_hub_connection"),
            logger=context.get("logger")
        )

        # Make the hub available to downstream middlewares.
        context["browser_hub"] = hub
        return 1
    except Exception as e:
        context["error_message"] = f"Failed to initialize browser hub: {str(e)}"
        return 0
async def fetch_content_middleware(context: Dict[str, Any]) -> int:
    """
    Fetch content from the web using the browser hub.

    Builds a Playwright crawler strategy, swaps in the shared browser hub as
    its browser manager, performs the crawl, and copies the response fields
    into the pipeline context.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: 1 for success, 0 for failure
    """
    url = context.get("url")
    config = context.get("config")
    browser_hub = context.get("browser_hub")
    logger = context.get("logger")

    # A cached result with HTML already present means nothing to fetch.
    if context.get("cached_result") and context.get("html"):
        return 1

    try:
        # Create crawler strategy without initializing its browser manager
        from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

        strategy = AsyncPlaywrightCrawlerStrategy(
            browser_config=browser_hub.config if browser_hub else None,
            logger=logger
        )

        # Share the hub instead of letting the strategy spawn its own browsers.
        strategy.browser_manager = browser_hub

        # The strategy pulls pages from the shared manager; no explicit
        # browser initialization is performed here.
        response = await strategy.crawl(url, config=config)

        # Expose every part of the response to downstream middlewares.
        context["html"] = response.html
        context["screenshot_data"] = response.screenshot
        context["pdf_data"] = response.pdf_data
        context["js_execution_result"] = response.js_execution_result
        context["async_response"] = response

        return 1
    except Exception as e:
        context["error_message"] = f"Error fetching content: {str(e)}"
        return 0
async def check_cache_middleware(context: Dict[str, Any]) -> int:
    """Check if there's a cached result and load it if available.

    Resets all result slots in the context, then, when the cache policy
    allows reading, loads the cached page. A cache hit is discarded (set
    back to None) when the run config asks for a screenshot or PDF that
    the cached entry does not contain.

    Returns:
        int: always 1 — a cache miss is not a failure.
    """
    url = context.get("url")
    config = context.get("config")
    cache_context = context.get("cache_context")
    logger = context.get("logger")

    # Initialize all result slots so downstream middleware can rely on the keys existing.
    context["cached_result"] = None
    context["html"] = None
    context["extracted_content"] = None
    context["screenshot_data"] = None
    context["pdf_data"] = None

    # Try to get cached result if the cache mode permits reads.
    if cache_context.should_read():
        cached_result = await async_db_manager.aget_cached_url(url)
        context["cached_result"] = cached_result

        if cached_result:
            html = sanitize_input_encode(cached_result.html)
            extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
            # "[]" means a previous extraction ran but found nothing — treat as absent.
            extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content

            # If a screenshot/PDF is requested but not in the cache, invalidate the
            # hit so the fetch middleware re-crawls the page.
            screenshot_data = cached_result.screenshot
            pdf_data = cached_result.pdf

            if config.screenshot and not screenshot_data:
                context["cached_result"] = None

            if config.pdf and not pdf_data:
                context["cached_result"] = None

            # The artifacts themselves are still published even when the hit was
            # invalidated above; fetch middleware overwrites them on re-crawl.
            context["html"] = html
            context["extracted_content"] = extracted_content
            context["screenshot_data"] = screenshot_data
            context["pdf_data"] = pdf_data

            logger.url_status(
                url=cache_context.display_url,
                success=bool(html),
                timing=time.perf_counter() - context["start_time"],
                tag="FETCH",
            )

    return 1
|
||||||
|
|
||||||
|
|
||||||
|
async def configure_proxy_middleware(context: Dict[str, Any]) -> int:
    """Configure proxy if a proxy rotation strategy is available.

    Asks the configured rotation strategy for the next proxy and, when one
    is returned, installs it on the run config for the upcoming fetch.

    Returns:
        int: always 1 — proxy rotation is optional.
    """
    # Cached responses never touch the network, so no proxy is needed.
    if context.get("cached_result") and context.get("html"):
        return 1

    cfg = context.get("config")
    logger = context.get("logger")

    rotation = cfg.proxy_rotation_strategy if cfg else None
    if rotation:
        candidate = await rotation.get_next_proxy()
        if candidate:
            logger.info(
                message="Switch proxy: {proxy}",
                tag="PROXY",
                params={"proxy": candidate.server},
            )
            cfg.proxy_config = candidate

    return 1
|
||||||
|
|
||||||
|
|
||||||
|
async def check_robots_txt_middleware(context: Dict[str, Any]) -> int:
    """Check if the URL is allowed by robots.txt if enabled.

    When the run config enables robots.txt checking and the parser denies
    the URL, a 403 CrawlResult is placed in the context and the pipeline
    is stopped.

    Returns:
        int: 1 when allowed (or checking disabled), 0 when blocked.
    """
    # Nothing to verify when a cached page is being served.
    if context.get("cached_result") and context.get("html"):
        return 1

    cfg = context.get("config")
    if not (cfg and cfg.check_robots_txt):
        return 1

    target = context.get("url")
    parser = context.get("robots_parser")
    agent = context.get("browser_config").user_agent

    if await parser.can_fetch(target, agent):
        return 1

    # Blocked: short-circuit the pipeline with a ready-made 403 result.
    context["crawl_result"] = CrawlResult(
        url=target,
        html="",
        success=False,
        status_code=403,
        error_message="Access denied by robots.txt",
        response_headers={"X-Robots-Status": "Blocked by robots.txt"}
    )
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_content_middleware_(context: Dict[str, Any]) -> int:
    """Fetch content from the web using the crawler strategy.

    NOTE(review): the trailing underscore suggests this is the legacy fetch
    implementation superseded by the browser-hub based
    fetch_content_middleware — confirm before removing. It uses the
    strategy placed in the context by the Pipeline rather than the hub.

    Args:
        context: The pipeline context dictionary.

    Returns:
        int: 1 for success, 0 for failure.
    """
    url = context.get("url")
    config = context.get("config")
    crawler_strategy = context.get("crawler_strategy")
    logger = context.get("logger")

    # Skip the network round-trip when a usable cached result was loaded.
    if context.get("cached_result") and context.get("html"):
        return 1

    try:
        t1 = time.perf_counter()

        # Per-crawl user-agent override takes precedence over the strategy default.
        if config.user_agent:
            crawler_strategy.update_user_agent(config.user_agent)

        # Delegate the actual page fetch to the crawler strategy.
        async_response = await crawler_strategy.crawl(url, config=config)

        html = sanitize_input_encode(async_response.html)
        screenshot_data = async_response.screenshot
        pdf_data = async_response.pdf_data
        js_execution_result = async_response.js_execution_result

        t2 = time.perf_counter()
        logger.url_status(
            url=context["cache_context"].display_url,
            success=bool(html),
            timing=t2 - t1,
            tag="FETCH",
        )

        # Publish fetch artifacts for the downstream scraping/markdown stages.
        context["html"] = html
        context["screenshot_data"] = screenshot_data
        context["pdf_data"] = pdf_data
        context["js_execution_result"] = js_execution_result
        context["async_response"] = async_response

        return 1
    except Exception as e:
        context["error_message"] = f"Error fetching content: {str(e)}"
        return 0
|
||||||
|
|
||||||
|
|
||||||
|
async def scrape_content_middleware(context: Dict[str, Any]) -> int:
    """Apply the configured scraping strategy to extract content from the HTML.

    Runs ``config.scraping_strategy.scrap`` on the fetched HTML and publishes
    ``cleaned_html``, ``media``, ``links`` and ``metadata`` into the context.
    Handles both dict-shaped and ScrapingResult-shaped strategy outputs.

    Args:
        context: The pipeline context dictionary.

    Returns:
        int: 1 for success, 0 for failure (error_message is set).
    """
    url = context.get("url")
    html = context.get("html")
    config = context.get("config")
    logger = context.get("logger")

    # Skip if an earlier middleware already produced a final crawl result.
    if context.get("crawl_result"):
        return 1

    try:
        # Raw-HTML input has no meaningful URL; use a placeholder for logging.
        _url = url if not context.get("is_raw_html", False) else "Raw HTML"
        t1 = time.perf_counter()

        # Get scraping strategy and ensure it has a logger.
        scraping_strategy = config.scraping_strategy
        if not scraping_strategy.logger:
            scraping_strategy.logger = logger

        # Build strategy params from the run config, letting caller kwargs
        # supply only the keys the config does not already define.
        params = config.__dict__.copy()
        params.pop("url", None)
        kwargs = context.get("kwargs", {})
        params.update({k: v for k, v in kwargs.items() if k not in params})

        # Scraping strategy execution.
        result: ScrapingResult = scraping_strategy.scrap(url, html, **params)

        if result is None:
            raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")

        # Extract results — handle both dict and ScrapingResult shapes.
        if isinstance(result, dict):
            cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
            media = result.get("media", {})
            links = result.get("links", {})
            metadata = result.get("metadata", {})
        else:
            cleaned_html = sanitize_input_encode(result.cleaned_html)
            media = result.media.model_dump()
            links = result.links.model_dump()
            metadata = result.metadata

        context["cleaned_html"] = cleaned_html
        context["media"] = media
        context["links"] = links
        context["metadata"] = metadata

        # Log processing completion (timing truncated to ms precision).
        logger.info(
            message="{url:.50}... | Time: {timing}s",
            tag="SCRAPE",
            params={
                "url": _url,
                "timing": int((time.perf_counter() - t1) * 1000) / 1000,
            },
        )

        return 1
    except InvalidCSSSelectorError as e:
        context["error_message"] = str(e)
        return 0
    except Exception as e:
        context["error_message"] = f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}"
        return 0
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_markdown_middleware(context: Dict[str, Any]) -> int:
    """Generate markdown from cleaned HTML.

    Delegates to the run config's markdown generator and stores the
    resulting MarkdownGenerationResult under ``context["markdown_result"]``.

    Returns:
        int: always 1.
    """
    # A pre-built crawl result means markdown generation is unnecessary.
    if context.get("crawl_result"):
        return 1

    generator = context.get("config").markdown_generator

    md: MarkdownGenerationResult = generator.generate_markdown(
        cleaned_html=context.get("cleaned_html"),
        base_url=context.get("url"),
    )

    context["markdown_result"] = md
    return 1
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_structured_content_middleware(context: Dict[str, Any]) -> int:
    """Extract structured content using the configured extraction strategy.

    Chooses the strategy's input (markdown / html / cleaned_html /
    fit_markdown), chunks it, runs the extraction strategy, and stores the
    JSON-encoded result in ``context["extracted_content"]``. Skipped
    entirely when no real extraction strategy is configured.

    Returns:
        int: always 1 — extraction is optional.
    """
    url = context.get("url")
    extracted_content = context.get("extracted_content")
    config = context.get("config")
    markdown_result = context.get("markdown_result")
    cleaned_html = context.get("cleaned_html")
    logger = context.get("logger")

    # Skip if an earlier stage produced a final result, or the cache
    # already supplied extracted content.
    if context.get("crawl_result") or bool(extracted_content):
        return 1

    from crawl4ai.chunking_strategy import IdentityChunking
    from crawl4ai.extraction_strategy import NoExtractionStrategy

    if config.extraction_strategy and not isinstance(config.extraction_strategy, NoExtractionStrategy):
        t1 = time.perf_counter()
        _url = url if not context.get("is_raw_html", False) else "Raw HTML"

        # Choose content based on the strategy's declared input_format.
        content_format = config.extraction_strategy.input_format
        if content_format == "fit_markdown" and not markdown_result.fit_markdown:
            logger.warning(
                message="Fit markdown requested but not available. Falling back to raw markdown.",
                tag="EXTRACT",
                params={"url": _url},
            )
            content_format = "markdown"

        # Unknown formats fall back to raw markdown via .get()'s default.
        content = {
            "markdown": markdown_result.raw_markdown,
            "html": context.get("html"),
            "cleaned_html": cleaned_html,
            "fit_markdown": markdown_result.fit_markdown,
        }.get(content_format, markdown_result.raw_markdown)

        # Use IdentityChunking for HTML input (chunking would break markup),
        # otherwise use the provided chunking strategy.
        chunking = (
            IdentityChunking()
            if content_format in ["html", "cleaned_html"]
            else config.chunking_strategy
        )
        sections = chunking.chunk(content)
        extracted_content = config.extraction_strategy.run(url, sections)
        extracted_content = json.dumps(
            extracted_content, indent=4, default=str, ensure_ascii=False
        )

        context["extracted_content"] = extracted_content

        # Log extraction completion.
        logger.info(
            message="Completed for {url:.50}... | Time: {timing}s",
            tag="EXTRACT",
            params={"url": _url, "timing": time.perf_counter() - t1},
        )

    return 1
|
||||||
|
|
||||||
|
|
||||||
|
async def format_html_middleware(context: Dict[str, Any]) -> int:
    """Format HTML if prettify is enabled.

    Runs ``fast_format_html`` over the cleaned HTML when the run config
    requests pretty output.

    Returns:
        int: always 1.
    """
    # Skip when an earlier middleware already produced the final result.
    if context.get("crawl_result"):
        return 1

    cfg = context.get("config")
    markup = context.get("cleaned_html")

    # NOTE: "prettiify" mirrors the config attribute's existing spelling.
    if cfg.prettiify and markup:
        context["cleaned_html"] = fast_format_html(markup)

    return 1
|
||||||
|
|
||||||
|
|
||||||
|
async def write_cache_middleware(context: Dict[str, Any]) -> int:
    """Write result to cache if appropriate.

    Currently a placeholder stage: the actual caching happens in
    build_result_middleware so the CrawlResult is only constructed once.

    Returns:
        int: always 1.
    """
    cache_ctx = context.get("cache_context")
    served_from_cache = bool(context.get("cached_result"))

    # Nothing to do when a result already exists, writes are disabled,
    # or the page itself came from the cache.
    if context.get("crawl_result") or not cache_ctx.should_write() or served_from_cache:
        return 1

    # The CrawlResult is created (and cached) in build_result_middleware
    # to avoid creating it twice.
    return 1
|
||||||
|
|
||||||
|
|
||||||
|
async def build_result_middleware(context: Dict[str, Any]) -> int:
    """Build the final CrawlResult object.

    Three paths, in priority order:
      1. A crawl_result already placed in the context (e.g. by the robots.txt
         check) is wrapped and returned as-is.
      2. A valid cached result is refreshed (success/session_id/redirect) and
         wrapped.
      3. Otherwise a fresh CrawlResult is assembled from the artifacts that
         earlier middleware published, optionally written to the cache.

    Always returns 1: even an internal failure is converted into a failure
    CrawlResult rather than aborting the pipeline.
    """
    url = context.get("url")
    html = context.get("html", "")
    cache_context = context.get("cache_context")
    cached_result = context.get("cached_result")
    config = context.get("config")
    logger = context.get("logger")

    # Path 1: a result was already produced by an earlier middleware
    # (e.g. the robots.txt 403 short-circuit).
    if context.get("crawl_result"):
        result = context["crawl_result"]
        context["final_result"] = CrawlResultContainer(result)
        return 1

    # Path 2: serve the cached result.
    if cached_result and html:
        logger.success(
            message="{url:.50}... | Status: {status} | Total: {timing}",
            tag="COMPLETE",
            params={
                "url": cache_context.display_url,
                "status": True,
                "timing": f"{time.perf_counter() - context['start_time']:.2f}s",
            },
            colors={"status": "green", "timing": "yellow"},
        )

        # Refresh per-run fields on the cached object before handing it out.
        cached_result.success = bool(html)
        cached_result.session_id = getattr(config, "session_id", None)
        cached_result.redirected_url = cached_result.redirected_url or url
        context["final_result"] = CrawlResultContainer(cached_result)
        return 1

    # Path 3: build a new result from the pipeline artifacts.
    try:
        # Gather everything earlier middleware published into the context.
        cleaned_html = context.get("cleaned_html", "")
        markdown_result = context.get("markdown_result")
        media = context.get("media", {})
        links = context.get("links", {})
        metadata = context.get("metadata", {})
        screenshot_data = context.get("screenshot_data")
        pdf_data = context.get("pdf_data")
        extracted_content = context.get("extracted_content")
        async_response = context.get("async_response")

        # Create the CrawlResult.
        crawl_result = CrawlResult(
            url=url,
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown_result,
            media=media,
            links=links,
            metadata=metadata,
            screenshot=screenshot_data,
            pdf=pdf_data,
            extracted_content=extracted_content,
            success=bool(html),
            error_message="",
        )

        # Copy transport-level details from the fetch response when present.
        if async_response:
            crawl_result.status_code = async_response.status_code
            crawl_result.redirected_url = async_response.redirected_url or url
            crawl_result.response_headers = async_response.response_headers
            crawl_result.downloaded_files = async_response.downloaded_files
            crawl_result.js_execution_result = context.get("js_execution_result")
            crawl_result.ssl_certificate = async_response.ssl_certificate

        crawl_result.session_id = getattr(config, "session_id", None)

        # Log completion.
        logger.success(
            message="{url:.50}... | Status: {status} | Total: {timing}",
            tag="COMPLETE",
            params={
                "url": cache_context.display_url,
                "status": crawl_result.success,
                "timing": f"{time.perf_counter() - context['start_time']:.2f}s",
            },
            colors={
                "status": "green" if crawl_result.success else "red",
                "timing": "yellow",
            },
        )

        # Update the cache only for freshly crawled pages.
        if cache_context.should_write() and not bool(cached_result):
            await async_db_manager.acache_url(crawl_result)

        context["final_result"] = CrawlResultContainer(crawl_result)
        return 1
    except Exception as e:
        # Build a rich diagnostic message, log it, and emit a failure result
        # instead of propagating the exception out of the pipeline.
        error_context = get_error_context(sys.exc_info())

        error_message = (
            f"Unexpected error in build_result at line {error_context['line_no']} "
            f"in {error_context['function']} ({error_context['filename']}):\n"
            f"Error: {str(e)}\n\n"
            f"Code context:\n{error_context['code_context']}"
        )

        logger.error_status(
            url=url,
            error=create_box_message(error_message, type="error"),
            tag="ERROR",
        )

        context["final_result"] = CrawlResultContainer(
            CrawlResult(
                url=url, html="", success=False, error_message=error_message
            )
        )
        return 1
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_error_middleware(context: Dict[str, Any]) -> Dict[str, Any]:
    """Error handler middleware.

    Logs the recorded error (when a logger exists) and installs a failure
    CrawlResult so the pipeline always hands back a result container.

    Returns:
        Dict[str, Any]: the updated context.
    """
    failed_url = context.get("url", "")
    message = context.get("error_message", "Unknown error")
    log = context.get("logger")

    # Surface the failure in the logs when a logger is available.
    if log:
        log.error_status(
            url=failed_url,
            error=create_box_message(message, type="error"),
            tag="ERROR",
        )

    # Hand back a well-formed failure result so callers always get a container.
    failure = CrawlResult(
        url=failed_url, html="", success=False, error_message=message
    )
    context["final_result"] = CrawlResultContainer(failure)

    return context
|
||||||
|
|
||||||
|
|
||||||
|
# Custom middlewares as requested
|
||||||
|
|
||||||
|
async def sentiment_analysis_middleware(context: Dict[str, Any]) -> int:
    """Analyze sentiment of generated markdown using TextBlob.

    Best-effort enrichment: stores polarity/subjectivity/classification
    under ``context["sentiment_analysis"]``; analysis errors are recorded
    but never fail the pipeline.

    Returns:
        int: always 1.
    """
    from textblob import TextBlob

    md = context.get("markdown_result")

    # Nothing to analyze without markdown, or after a pipeline failure.
    if not md or not context.get("success", True):
        return 1

    try:
        scores = TextBlob(md.raw_markdown).sentiment

        if scores.polarity > 0.1:
            label = "positive"
        elif scores.polarity < -0.1:
            label = "negative"
        else:
            label = "neutral"

        context["sentiment_analysis"] = {
            "polarity": scores.polarity,  # -1.0 to 1.0 (negative to positive)
            "subjectivity": scores.subjectivity,  # 0.0 to 1.0 (objective to subjective)
            "classification": label,
        }
        return 1
    except Exception as e:
        # Sentiment analysis is best-effort; record the error and move on.
        context["sentiment_analysis_error"] = str(e)
        return 1
|
||||||
|
|
||||||
|
|
||||||
|
async def log_timing_middleware(context: Dict[str, Any], name: str) -> int:
    """Log timing information for a specific point in the pipeline.

    Records a perf_counter mark under ``_timing_mark_<name>``; when a
    matching ``_timing_start_<name>`` exists, also records and (if a
    logger is present) logs the elapsed duration.

    Returns:
        int: always 1.
    """
    mark_key = f"_timing_mark_{name}"
    context[mark_key] = time.perf_counter()

    # A duration can only be reported when a matching start mark exists.
    start_key = f"_timing_start_{name}"
    if start_key in context:
        elapsed = context[mark_key] - context[start_key]
        context[f"_timing_duration_{name}"] = elapsed

        log = context.get("logger")
        if log:
            log.info(
                message="{name} completed in {duration:.2f}s",
                tag="TIMING",
                params={"name": name, "duration": elapsed},
            )

    return 1
|
||||||
|
|
||||||
|
|
||||||
|
async def validate_url_middleware(context: Dict[str, Any], patterns: List[str]) -> int:
    """Validate URL against glob patterns.

    Args:
        context: The pipeline context dictionary.
        patterns: Allowed URL glob patterns; an empty list allows everything.

    Returns:
        int: 1 when the URL is allowed, 0 otherwise (error_message is set).
    """
    import fnmatch

    candidate = context.get("url", "")

    # An empty pattern list means every URL is acceptable.
    if not patterns:
        return 1

    # Accept the URL as soon as any allowed pattern matches it.
    if any(fnmatch.fnmatch(candidate, glob) for glob in patterns):
        return 1

    context["error_message"] = f"URL '{candidate}' does not match any allowed patterns"
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Update the default middleware list function
|
||||||
|
def create_default_middleware_list():
    """Return the default list of middleware functions for the pipeline.

    Order matters: the cache is consulted before any browser/network work,
    the browser hub is prepared before fetching, and the final result is
    assembled last.

    NOTE(review): initialize_context_middleware and browser_hub_middleware
    are defined earlier in this module (outside this view) — confirm their
    names if refactoring.
    """
    return [
        initialize_context_middleware,
        check_cache_middleware,
        browser_hub_middleware,  # Browser hub must be ready before fetch_content
        configure_proxy_middleware,
        check_robots_txt_middleware,
        fetch_content_middleware,
        scrape_content_middleware,
        generate_markdown_middleware,
        extract_structured_content_middleware,
        format_html_middleware,
        build_result_middleware
    ]
|
||||||
281
crawl4ai/pipeline/pipeline.py
Normal file
281
crawl4ai/pipeline/pipeline.py
Normal file
@@ -0,0 +1,281 @@
|
|||||||
|
|
||||||
|
import time
|
||||||
|
from typing import Callable, Dict, List, Any, Optional, Awaitable
|
||||||
|
|
||||||
|
from middlewares import create_default_middleware_list, handle_error_middleware
|
||||||
|
from crawl4ai.models import CrawlResultContainer
|
||||||
|
from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
|
||||||
|
|
||||||
|
class Pipeline:
    """
    A pipeline processor that executes a series of async middleware functions.

    Each middleware function receives a context dictionary, updates it,
    and returns 1 for success or 0 for failure. On failure (or exception)
    the configured error handler produces the final context.
    """

    def __init__(
        self,
        middleware: List[Callable[[Dict[str, Any]], Awaitable[int]]] = None,
        error_handler: Optional[Callable[[Dict[str, Any]], Awaitable[Dict[str, Any]]]] = None,
        after_middleware_callback: Optional[Callable[[str, Dict[str, Any]], Awaitable[None]]] = None,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        browser_config: Optional[BrowserConfig] = None,
        logger: Optional[AsyncLogger] = None,
        _initial_context: Optional[Dict[str, Any]] = None
    ):
        """
        Args:
            middleware: Ordered middleware functions; defaults to the standard list.
            error_handler: Async handler invoked when a middleware fails or raises.
            after_middleware_callback: Optional hook called after each middleware runs.
            crawler_strategy: Pre-built crawler strategy; one is created if omitted.
            browser_config: Browser configuration; a default is created if omitted.
            logger: Logger instance; a default is created if omitted.
            _initial_context: Base context merged into every process() call
                (may also be assigned after construction, e.g. by create_pipeline).
        """
        self.middleware = middleware or create_default_middleware_list()
        self.error_handler = error_handler or handle_error_middleware
        self.after_middleware_callback = after_middleware_callback
        self.browser_config = browser_config or BrowserConfig()
        self.logger = logger or AsyncLogger(verbose=self.browser_config.verbose)
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
            browser_config=self.browser_config,
            logger=self.logger
        )
        self._initial_context = _initial_context
        self._strategy_initialized = False

    async def _initialize_strategy__(self):
        """Initialize the crawler strategy if not already initialized.

        NOTE(review): legacy implementation kept for reference; superseded
        by _initialize_strategy below — confirm before removing.
        """
        if not self.crawler_strategy:
            self.crawler_strategy = AsyncPlaywrightCrawlerStrategy(
                browser_config=self.browser_config,
                logger=self.logger
            )

        if not self._strategy_initialized:
            await self.crawler_strategy.__aenter__()
            self._strategy_initialized = True

    async def _initialize_strategy(self):
        """Initialize the crawler strategy if not already initialized.

        With the browser-hub approach the crawler strategy is created
        on-demand in fetch_content_middleware, so nothing is built here;
        this only marks the pipeline as initialized.
        """
        # Guard against _initial_context being None (Pipeline constructed
        # directly rather than via create_pipeline, which assigns a dict).
        initial = getattr(self, "_initial_context", None) or {}
        if "browser_hub" not in initial:
            # If a browser_config was provided but no browser_hub yet,
            # the browser_hub_middleware will create one on first use.
            pass

        # Mark as initialized to prevent repeated initialization attempts.
        self._strategy_initialized = True

    async def start(self):
        """Start the crawler strategy and prepare it for use.

        Raises:
            ValueError: if no crawler strategy is available to start.
        """
        if not self._strategy_initialized:
            await self._initialize_strategy()
            self._strategy_initialized = True
        if self.crawler_strategy:
            await self.crawler_strategy.__aenter__()
            self._strategy_initialized = True
        else:
            raise ValueError("Crawler strategy is not initialized.")

    async def close(self):
        """Close the crawler strategy and clean up resources."""
        await self.stop()

    async def stop(self):
        """Close the crawler strategy and clean up resources."""
        if self._strategy_initialized and self.crawler_strategy:
            await self.crawler_strategy.__aexit__(None, None, None)
            self._strategy_initialized = False

    async def __aenter__(self):
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def crawl(self, url: str, config: Optional[CrawlerRunConfig] = None, **kwargs) -> CrawlResultContainer:
        """
        Crawl a URL and process it through the pipeline.

        Args:
            url: The URL to crawl
            config: Optional configuration for the crawl
            **kwargs: Additional arguments to pass to the middleware

        Returns:
            CrawlResultContainer: The result of the crawl
        """
        # Initialize strategy if needed
        await self._initialize_strategy()

        # Create the per-crawl context; middleware read from and write to it.
        context = {
            "url": url,
            "config": config or CrawlerRunConfig(),
            "browser_config": self.browser_config,
            "logger": self.logger,
            "crawler_strategy": self.crawler_strategy,
            "kwargs": kwargs
        }

        # Process the pipeline
        result_context = await self.process(context)

        # Return the final result
        return result_context.get("final_result")

    async def process(self, initial_context: Dict[str, Any] = None) -> Dict[str, Any]:
        """
        Process all middleware functions with the given context.

        Args:
            initial_context: Initial context dictionary, defaults to empty dict

        Returns:
            Updated context dictionary after all middleware have been processed
        """
        # Start from the pipeline-level base context. The "or {}" guards
        # against _initial_context being None (Pipeline constructed directly),
        # which previously raised a TypeError here.
        context = {**(self._initial_context or {})}
        if initial_context:
            context.update(initial_context)

        # Record pipeline start time
        context["_pipeline_start_time"] = time.perf_counter()

        for middleware_fn in self.middleware:
            # Get middleware name for logging
            middleware_name = getattr(middleware_fn, '__name__', str(middleware_fn))

            # Record start time for this middleware
            start_time = time.perf_counter()
            context[f"_timing_start_{middleware_name}"] = start_time

            try:
                # Execute middleware (all middleware functions are async)
                result = await middleware_fn(context)

                # Record completion time
                end_time = time.perf_counter()
                context[f"_timing_end_{middleware_name}"] = end_time
                context[f"_timing_duration_{middleware_name}"] = end_time - start_time

                # Execute after-middleware callback if provided
                if self.after_middleware_callback:
                    await self.after_middleware_callback(middleware_name, context)

                # Convert boolean returns to int (True->1, False->0)
                if isinstance(result, bool):
                    result = 1 if result else 0

                # Handle failure: delegate to the error handler when present,
                # otherwise record the failure and stop the pipeline.
                if result == 0:
                    if self.error_handler:
                        context["_error_in"] = middleware_name
                        context["_error_at"] = time.perf_counter()
                        return await self._handle_error(context)
                    else:
                        context["success"] = False
                        context["error_message"] = f"Pipeline failed at {middleware_name}"
                        break
            except Exception as e:
                # Record error information
                context["_error_in"] = middleware_name
                context["_error_at"] = time.perf_counter()
                context["_exception"] = e
                context["success"] = False
                context["error_message"] = f"Exception in {middleware_name}: {str(e)}"

                # Call error handler if available
                if self.error_handler:
                    return await self._handle_error(context)
                break

        # Record pipeline completion time
        pipeline_end_time = time.perf_counter()
        context["_pipeline_end_time"] = pipeline_end_time
        context["_pipeline_duration"] = pipeline_end_time - context["_pipeline_start_time"]

        # Set success to True if not already set (no failures)
        if "success" not in context:
            context["success"] = True

        return context

    async def _handle_error(self, context: Dict[str, Any]) -> Dict[str, Any]:
        """Handle errors by calling the error handler."""
        try:
            return await self.error_handler(context)
        except Exception as e:
            # If the error handler itself fails, record that new error and
            # return the context rather than raising out of the pipeline.
            context["_error_handler_exception"] = e
            context["error_message"] = f"Error handler failed: {str(e)}"
            return context
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
async def create_pipeline(
    middleware_list=None,
    error_handler=None,
    after_middleware_callback=None,
    browser_config=None,
    browser_hub_id=None,
    browser_hub_connection=None,
    browser_hub=None,
    logger=None
) -> Pipeline:
    """
    Factory for a Pipeline wired up with Browser-Hub integration.

    Args:
        middleware_list: Middleware functions to run, in order. When falsy,
            the stock chain from ``create_default_middleware_list()`` is used.
        error_handler: Middleware invoked when a stage fails.
        after_middleware_callback: Hook called after each middleware runs.
        browser_config: Browser configuration for the crawl.
        browser_hub_id: ID of a browser hub instance to create/reuse.
        browser_hub_connection: Connection string to an existing browser hub.
        browser_hub: An already-constructed browser hub instance.
        logger: Logger instance shared by the pipeline and its context.

    Returns:
        Pipeline: Configured pipeline instance.
    """
    pipeline = Pipeline(
        # Fall back to the default middleware chain when none is supplied.
        middleware=middleware_list or create_default_middleware_list(),
        error_handler=error_handler,
        after_middleware_callback=after_middleware_callback,
        logger=logger
    )

    # Seed the context every run starts from with the browser wiring; the
    # browser-related middleware resolves these into a live hub on first use.
    pipeline._initial_context = {
        "browser_config": browser_config,
        "browser_hub_id": browser_hub_id,
        "browser_hub_connection": browser_hub_connection,
        "browser_hub": browser_hub,
        "logger": logger
    }

    return pipeline
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# async def create_pipeline(
|
||||||
|
# middleware_list: Optional[List[Callable[[Dict[str, Any]], Awaitable[int]]]] = None,
|
||||||
|
# error_handler: Optional[Callable[[Dict[str, Any]], Awaitable[Dict[str, Any]]]] = None,
|
||||||
|
# after_middleware_callback: Optional[Callable[[str, Dict[str, Any]], Awaitable[None]]] = None,
|
||||||
|
# crawler_strategy = None,
|
||||||
|
# browser_config = None,
|
||||||
|
# logger = None
|
||||||
|
# ) -> Pipeline:
|
||||||
|
# """Factory function to create a pipeline with the given middleware"""
|
||||||
|
# return Pipeline(
|
||||||
|
# middleware=middleware_list,
|
||||||
|
# error_handler=error_handler,
|
||||||
|
# after_middleware_callback=after_middleware_callback,
|
||||||
|
# crawler_strategy=crawler_strategy,
|
||||||
|
# browser_config=browser_config,
|
||||||
|
# logger=logger
|
||||||
|
# )
|
||||||
109
crawl4ai/pipeline/test_pipeline.py
Normal file
109
crawl4ai/pipeline/test_pipeline.py
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
import asyncio
|
||||||
|
from crawl4ai import (
|
||||||
|
BrowserConfig,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
CacheMode,
|
||||||
|
DefaultMarkdownGenerator,
|
||||||
|
PruningContentFilter
|
||||||
|
)
|
||||||
|
from pipeline import Pipeline
|
||||||
|
|
||||||
|
async def main():
    """Crawl a single page through the Pipeline context manager and print it."""
    # Browser + run configuration: bypass the cache and prune boilerplate
    # content before markdown generation.
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            )
        ),
    )

    # The async context manager owns browser startup/teardown.
    async with Pipeline(browser_config=browser_config) as pipeline:
        result = await pipeline.crawl(
            url="https://www.example.com",
            config=crawler_config
        )

        print(f"URL: {result.url}")
        print(f"Success: {result.success}")

        if not result.success:
            print(f"Error: {result.error_message}")
        else:
            print("\nMarkdown excerpt:")
            print(result.markdown.raw_markdown[:500] + "...")


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlTarget:
    """One unit of batch-crawl work: URL(s) plus an optional run config."""

    def __init__(self, urls, config=None):
        # ``urls`` may be a single URL string or a list of URLs; ``config``
        # is an optional CrawlerRunConfig applied to every URL in the target.
        self.urls = urls
        self.config = config

    def __repr__(self):
        return "CrawlTarget(urls={}, config={})".format(self.urls, self.config)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# async def main():
|
||||||
|
# # Create configuration objects
|
||||||
|
# browser_config = BrowserConfig(headless=True, verbose=True)
|
||||||
|
|
||||||
|
# # Define different configurations
|
||||||
|
# config1 = CrawlerRunConfig(
|
||||||
|
# cache_mode=CacheMode.BYPASS,
|
||||||
|
# markdown_generator=DefaultMarkdownGenerator(
|
||||||
|
# content_filter=PruningContentFilter(threshold=0.48)
|
||||||
|
# ),
|
||||||
|
# )
|
||||||
|
|
||||||
|
# config2 = CrawlerRunConfig(
|
||||||
|
# cache_mode=CacheMode.ENABLED,
|
||||||
|
# screenshot=True,
|
||||||
|
# pdf=True
|
||||||
|
# )
|
||||||
|
|
||||||
|
# # Create crawl targets
|
||||||
|
# targets = [
|
||||||
|
# CrawlTarget(
|
||||||
|
# urls=["https://www.example.com", "https://www.wikipedia.org"],
|
||||||
|
# config=config1
|
||||||
|
# ),
|
||||||
|
# CrawlTarget(
|
||||||
|
# urls="https://news.ycombinator.com",
|
||||||
|
# config=config2
|
||||||
|
# ),
|
||||||
|
# CrawlTarget(
|
||||||
|
# urls=["https://github.com", "https://stackoverflow.com", "https://python.org"],
|
||||||
|
# config=None
|
||||||
|
# )
|
||||||
|
# ]
|
||||||
|
|
||||||
|
# # Create and use pipeline with context manager
|
||||||
|
# async with Pipeline(browser_config=browser_config) as pipeline:
|
||||||
|
# all_results = await pipeline.crawl_batch(targets)
|
||||||
|
|
||||||
|
# for target_key, results in all_results.items():
|
||||||
|
# print(f"\n===== Results for {target_key} =====")
|
||||||
|
# print(f"Number of URLs crawled: {len(results)}")
|
||||||
|
|
||||||
|
# for i, result in enumerate(results):
|
||||||
|
# print(f"\nURL {i+1}: {result.url}")
|
||||||
|
# print(f"Success: {result.success}")
|
||||||
|
|
||||||
|
# if result.success:
|
||||||
|
# print(f"Content length: {len(result.markdown.raw_markdown)} chars")
|
||||||
|
# else:
|
||||||
|
# print(f"Error: {result.error_message}")
|
||||||
|
|
||||||
|
# if __name__ == "__main__":
|
||||||
|
# asyncio.run(main())
|
||||||
222
tests/pipeline/demo_browser_hub_pipeline.py
Normal file
222
tests/pipeline/demo_browser_hub_pipeline.py
Normal file
@@ -0,0 +1,222 @@
|
|||||||
|
# demo_browser_hub.py
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from crawl4ai.browser.browser_hub import BrowserHub
|
||||||
|
from pipeline import create_pipeline
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
from crawl4ai.models import CrawlResultContainer
|
||||||
|
from crawl4ai.cache_context import CacheMode
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
from crawl4ai import PruningContentFilter
|
||||||
|
|
||||||
|
async def create_prewarmed_browser_hub(urls_to_crawl: List[str]):
    """Create a pre-warmed browser hub (currently 1 browser with 1 page).

    NOTE: the previous docstring claimed 10 browsers with 5 pages each,
    but ``num_browsers``/``pages_per_browser`` below are both 1 — scale
    those two constants up to re-enable a larger pool.

    Args:
        urls_to_crawl: Kept for interface compatibility with callers; the
            hub itself is URL-agnostic and this argument is not used here.

    Returns:
        The initialized BrowserHub instance.
    """
    logger = AsyncLogger(verbose=True)
    logger.info("Setting up pre-warmed browser hub", tag="DEMO")

    # Shared browser configuration for every pooled browser.
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,  # Set to False to see the browsers in action
        viewport_width=1280,
        viewport_height=800,
        light_mode=True,  # Optimize for performance
        java_script_enabled=True
    )

    # Pre-warm pages under one or more crawler configs (e.g. different user
    # agents). Append more CrawlerRunConfig entries here to diversify the
    # ready-made pages.
    crawler_configs = [
        CrawlerRunConfig(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            wait_until="networkidle"
        ),
    ]

    # Pool sizing.
    num_browsers = 1
    pages_per_browser = 1

    # Distribute the total page budget evenly across the crawler configs,
    # giving any integer-division remainder to the last config so the sum
    # always equals num_browsers * pages_per_browser.
    page_configs = []
    total_pages = num_browsers * pages_per_browser
    pages_per_config = total_pages // len(crawler_configs)

    for i, config in enumerate(crawler_configs):
        if i == len(crawler_configs) - 1:
            remaining = total_pages - (pages_per_config * (len(crawler_configs) - 1))
            page_configs.append((browser_config, config, remaining))
        else:
            page_configs.append((browser_config, config, pages_per_config))

    # Create the hub and time its startup.
    start_time = asyncio.get_event_loop().time()
    logger.info("Initializing browser hub with pre-warmed pages...", tag="DEMO")

    hub = await BrowserHub.get_browser_manager(
        config=browser_config,
        hub_id="demo_hub",
        logger=logger,
        max_browsers_per_config=num_browsers,
        max_pages_per_browser=pages_per_browser,
        initial_pool_size=num_browsers,
        page_configs=page_configs
    )

    end_time = asyncio.get_event_loop().time()
    logger.success(
        message="Browser hub initialized with {total_pages} pre-warmed pages in {duration:.2f} seconds",
        tag="DEMO",
        params={
            "total_pages": total_pages,
            "duration": end_time - start_time
        }
    )

    # Report the initial pool state for visibility.
    status = await hub.get_pool_status()
    logger.info(
        message="Browser pool status: {status}",
        tag="DEMO",
        params={"status": status}
    )

    return hub
|
||||||
|
|
||||||
|
async def crawl_urls_with_hub(hub, urls: List[str]) -> List[CrawlResultContainer]:
    """Crawl a list of URLs in parallel using a pre-warmed browser hub.

    Args:
        hub: An initialized BrowserHub shared by all crawl tasks.
        urls: URLs to fetch; one task is spawned per URL.

    Returns:
        The crawl results in the same order as ``urls``
        (``asyncio.gather`` preserves input order).
    """
    logger = AsyncLogger(verbose=True)

    # One shared run config: bypass cache, prune boilerplate, screenshot.
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            )
        ),
        wait_until="networkidle",
        screenshot=True
    )

    # Pipeline draws its browsers/pages from the supplied hub.
    pipeline = await create_pipeline(
        browser_hub=hub,
        logger=logger
    )

    async def crawl_url(url):
        # One task per URL; logging brackets each crawl for traceability.
        logger.info(f"Crawling {url}...", tag="CRAWL")
        result = await pipeline.crawl(url=url, config=crawler_config)
        logger.success(f"Completed crawl of {url}", tag="CRAWL")
        return result

    # Fan out all URLs and gather their results in parallel.
    # (The dead ``results = []`` pre-initialization was removed: gather
    # produces the list directly.)
    tasks = [crawl_url(url) for url in urls]
    results = await asyncio.gather(*tasks)

    return results
|
||||||
|
|
||||||
|
async def main():
    """Main demo function."""
    # URLs for the demo; extend this list to stress the pool harder.
    urls_to_crawl = [
        "https://example.com",
    ]

    logger = AsyncLogger(verbose=True)
    logger.info("Starting browser hub demo", tag="DEMO")

    try:
        # Stand up the pre-warmed hub, then fan the URLs out through it.
        hub = await create_prewarmed_browser_hub(urls_to_crawl)

        logger.info("Crawling URLs in parallel...", tag="DEMO")
        crawl_started = asyncio.get_event_loop().time()
        results = await crawl_urls_with_hub(hub, urls_to_crawl)
        crawl_finished = asyncio.get_event_loop().time()

        elapsed = crawl_finished - crawl_started
        logger.success(
            message="Crawled {count} URLs in {duration:.2f} seconds (average: {avg:.2f} seconds per URL)",
            tag="DEMO",
            params={
                "count": len(results),
                "duration": elapsed,
                "avg": elapsed / len(results)
            }
        )

        # Per-URL summary lines.
        logger.info("Crawl results summary:", tag="DEMO")
        for idx, crawl_result in enumerate(results):
            logger.info(
                message="{idx}. {url}: Success={success}, Content length={length}",
                tag="RESULT",
                params={
                    "idx": idx + 1,
                    "url": crawl_result.url,
                    "success": crawl_result.success,
                    "length": len(crawl_result.html) if crawl_result.html else 0
                }
            )

            if crawl_result.success and crawl_result.markdown and crawl_result.markdown.raw_markdown:
                # Show just the head of the generated markdown.
                snippet = crawl_result.markdown.raw_markdown[:150] + "..."
                logger.info(
                    message="   Markdown: {snippet}",
                    tag="RESULT",
                    params={"snippet": snippet}
                )

        # Final pool state after all crawls completed.
        status = await hub.get_pool_status()
        logger.info(
            message="Final browser pool status: {status}",
            tag="DEMO",
            params={"status": status}
        )

    finally:
        # Always tear the hub down, even when a crawl failed.
        logger.info("Shutting down browser hub...", tag="DEMO")
        await BrowserHub.shutdown_all()
        logger.success("Demo completed", tag="DEMO")


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
505
tests/pipeline/extended_browser_hub_tests.py
Normal file
505
tests/pipeline/extended_browser_hub_tests.py
Normal file
@@ -0,0 +1,505 @@
|
|||||||
|
# extended_browser_hub_tests.py
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
from crawl4ai.browser.browser_hub import BrowserHub
|
||||||
|
from pipeline import create_pipeline
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
from crawl4ai.cache_context import CacheMode
|
||||||
|
|
||||||
|
# Common test URLs
|
||||||
|
TEST_URLS = [
|
||||||
|
"https://example.com",
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://httpbin.org/html",
|
||||||
|
"https://httpbin.org/headers",
|
||||||
|
"https://httpbin.org/ip",
|
||||||
|
"https://httpstat.us/200"
|
||||||
|
]
|
||||||
|
|
||||||
|
class TestResults:
    """Simple container collecting the outcome of one test scenario.

    Attributes are plain fields:
      - name: human-readable scenario name
      - results: crawl results gathered during the run; each must expose
        a boolean-ish ``success`` attribute
      - start_time / end_time: event-loop timestamps bracketing the run,
        or None while not yet recorded
      - errors: exceptions captured while the scenario executed
    """

    def __init__(self, name: str):
        self.name = name
        self.results = []
        self.start_time = None
        self.end_time = None
        self.errors = []

    @property
    def duration(self) -> float:
        """Elapsed seconds between start and end, or 0 if either is unset.

        Uses explicit ``is not None`` checks so a legitimate 0.0 start
        timestamp is not mistaken for "not recorded" (the previous
        truthiness test returned 0 in that case).
        """
        if self.start_time is not None and self.end_time is not None:
            return self.end_time - self.start_time
        return 0

    @property
    def success_rate(self) -> float:
        """Percentage (0-100) of results whose ``success`` is truthy; 0 when empty."""
        if not self.results:
            return 0
        return sum(1 for r in self.results if r.success) / len(self.results) * 100

    def log_summary(self, logger: "AsyncLogger"):
        """Emit a one-shot summary (duration, success rate, errors) to *logger*.

        The annotation is a string forward reference so this class does not
        require AsyncLogger to be importable at class-definition time.
        """
        logger.info(f"=== Test: {self.name} ===", tag="SUMMARY")
        logger.info(
            message="Duration: {duration:.2f}s, Success rate: {success_rate:.1f}%, Results: {count}",
            tag="SUMMARY",
            params={
                "duration": self.duration,
                "success_rate": self.success_rate,
                "count": len(self.results)
            }
        )

        if self.errors:
            logger.error(
                message="Errors ({count}): {errors}",
                tag="SUMMARY",
                params={
                    "count": len(self.errors),
                    "errors": "; ".join(str(e) for e in self.errors)
                }
            )
|
||||||
|
|
||||||
|
# ======== TEST SCENARIO 1: Simple default configuration ========
|
||||||
|
async def test_default_configuration():
    """
    Test Scenario 1: Simple default configuration

    This tests the basic case where the user does not provide any specific
    browser configuration, relying on default auto-setup.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Default Configuration")

    try:
        # No browser config at all — the pipeline must self-configure.
        pipeline = await create_pipeline(logger=logger)

        results.start_time = asyncio.get_event_loop().time()

        # Minimal run config: skip the cache, don't wait for full idle.
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="domcontentloaded"
        )

        # Crawl every test URL one after the other, recording per-URL errors
        # without aborting the whole scenario.
        for url in TEST_URLS:
            try:
                logger.info(f"Crawling {url} with default configuration", tag="TEST")
                crawl_result = await pipeline.crawl(url=url, config=crawler_config)
                results.results.append(crawl_result)

                logger.success(
                    message="Result: url={url}, success={success}, content_length={length}",
                    tag="TEST",
                    params={
                        "url": url,
                        "success": crawl_result.success,
                        "length": len(crawl_result.html) if crawl_result.html else 0
                    }
                )
            except Exception as exc:
                logger.error(f"Error crawling {url}: {str(exc)}", tag="TEST")
                results.errors.append(exc)

        results.end_time = asyncio.get_event_loop().time()

    except Exception as exc:
        # Scenario-level failure (e.g. pipeline creation itself blew up).
        logger.error(f"Test failed with error: {str(exc)}", tag="TEST")
        results.errors.append(exc)

    results.log_summary(logger)

    return results
|
||||||
|
|
||||||
|
# ======== TEST SCENARIO 2: Detailed custom configuration ========
|
||||||
|
async def test_custom_configuration():
    """
    Test Scenario 2: Detailed custom configuration

    This tests the case where the user provides detailed browser configuration
    to customize the browser behavior.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Custom Configuration")

    try:
        # Heavily customized browser: fixed viewport, pinned UA, lenient TLS.
        browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            viewport_width=1920,
            viewport_height=1080,
            user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
            light_mode=True,
            ignore_https_errors=True,
            extra_args=["--disable-extensions"]
        )

        # Run config exercising many optional features at once.
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle",
            page_timeout=30000,
            screenshot=True,
            pdf=False,
            screenshot_wait_for=0.5,
            wait_for_images=True,
            scan_full_page=True,
            scroll_delay=0.2,
            process_iframes=True,
            remove_overlay_elements=True
        )

        pipeline = await create_pipeline(
            browser_config=browser_config,
            logger=logger
        )

        results.start_time = asyncio.get_event_loop().time()

        # Sequentially crawl each URL, collecting per-URL failures instead of
        # aborting the whole scenario.
        for url in TEST_URLS:
            try:
                logger.info(f"Crawling {url} with custom configuration", tag="TEST")
                crawl_result = await pipeline.crawl(url=url, config=crawler_config)
                results.results.append(crawl_result)

                screenshot_taken = crawl_result.screenshot is not None

                logger.success(
                    message="Result: url={url}, success={success}, screenshot={screenshot}, content_length={length}",
                    tag="TEST",
                    params={
                        "url": url,
                        "success": crawl_result.success,
                        "screenshot": screenshot_taken,
                        "length": len(crawl_result.html) if crawl_result.html else 0
                    }
                )
            except Exception as exc:
                logger.error(f"Error crawling {url}: {str(exc)}", tag="TEST")
                results.errors.append(exc)

        results.end_time = asyncio.get_event_loop().time()

        # Peek at the browser hub the pipeline ended up using: run a throwaway
        # about:blank crawl just to obtain a context carrying the hub.
        try:
            context = await pipeline.process({"url": "about:blank", "config": crawler_config})
            browser_hub = context.get("browser_hub")
            if browser_hub:
                status = await browser_hub.get_pool_status()
                logger.info(
                    message="Browser hub status: {status}",
                    tag="TEST",
                    params={"status": status}
                )
        except Exception as exc:
            logger.error(f"Failed to get browser hub status: {str(exc)}", tag="TEST")

    except Exception as exc:
        logger.error(f"Test failed with error: {str(exc)}", tag="TEST")
        results.errors.append(exc)

    results.log_summary(logger)

    return results
|
||||||
|
|
||||||
|
# ======== TEST SCENARIO 3: Using pre-initialized browser hub ========
|
||||||
|
async def test_preinitalized_browser_hub():
    """
    Test Scenario 3: Using pre-initialized browser hub

    This tests the case where a browser hub is initialized separately
    and then passed to the pipeline.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Pre-initialized Browser Hub")

    # Keep a handle outside the try so we can return it even on failure.
    browser_hub = None
    try:
        logger.info("Initializing browser hub separately", tag="TEST")

        browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            verbose=True
        )

        # Build the hub ourselves instead of letting the pipeline do it.
        browser_hub = await BrowserHub.get_browser_manager(
            config=browser_config,
            hub_id="test_preinitalized",
            logger=logger,
            max_browsers_per_config=2,
            max_pages_per_browser=3,
            initial_pool_size=2
        )

        status = await browser_hub.get_pool_status()
        logger.info(
            message="Initial browser hub status: {status}",
            tag="TEST",
            params={"status": status}
        )

        # Hand the ready-made hub to the pipeline.
        pipeline = await create_pipeline(
            browser_hub=browser_hub,
            logger=logger
        )

        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle",
            screenshot=True
        )

        results.start_time = asyncio.get_event_loop().time()

        async def crawl_url(url):
            # Per-URL task: failures are recorded and yield None so gather
            # never raises.
            try:
                logger.info(f"Crawling {url} with pre-initialized hub", tag="TEST")
                crawl_result = await pipeline.crawl(url=url, config=crawler_config)
                logger.success(f"Completed crawl of {url}", tag="TEST")
                return crawl_result
            except Exception as exc:
                logger.error(f"Error crawling {url}: {str(exc)}", tag="TEST")
                results.errors.append(exc)
                return None

        # Fan out all URLs in parallel and keep only the successful returns.
        tasks = [crawl_url(url) for url in TEST_URLS]
        gathered = await asyncio.gather(*tasks)
        results.results = [r for r in gathered if r is not None]

        results.end_time = asyncio.get_event_loop().time()

        status = await browser_hub.get_pool_status()
        logger.info(
            message="Final browser hub status: {status}",
            tag="TEST",
            params={"status": status}
        )

    except Exception as exc:
        logger.error(f"Test failed with error: {str(exc)}", tag="TEST")
        results.errors.append(exc)

    results.log_summary(logger)

    # The hub is returned so later scenarios can reuse it.
    return results, browser_hub
|
||||||
|
|
||||||
|
# ======== TEST SCENARIO 4: Parallel pipelines sharing browser hub ========
|
||||||
|
async def test_parallel_pipelines():
    """
    Test Scenario 4: Multiple parallel pipelines sharing browser hub

    This tests the case where multiple pipelines share the same browser hub,
    demonstrating resource sharing and parallel operation.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Parallel Pipelines")

    # Reuse the hub produced by the previous scenario.
    _, browser_hub = await test_preinitalized_browser_hub()

    try:
        # Three pipelines, all backed by the one shared hub.
        pipelines = []
        for _ in range(3):
            shared_pipeline = await create_pipeline(
                browser_hub=browser_hub,
                logger=logger
            )
            pipelines.append(shared_pipeline)

        logger.info(f"Created {len(pipelines)} pipelines sharing the same browser hub", tag="TEST")

        # Each pipeline gets its own run config to vary the workload.
        configs = [
            CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False),
            CrawlerRunConfig(wait_until="networkidle", screenshot=True),
            CrawlerRunConfig(wait_until="load", scan_full_page=True)
        ]

        results.start_time = asyncio.get_event_loop().time()

        async def process_with_pipeline(pipeline_idx, urls):
            # Drive one pipeline through its slice of URLs sequentially,
            # recording per-URL errors without aborting the batch.
            batch = []
            for url in urls:
                try:
                    logger.info(f"Pipeline {pipeline_idx} crawling {url}", tag="TEST")
                    crawl_result = await pipelines[pipeline_idx].crawl(
                        url=url,
                        config=configs[pipeline_idx]
                    )
                    batch.append(crawl_result)
                    logger.success(
                        message="Pipeline {idx} completed: url={url}, success={success}",
                        tag="TEST",
                        params={
                            "idx": pipeline_idx,
                            "url": url,
                            "success": crawl_result.success
                        }
                    )
                except Exception as exc:
                    logger.error(
                        message="Pipeline {idx} error: {error}",
                        tag="TEST",
                        params={
                            "idx": pipeline_idx,
                            "error": str(exc)
                        }
                    )
                    results.errors.append(exc)
            return batch

        # Slice the shared URL list into one chunk per pipeline.
        pipeline_urls = [
            TEST_URLS[:2],
            TEST_URLS[2:4],
            TEST_URLS[4:5] * 2  # Duplicate the last URL to have 2 for pipeline 3
        ]

        # Run the three pipelines concurrently.
        tasks = [
            process_with_pipeline(i, urls)
            for i, urls in enumerate(pipeline_urls)
        ]
        batches = await asyncio.gather(*tasks)

        # Flatten the per-pipeline batches into one result list.
        for batch in batches:
            results.results.extend(batch)

        results.end_time = asyncio.get_event_loop().time()

        status = await browser_hub.get_pool_status()
        logger.info(
            message="Browser hub status after parallel pipelines: {status}",
            tag="TEST",
            params={"status": status}
        )

    except Exception as exc:
        logger.error(f"Test failed with error: {str(exc)}", tag="TEST")
        results.errors.append(exc)

    results.log_summary(logger)

    return results
|
||||||
|
|
||||||
|
# ======== TEST SCENARIO 5: Browser hub with connection string ========
|
||||||
|
async def test_connection_string():
    """
    Test Scenario 5: Browser hub with connection string

    This tests the case where a browser hub is initialized from a connection string,
    simulating connecting to a running browser hub service.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Connection String")

    try:
        # A real deployment would point this at a running browser hub
        # service; for this test the connection is only simulated.
        conn_str = "localhost:9222"  # Simulated connection string

        pipeline = await create_pipeline(
            browser_hub_connection=conn_str,
            logger=logger,
        )

        # Crawler settings: bypass the cache and wait for network idle.
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle",
        )

        # Time the crawl with the loop clock.
        loop = asyncio.get_event_loop()
        results.start_time = loop.time()

        # Exercise the hub with a single URL.
        url = TEST_URLS[0]
        try:
            logger.info(f"Crawling {url} with connection string hub", tag="TEST")
            crawl_result = await pipeline.crawl(url=url, config=run_config)
            results.results.append(crawl_result)

            content_length = len(crawl_result.html) if crawl_result.html else 0
            logger.success(
                message="Result: url={url}, success={success}, content_length={length}",
                tag="TEST",
                params={
                    "url": url,
                    "success": crawl_result.success,
                    "length": content_length,
                },
            )
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
            results.errors.append(e)

        results.end_time = loop.time()

    except Exception as e:
        logger.error(f"Test failed with error: {str(e)}", tag="TEST")
        results.errors.append(e)

    # Always report a summary, even when the scenario failed early.
    results.log_summary(logger)

    return results
|
||||||
|
|
||||||
|
# ======== RUN ALL TESTS ========
|
||||||
|
async def run_all_tests():
    """Run all test scenarios"""
    logger = AsyncLogger(verbose=True)
    logger.info("=== STARTING BROWSER HUB TESTS ===", tag="MAIN")

    # Only the default-configuration scenario is currently enabled; the
    # remaining scenarios are kept here (commented out) for manual runs.
    scenarios = [
        test_default_configuration,
        # test_custom_configuration,
        # test_preinitalized_browser_hub,
        # test_parallel_pipelines,
        # test_connection_string,
    ]

    try:
        for scenario in scenarios:
            await scenario()
    except Exception as e:
        logger.error(f"Test suite failed: {str(e)}", tag="MAIN")
    finally:
        # Whatever happened above, tear down every browser hub.
        logger.info("Shutting down all browser hubs...", tag="MAIN")
        await BrowserHub.shutdown_all()
        logger.success("All tests completed", tag="MAIN")
|
||||||
|
|
||||||
|
# Script entry point: run the full browser-hub test suite.
if __name__ == "__main__":
    asyncio.run(run_all_tests())
|
||||||
109
tests/pipeline/test_pipeline.py
Normal file
109
tests/pipeline/test_pipeline.py
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
import asyncio
|
||||||
|
from crawl4ai import (
|
||||||
|
BrowserConfig,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
CacheMode,
|
||||||
|
DefaultMarkdownGenerator,
|
||||||
|
PruningContentFilter
|
||||||
|
)
|
||||||
|
from pipeline import Pipeline
|
||||||
|
|
||||||
|
async def main():
    """Crawl a single page through a Pipeline and print the outcome."""
    # Build the browser and crawler configuration up front.
    browser_config = BrowserConfig(headless=True, verbose=True)

    pruning_filter = PruningContentFilter(
        threshold=0.48,
        threshold_type="fixed",
        min_word_threshold=0,
    )
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(content_filter=pruning_filter),
    )

    # The pipeline context manager owns browser startup and teardown.
    async with Pipeline(browser_config=browser_config) as pipeline:
        result = await pipeline.crawl(
            url="https://www.example.com",
            config=crawler_config,
        )

        # Report the outcome of the crawl.
        print(f"URL: {result.url}")
        print(f"Success: {result.success}")

        if not result.success:
            print(f"Error: {result.error_message}")
        else:
            print("\nMarkdown excerpt:")
            print(result.markdown.raw_markdown[:500] + "...")
|
||||||
|
|
||||||
|
# Script entry point: run the single-URL pipeline demo.
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlTarget:
    """A bundle of URLs plus the crawler configuration to apply to them."""

    def __init__(self, urls, config=None):
        # config may stay None to signal "use the pipeline default".
        self.config = config
        self.urls = urls

    def __repr__(self):
        return "CrawlTarget(urls={}, config={})".format(self.urls, self.config)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# async def main():
|
||||||
|
# # Create configuration objects
|
||||||
|
# browser_config = BrowserConfig(headless=True, verbose=True)
|
||||||
|
|
||||||
|
# # Define different configurations
|
||||||
|
# config1 = CrawlerRunConfig(
|
||||||
|
# cache_mode=CacheMode.BYPASS,
|
||||||
|
# markdown_generator=DefaultMarkdownGenerator(
|
||||||
|
# content_filter=PruningContentFilter(threshold=0.48)
|
||||||
|
# ),
|
||||||
|
# )
|
||||||
|
|
||||||
|
# config2 = CrawlerRunConfig(
|
||||||
|
# cache_mode=CacheMode.ENABLED,
|
||||||
|
# screenshot=True,
|
||||||
|
# pdf=True
|
||||||
|
# )
|
||||||
|
|
||||||
|
# # Create crawl targets
|
||||||
|
# targets = [
|
||||||
|
# CrawlTarget(
|
||||||
|
# urls=["https://www.example.com", "https://www.wikipedia.org"],
|
||||||
|
# config=config1
|
||||||
|
# ),
|
||||||
|
# CrawlTarget(
|
||||||
|
# urls="https://news.ycombinator.com",
|
||||||
|
# config=config2
|
||||||
|
# ),
|
||||||
|
# CrawlTarget(
|
||||||
|
# urls=["https://github.com", "https://stackoverflow.com", "https://python.org"],
|
||||||
|
# config=None
|
||||||
|
# )
|
||||||
|
# ]
|
||||||
|
|
||||||
|
# # Create and use pipeline with context manager
|
||||||
|
# async with Pipeline(browser_config=browser_config) as pipeline:
|
||||||
|
# all_results = await pipeline.crawl_batch(targets)
|
||||||
|
|
||||||
|
# for target_key, results in all_results.items():
|
||||||
|
# print(f"\n===== Results for {target_key} =====")
|
||||||
|
# print(f"Number of URLs crawled: {len(results)}")
|
||||||
|
|
||||||
|
# for i, result in enumerate(results):
|
||||||
|
# print(f"\nURL {i+1}: {result.url}")
|
||||||
|
# print(f"Success: {result.success}")
|
||||||
|
|
||||||
|
# if result.success:
|
||||||
|
# print(f"Content length: {len(result.markdown.raw_markdown)} chars")
|
||||||
|
# else:
|
||||||
|
# print(f"Error: {result.error_message}")
|
||||||
|
|
||||||
|
# if __name__ == "__main__":
|
||||||
|
# asyncio.run(main())
|
||||||
Reference in New Issue
Block a user