Some refactoring: move the pipeline submodule folder into the main codebase.

This commit is contained in:
UncleCode
2025-04-06 18:28:28 +08:00
parent 591f55edc7
commit d95b2dc9f2
8 changed files with 2655 additions and 0 deletions


@@ -0,0 +1,222 @@
# demo_browser_hub.py
import asyncio
from typing import List
from crawl4ai.browser.browser_hub import BrowserHub
from pipeline import create_pipeline
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.models import CrawlResultContainer
from crawl4ai.cache_context import CacheMode
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import PruningContentFilter
async def create_prewarmed_browser_hub(urls_to_crawl: List[str]):
"""Create a pre-warmed browser hub with 10 browsers and 5 pages each."""
# Set up logging
logger = AsyncLogger(verbose=True)
logger.info("Setting up pre-warmed browser hub", tag="DEMO")
# Create browser configuration
browser_config = BrowserConfig(
browser_type="chromium",
headless=True, # Set to False to see the browsers in action
viewport_width=1280,
viewport_height=800,
light_mode=True, # Optimize for performance
java_script_enabled=True
)
# Create crawler configurations for pre-warming with different user agents
# This allows pages to be ready for different scenarios
crawler_configs = [
CrawlerRunConfig(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
wait_until="networkidle"
),
# CrawlerRunConfig(
# user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15",
# wait_until="networkidle"
# ),
# CrawlerRunConfig(
# user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
# wait_until="networkidle"
# )
]
# Number of browsers and pages per browser
num_browsers = 1
pages_per_browser = 1
# Distribute pages across configurations
# Total pages = num_browsers * pages_per_browser, split across the crawler configs
page_configs = []
total_pages = num_browsers * pages_per_browser
pages_per_config = total_pages // len(crawler_configs)
for i, config in enumerate(crawler_configs):
# For the last config, add any remaining pages
if i == len(crawler_configs) - 1:
remaining = total_pages - (pages_per_config * (len(crawler_configs) - 1))
page_configs.append((browser_config, config, remaining))
else:
page_configs.append((browser_config, config, pages_per_config))
# Create browser hub with pre-warmed pages
start_time = asyncio.get_event_loop().time()
logger.info("Initializing browser hub with pre-warmed pages...", tag="DEMO")
hub = await BrowserHub.get_browser_manager(
config=browser_config,
hub_id="demo_hub",
logger=logger,
max_browsers_per_config=num_browsers,
max_pages_per_browser=pages_per_browser,
initial_pool_size=num_browsers,
page_configs=page_configs
)
end_time = asyncio.get_event_loop().time()
logger.success(
message="Browser hub initialized with {total_pages} pre-warmed pages in {duration:.2f} seconds",
tag="DEMO",
params={
"total_pages": total_pages,
"duration": end_time - start_time
}
)
# Get and display pool status
status = await hub.get_pool_status()
logger.info(
message="Browser pool status: {status}",
tag="DEMO",
params={"status": status}
)
return hub
async def crawl_urls_with_hub(hub, urls: List[str]) -> List[CrawlResultContainer]:
"""Crawl a list of URLs using a pre-warmed browser hub."""
logger = AsyncLogger(verbose=True)
# Create crawler configuration
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48,
threshold_type="fixed",
min_word_threshold=0
)
),
wait_until="networkidle",
screenshot=True
)
# Create pipeline with the browser hub
pipeline = await create_pipeline(
browser_hub=hub,
logger=logger
)
results = []
# Crawl all URLs in parallel
async def crawl_url(url):
logger.info(f"Crawling {url}...", tag="CRAWL")
result = await pipeline.crawl(url=url, config=crawler_config)
logger.success(f"Completed crawl of {url}", tag="CRAWL")
return result
# Create tasks for all URLs
tasks = [crawl_url(url) for url in urls]
# Execute all tasks in parallel and collect results
results = await asyncio.gather(*tasks)
return results
async def main():
"""Main demo function."""
# List of URLs to crawl
urls_to_crawl = [
"https://example.com",
# "https://www.python.org",
# "https://httpbin.org/html",
# "https://news.ycombinator.com",
# "https://github.com",
# "https://pypi.org",
# "https://docs.python.org/3/",
# "https://opensource.org",
# "https://whatismyipaddress.com",
# "https://en.wikipedia.org/wiki/Web_scraping"
]
# Set up logging
logger = AsyncLogger(verbose=True)
logger.info("Starting browser hub demo", tag="DEMO")
try:
# Create pre-warmed browser hub
hub = await create_prewarmed_browser_hub(urls_to_crawl)
# Use hub to crawl URLs
logger.info("Crawling URLs in parallel...", tag="DEMO")
start_time = asyncio.get_event_loop().time()
results = await crawl_urls_with_hub(hub, urls_to_crawl)
end_time = asyncio.get_event_loop().time()
# Display results
logger.success(
message="Crawled {count} URLs in {duration:.2f} seconds (average: {avg:.2f} seconds per URL)",
tag="DEMO",
params={
"count": len(results),
"duration": end_time - start_time,
"avg": (end_time - start_time) / len(results)
}
)
# Print summary of results
logger.info("Crawl results summary:", tag="DEMO")
for i, result in enumerate(results):
logger.info(
message="{idx}. {url}: Success={success}, Content length={length}",
tag="RESULT",
params={
"idx": i+1,
"url": result.url,
"success": result.success,
"length": len(result.html) if result.html else 0
}
)
if result.success and result.markdown and result.markdown.raw_markdown:
# Print a snippet of the markdown
markdown_snippet = result.markdown.raw_markdown[:150] + "..."
logger.info(
message=" Markdown: {snippet}",
tag="RESULT",
params={"snippet": markdown_snippet}
)
# Display final browser pool status
status = await hub.get_pool_status()
logger.info(
message="Final browser pool status: {status}",
tag="DEMO",
params={"status": status}
)
finally:
# Clean up
logger.info("Shutting down browser hub...", tag="DEMO")
await BrowserHub.shutdown_all()
logger.success("Demo completed", tag="DEMO")
if __name__ == "__main__":
asyncio.run(main())


@@ -0,0 +1,505 @@
# extended_browser_hub_tests.py
import asyncio
from crawl4ai.browser.browser_hub import BrowserHub
from pipeline import create_pipeline
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.cache_context import CacheMode
# Common test URLs
TEST_URLS = [
"https://example.com",
"https://example.com/page1",
"https://httpbin.org/html",
"https://httpbin.org/headers",
"https://httpbin.org/ip",
"https://httpstat.us/200"
]
class TestResults:
"""Simple container for test results"""
def __init__(self, name: str):
self.name = name
self.results = []
self.start_time = None
self.end_time = None
self.errors = []
@property
def duration(self) -> float:
if self.start_time and self.end_time:
return self.end_time - self.start_time
return 0
@property
def success_rate(self) -> float:
if not self.results:
return 0
return sum(1 for r in self.results if r.success) / len(self.results) * 100
def log_summary(self, logger: AsyncLogger):
logger.info(f"=== Test: {self.name} ===", tag="SUMMARY")
logger.info(
message="Duration: {duration:.2f}s, Success rate: {success_rate:.1f}%, Results: {count}",
tag="SUMMARY",
params={
"duration": self.duration,
"success_rate": self.success_rate,
"count": len(self.results)
}
)
if self.errors:
logger.error(
message="Errors ({count}): {errors}",
tag="SUMMARY",
params={
"count": len(self.errors),
"errors": "; ".join(str(e) for e in self.errors)
}
)
# ======== TEST SCENARIO 1: Simple default configuration ========
async def test_default_configuration():
"""
Test Scenario 1: Simple default configuration
This tests the basic case where the user does not provide any specific
browser configuration, relying on default auto-setup.
"""
logger = AsyncLogger(verbose=True)
results = TestResults("Default Configuration")
try:
# Create pipeline with no browser config
pipeline = await create_pipeline(logger=logger)
# Start timing
results.start_time = asyncio.get_event_loop().time()
# Create basic crawler config
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
wait_until="domcontentloaded"
)
# Process each URL sequentially
for url in TEST_URLS:
try:
logger.info(f"Crawling {url} with default configuration", tag="TEST")
result = await pipeline.crawl(url=url, config=crawler_config)
results.results.append(result)
logger.success(
message="Result: url={url}, success={success}, content_length={length}",
tag="TEST",
params={
"url": url,
"success": result.success,
"length": len(result.html) if result.html else 0
}
)
except Exception as e:
logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
results.errors.append(e)
# End timing
results.end_time = asyncio.get_event_loop().time()
except Exception as e:
logger.error(f"Test failed with error: {str(e)}", tag="TEST")
results.errors.append(e)
# Log summary
results.log_summary(logger)
return results
# ======== TEST SCENARIO 2: Detailed custom configuration ========
async def test_custom_configuration():
"""
Test Scenario 2: Detailed custom configuration
This tests the case where the user provides detailed browser configuration
to customize the browser behavior.
"""
logger = AsyncLogger(verbose=True)
results = TestResults("Custom Configuration")
try:
# Create custom browser config
browser_config = BrowserConfig(
browser_type="chromium",
headless=True,
viewport_width=1920,
viewport_height=1080,
user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
light_mode=True,
ignore_https_errors=True,
extra_args=["--disable-extensions"]
)
# Create custom crawler config
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
wait_until="networkidle",
page_timeout=30000,
screenshot=True,
pdf=False,
screenshot_wait_for=0.5,
wait_for_images=True,
scan_full_page=True,
scroll_delay=0.2,
process_iframes=True,
remove_overlay_elements=True
)
# Create pipeline with custom configuration
pipeline = await create_pipeline(
browser_config=browser_config,
logger=logger
)
# Start timing
results.start_time = asyncio.get_event_loop().time()
# Process each URL sequentially
for url in TEST_URLS:
try:
logger.info(f"Crawling {url} with custom configuration", tag="TEST")
result = await pipeline.crawl(url=url, config=crawler_config)
results.results.append(result)
has_screenshot = result.screenshot is not None
logger.success(
message="Result: url={url}, success={success}, screenshot={screenshot}, content_length={length}",
tag="TEST",
params={
"url": url,
"success": result.success,
"screenshot": has_screenshot,
"length": len(result.html) if result.html else 0
}
)
except Exception as e:
logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
results.errors.append(e)
# End timing
results.end_time = asyncio.get_event_loop().time()
# Get browser hub status from context
try:
# Run a dummy crawl to get the context with browser hub
context = await pipeline.process({"url": "about:blank", "config": crawler_config})
browser_hub = context.get("browser_hub")
if browser_hub:
status = await browser_hub.get_pool_status()
logger.info(
message="Browser hub status: {status}",
tag="TEST",
params={"status": status}
)
except Exception as e:
logger.error(f"Failed to get browser hub status: {str(e)}", tag="TEST")
except Exception as e:
logger.error(f"Test failed with error: {str(e)}", tag="TEST")
results.errors.append(e)
# Log summary
results.log_summary(logger)
return results
# ======== TEST SCENARIO 3: Using pre-initialized browser hub ========
async def test_preinitialized_browser_hub():
"""
Test Scenario 3: Using pre-initialized browser hub
This tests the case where a browser hub is initialized separately
and then passed to the pipeline.
"""
logger = AsyncLogger(verbose=True)
results = TestResults("Pre-initialized Browser Hub")
browser_hub = None
try:
# Create and initialize browser hub separately
logger.info("Initializing browser hub separately", tag="TEST")
browser_config = BrowserConfig(
browser_type="chromium",
headless=True,
verbose=True
)
browser_hub = await BrowserHub.get_browser_manager(
config=browser_config,
hub_id="test_preinitalized",
logger=logger,
max_browsers_per_config=2,
max_pages_per_browser=3,
initial_pool_size=2
)
# Display initial status
status = await browser_hub.get_pool_status()
logger.info(
message="Initial browser hub status: {status}",
tag="TEST",
params={"status": status}
)
# Create pipeline with pre-initialized browser hub
pipeline = await create_pipeline(
browser_hub=browser_hub,
logger=logger
)
# Create crawler config
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
wait_until="networkidle",
screenshot=True
)
# Start timing
results.start_time = asyncio.get_event_loop().time()
# Process URLs in parallel
async def crawl_url(url):
try:
logger.info(f"Crawling {url} with pre-initialized hub", tag="TEST")
result = await pipeline.crawl(url=url, config=crawler_config)
logger.success(f"Completed crawl of {url}", tag="TEST")
return result
except Exception as e:
logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
results.errors.append(e)
return None
# Create tasks for all URLs
tasks = [crawl_url(url) for url in TEST_URLS]
# Execute all tasks in parallel and collect results
all_results = await asyncio.gather(*tasks)
results.results = [r for r in all_results if r is not None]
# End timing
results.end_time = asyncio.get_event_loop().time()
# Display final status
status = await browser_hub.get_pool_status()
logger.info(
message="Final browser hub status: {status}",
tag="TEST",
params={"status": status}
)
except Exception as e:
logger.error(f"Test failed with error: {str(e)}", tag="TEST")
results.errors.append(e)
# Log summary
results.log_summary(logger)
return results, browser_hub
# ======== TEST SCENARIO 4: Parallel pipelines sharing browser hub ========
async def test_parallel_pipelines():
"""
Test Scenario 4: Multiple parallel pipelines sharing browser hub
This tests the case where multiple pipelines share the same browser hub,
demonstrating resource sharing and parallel operation.
"""
logger = AsyncLogger(verbose=True)
results = TestResults("Parallel Pipelines")
# We'll reuse the browser hub from the previous test
_, browser_hub = await test_preinitialized_browser_hub()
try:
# Create 3 pipelines that all share the same browser hub
pipelines = []
for i in range(3):
pipeline = await create_pipeline(
browser_hub=browser_hub,
logger=logger
)
pipelines.append(pipeline)
logger.info(f"Created {len(pipelines)} pipelines sharing the same browser hub", tag="TEST")
# Create crawler configs with different settings
configs = [
CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False),
CrawlerRunConfig(wait_until="networkidle", screenshot=True),
CrawlerRunConfig(wait_until="load", scan_full_page=True)
]
# Start timing
results.start_time = asyncio.get_event_loop().time()
# Function to process URLs with a specific pipeline
async def process_with_pipeline(pipeline_idx, urls):
pipeline_results = []
for url in urls:
try:
logger.info(f"Pipeline {pipeline_idx} crawling {url}", tag="TEST")
result = await pipelines[pipeline_idx].crawl(
url=url,
config=configs[pipeline_idx]
)
pipeline_results.append(result)
logger.success(
message="Pipeline {idx} completed: url={url}, success={success}",
tag="TEST",
params={
"idx": pipeline_idx,
"url": url,
"success": result.success
}
)
except Exception as e:
logger.error(
message="Pipeline {idx} error: {error}",
tag="TEST",
params={
"idx": pipeline_idx,
"error": str(e)
}
)
results.errors.append(e)
return pipeline_results
# Distribute URLs among pipelines
pipeline_urls = [
TEST_URLS[:2],
TEST_URLS[2:4],
TEST_URLS[5:6] * 2 # Duplicate the last URL to have 2 for pipeline 3
]
# Execute all pipelines in parallel
tasks = [
process_with_pipeline(i, urls)
for i, urls in enumerate(pipeline_urls)
]
pipeline_results = await asyncio.gather(*tasks)
# Flatten results
for res_list in pipeline_results:
results.results.extend(res_list)
# End timing
results.end_time = asyncio.get_event_loop().time()
# Display browser hub status
status = await browser_hub.get_pool_status()
logger.info(
message="Browser hub status after parallel pipelines: {status}",
tag="TEST",
params={"status": status}
)
except Exception as e:
logger.error(f"Test failed with error: {str(e)}", tag="TEST")
results.errors.append(e)
# Log summary
results.log_summary(logger)
return results
# ======== TEST SCENARIO 5: Browser hub with connection string ========
async def test_connection_string():
"""
Test Scenario 5: Browser hub with connection string
This tests the case where a browser hub is initialized from a connection string,
simulating connecting to a running browser hub service.
"""
logger = AsyncLogger(verbose=True)
results = TestResults("Connection String")
try:
# Create pipeline with connection string
# Note: In a real implementation, this would connect to an existing service
# For this test, we're using a simulated connection
connection_string = "localhost:9222" # Simulated connection string
pipeline = await create_pipeline(
browser_hub_connection=connection_string,
logger=logger
)
# Create crawler config
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
wait_until="networkidle"
)
# Start timing
results.start_time = asyncio.get_event_loop().time()
# Test with a single URL
url = TEST_URLS[0]
try:
logger.info(f"Crawling {url} with connection string hub", tag="TEST")
result = await pipeline.crawl(url=url, config=crawler_config)
results.results.append(result)
logger.success(
message="Result: url={url}, success={success}, content_length={length}",
tag="TEST",
params={
"url": url,
"success": result.success,
"length": len(result.html) if result.html else 0
}
)
except Exception as e:
logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
results.errors.append(e)
# End timing
results.end_time = asyncio.get_event_loop().time()
except Exception as e:
logger.error(f"Test failed with error: {str(e)}", tag="TEST")
results.errors.append(e)
# Log summary
results.log_summary(logger)
return results
# ======== RUN ALL TESTS ========
async def run_all_tests():
"""Run all test scenarios"""
logger = AsyncLogger(verbose=True)
logger.info("=== STARTING BROWSER HUB TESTS ===", tag="MAIN")
try:
# Run each test scenario
await test_default_configuration()
# await test_custom_configuration()
# await test_preinitialized_browser_hub()
# await test_parallel_pipelines()
# await test_connection_string()
except Exception as e:
logger.error(f"Test suite failed: {str(e)}", tag="MAIN")
finally:
# Clean up all browser hubs
logger.info("Shutting down all browser hubs...", tag="MAIN")
await BrowserHub.shutdown_all()
logger.success("All tests completed", tag="MAIN")
if __name__ == "__main__":
asyncio.run(run_all_tests())


@@ -0,0 +1,109 @@
import asyncio
from crawl4ai import (
BrowserConfig,
CrawlerRunConfig,
CacheMode,
DefaultMarkdownGenerator,
PruningContentFilter
)
from pipeline import Pipeline
async def main():
# Create configuration objects
browser_config = BrowserConfig(headless=True, verbose=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48,
threshold_type="fixed",
min_word_threshold=0
)
),
)
# Create and use pipeline with context manager
async with Pipeline(browser_config=browser_config) as pipeline:
result = await pipeline.crawl(
url="https://www.example.com",
config=crawler_config
)
# Print the result
print(f"URL: {result.url}")
print(f"Success: {result.success}")
if result.success:
print("\nMarkdown excerpt:")
print(result.markdown.raw_markdown[:500] + "...")
else:
print(f"Error: {result.error_message}")
if __name__ == "__main__":
asyncio.run(main())
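# CrawlTarget is a simple container pairing one or more URLs with an optional
# CrawlerRunConfig; it is consumed by the commented-out batch-crawling demo below.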
class CrawlTarget:
def __init__(self, urls, config=None):
self.urls = urls
self.config = config
def __repr__(self):
return f"CrawlTarget(urls={self.urls}, config={self.config})"
# async def main():
# # Create configuration objects
# browser_config = BrowserConfig(headless=True, verbose=True)
# # Define different configurations
# config1 = CrawlerRunConfig(
# cache_mode=CacheMode.BYPASS,
# markdown_generator=DefaultMarkdownGenerator(
# content_filter=PruningContentFilter(threshold=0.48)
# ),
# )
# config2 = CrawlerRunConfig(
# cache_mode=CacheMode.ENABLED,
# screenshot=True,
# pdf=True
# )
# # Create crawl targets
# targets = [
# CrawlTarget(
# urls=["https://www.example.com", "https://www.wikipedia.org"],
# config=config1
# ),
# CrawlTarget(
# urls="https://news.ycombinator.com",
# config=config2
# ),
# CrawlTarget(
# urls=["https://github.com", "https://stackoverflow.com", "https://python.org"],
# config=None
# )
# ]
# # Create and use pipeline with context manager
# async with Pipeline(browser_config=browser_config) as pipeline:
# all_results = await pipeline.crawl_batch(targets)
# for target_key, results in all_results.items():
# print(f"\n===== Results for {target_key} =====")
# print(f"Number of URLs crawled: {len(results)}")
# for i, result in enumerate(results):
# print(f"\nURL {i+1}: {result.url}")
# print(f"Success: {result.success}")
# if result.success:
# print(f"Content length: {len(result.markdown.raw_markdown)} chars")
# else:
# print(f"Error: {result.error_message}")
# if __name__ == "__main__":
# asyncio.run(main())