feat: Add deep crawl capabilities to arun_many function

2025-01-30 17:49:58 +05:30
parent f6edb8342e
commit f7ce2d42c9
3 changed files with 124 additions and 18 deletions
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -798,6 +798,22 @@ class AsyncWebCrawler:
        ):
            print(f"Processed {result.url}: {len(result.markdown)} chars")
        """
        async def merge_async_generators(generators):
            tasks = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
            while tasks:
                done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
                for task in done:
                    gen = tasks.pop(task)  # Get the generator associated with this task
                    try:
                        result = task.result()
                        yield result  # Yield the result
                        tasks[asyncio.create_task(gen.__anext__())] = gen  # Fetch next item
                    except StopAsyncIteration:
                        pass  # Generator is exhausted, don't add it back to the tasks
        if config is None:
            config = CrawlerRunConfig(
                word_count_threshold=word_count_threshold,
@@ -838,6 +854,27 @@ class AsyncWebCrawler:
        stream = config.stream
        if config.deep_crawl_strategy:
            if config.stream:
                generators = []
                for url in urls:
                    generators.append(
                        config.deep_crawl_strategy.arun(
                            start_url=url, crawler=self, crawler_run_config=config
                        )
                    )
                return merge_async_generators(generators)
            else:
                results = []
                for url in urls:
                    url_results = []
                    async for result in config.deep_crawl_strategy.arun(
                        start_url=url, crawler=self, crawler_run_config=config
                    ):
                        url_results.append(result)
                    results.append(url_results)
                return results
        if stream:
            async def result_transformer():
--- a/crawl4ai/deep_crawl/bfs_deep_crawl_strategy.py
+++ b/crawl4ai/deep_crawl/bfs_deep_crawl_strategy.py
@@ -187,7 +187,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
        finally:
            self.stats.end_time = datetime.now()
            await crawler.close()
    async def shutdown(self):
        """Clean up resources and stop crawling"""
--- a/docs/deep_crawl/deep_crawl_quickstart.py
+++ b/docs/deep_crawl/deep_crawl_quickstart.py
@@ -1,4 +1,3 @@
 # basic_scraper_example.py
 from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
 from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
 from crawl4ai.deep_crawl import (
@@ -20,9 +19,9 @@ import logging
 browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
-async def basic_scraper_example():
+async def basic_example():
    """
-    Basic example: Scrape a blog site for articles
+    Basic example: Deep crawl a blog site for articles
    - Crawls only HTML pages
    - Stays within the blog section
    - Collects all results at once
@@ -45,14 +44,15 @@ async def basic_scraper_example():
        process_external_links=True,
    )
-    # Create the crawler and scraper
+    # Create the crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Start scraping
        try:
            results = await crawler.arun(
-                "https://crawl4ai.com/mkdocs", CrawlerRunConfig(deep_crawl_strategy=bfs_strategy)
+                "https://crawl4ai.com/mkdocs",
                CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
            )
            # Process results
            print(f"Crawled {len(results)} pages:")
@@ -62,9 +62,10 @@ async def basic_scraper_example():
        except Exception as e:
            print(f"Error during scraping: {e}")
-async def advanced_scraper_example():
+
 async def advanced_example():
    """
-    Advanced example: Intelligent news site scraping
+    Advanced example: Intelligent news site crawling
    - Uses all filter types
    - Implements sophisticated scoring
    - Streams results
@@ -72,7 +73,7 @@ async def advanced_scraper_example():
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO)
-    logger = logging.getLogger("advanced_scraper")
+    logger = logging.getLogger("advanced_deep_crawler")
    # Create sophisticated filter chain
    filter_chain = FilterChain(
@@ -115,7 +116,7 @@ async def advanced_scraper_example():
        max_depth=2, filter_chain=filter_chain, url_scorer=scorer
    )
-    # Create crawler and scraper
+    # Create crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
@@ -128,8 +129,7 @@ async def advanced_scraper_example():
            results = []
            result_generator = await crawler.arun(
                "https://techcrunch.com",
-                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy,
+                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy, stream=True),
                stream=True)
            )
            async for result in result_generator:
                stats["processed"] += 1
@@ -174,17 +174,87 @@ async def advanced_scraper_example():
            )
 async def basic_example_many_urls():
    filter_chain = FilterChain(
        [
            URLPatternFilter("*/basic/*"),
            ContentTypeFilter(["text/html"]),
        ]
    )
    # Initialize the strategy with basic configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
        process_external_links=False,
    )
    # Create the crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Start scraping
        try:
            results = await crawler.arun_many(
                urls=["https://crawl4ai.com/mkdocs","https://aravindkarnam.com"],
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
            )
            # Process results
            print(f"Crawled {len(results)} pages:")
            for url_result in results:
                for result in url_result:
                    print(f"- {result.url}: {len(result.html)} bytes")
        except Exception as e:
            print(f"Error during scraping: {e}")
 async def basic_example_many_urls_stream():
    filter_chain = FilterChain(
        [
            URLPatternFilter("*/basic/*"),
            ContentTypeFilter(["text/html"]),
        ]
    )
    # Initialize the strategy with basic configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
        process_external_links=False,
    )
    # Create the crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Start scraping
        try:
            async for result in await crawler.arun_many(
                urls=["https://crawl4ai.com/mkdocs","https://aravindkarnam.com"],
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy,stream=True),
            ):
            # Process results
                print(f"- {result.url}: {len(result.html)} bytes")
        except Exception as e:
            print(f"Error during scraping: {e}")
 if __name__ == "__main__":
    import asyncio
    import time
    # Run basic example
    start_time = time.perf_counter()
-    print("Running basic scraper example...")
+    print("Running basic Deep crawl example...")
-    asyncio.run(basic_scraper_example())
+    asyncio.run(basic_example())
    end_time = time.perf_counter()
-    print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")
+    print(f"Basic deep crawl example completed in {end_time - start_time:.2f} seconds")
-    # # Run advanced example
+    # Run advanced example
-    print("\nRunning advanced scraper example...")
+    print("\nRunning advanced deep crawl example...")
-    asyncio.run(advanced_scraper_example())
+    asyncio.run(advanced_example())
    print("\nRunning advanced deep crawl example with arun_many...")
    asyncio.run(basic_example_many_urls())
    print("\nRunning advanced deep crawl example with arun_many streaming enabled...")
    asyncio.run(basic_example_many_urls_stream())