Refactor: Move adeep_crawl onto the crawler itself as a method. Add attributes to CrawlResult so the crawl tree can be reconstructed once deep crawling completes.
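After this change, deep crawling is invoked directly on AsyncWebCrawler instead of going through a separate AsyncWebScraper. A minimal sketch of the new batch-mode call shape, assembled from the updated examples in this diff (the filter choice is illustrative; depth and score are the per-result attributes exercised in the streaming example below):

    import asyncio

    from crawl4ai.async_configs import BrowserConfig
    from crawl4ai.async_webcrawler import AsyncWebCrawler
    from crawl4ai.traversal import BFSTraversalStrategy, FilterChain, ContentTypeFilter

    async def main():
        strategy = BFSTraversalStrategy(
            max_depth=2,
            filter_chain=FilterChain([ContentTypeFilter(["text/html"])]),
            url_scorer=None,  # default scoring, as in the basic example
        )
        async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
            # Batch mode: adeep_crawl collects every CrawlResult before returning.
            results = await crawler.adeep_crawl(
                "https://crawl4ai.com/mkdocs", strategy=strategy
            )
            for result in results:
                print(result.url, result.depth, result.score)

    asyncio.run(main())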
@@ -1,20 +1,18 @@
 # basic_scraper_example.py
-from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
 from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
-from crawl4ai.scraper import (
-    AsyncWebScraper,
-    BFSScraperStrategy,
+from crawl4ai.traversal import (
+    BFSTraversalStrategy,
     FilterChain,
     URLPatternFilter,
     ContentTypeFilter,
 )
-from crawl4ai.async_webcrawler import BrowserConfig
 from crawl4ai.async_webcrawler import AsyncWebCrawler
 import re
 import time
 
 browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
 
 
 async def basic_scraper_example():
     """
     Basic example: Scrape a blog site for articles
@@ -33,7 +31,7 @@ async def basic_scraper_example():
     )
 
     # Initialize the strategy with basic configuration
-    bfs_strategy = BFSScraperStrategy(
+    bfs_strategy = BFSTraversalStrategy(
         max_depth=2,  # Only go 2 levels deep
         filter_chain=filter_chain,
         url_scorer=None,  # Use default scoring
@@ -41,17 +39,18 @@ async def basic_scraper_example():
     )
 
     # Create the crawler and scraper
-    async with AsyncWebScraper(
-        strategy=bfs_strategy,
-    ) as scraper:
+    async with AsyncWebCrawler(
+        config=browser_config,
+    ) as crawler:
         # Start scraping
         try:
-            result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
-
+            results = await crawler.adeep_crawl(
+                "https://crawl4ai.com/mkdocs", strategy=bfs_strategy
+            )
             # Process results
-            print(f"Crawled {len(result.crawled_urls)} pages:")
-            for url, page_result in result.extracted_data.items():
-                print(f"- {url}: {len(page_result.result.html)} bytes")
+            print(f"Crawled {len(results)} pages:")
+            for result in results:
+                print(f"- {result.url}: {len(result.html)} bytes")
 
         except Exception as e:
             print(f"Error during scraping: {e}")
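The commit message also adds CrawlResult attributes for rebuilding the crawl tree after a deep crawl. Only depth and score are exercised in this diff; assuming a parent_url attribute is among the new fields (an assumption, not shown here), a reconstruction sketch could look like:

    from collections import defaultdict

    def build_tree(results):
        # Index children by parent URL. parent_url is assumed; this diff
        # only demonstrates the depth and score attributes on CrawlResult.
        children = defaultdict(list)
        roots = [r for r in results if r.depth == 0]
        for r in results:
            parent = getattr(r, "parent_url", None)
            if parent:
                children[parent].append(r)
        return roots, children

    def print_tree(node, children, indent=0):
        # Depth-first walk, printing each URL indented by its level.
        print("  " * indent + node.url)
        for child in children[node.url]:
            print_tree(child, children, indent + 1)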
@@ -60,9 +59,8 @@ async def basic_scraper_example():
 # advanced_scraper_example.py
 import logging
 
-from crawl4ai.scraper import (
-    AsyncWebScraper,
-    BFSScraperStrategy,
+from crawl4ai.traversal import (
+    BFSTraversalStrategy,
     FilterChain,
     URLPatternFilter,
     ContentTypeFilter,
@@ -123,34 +121,36 @@ async def advanced_scraper_example():
     )
 
     # Initialize strategy with advanced configuration
-    bfs_strategy = BFSScraperStrategy(
+    bfs_strategy = BFSTraversalStrategy(
         max_depth=2, filter_chain=filter_chain, url_scorer=scorer
     )
 
     # Create crawler and scraper
-    async with AsyncWebScraper(
-        strategy=bfs_strategy,
-        crawler_config=CrawlerRunConfig(bypass_cache=True, scraping_strategy=LXMLWebScrapingStrategy(),),
-        browser_config=browser_config,
-    ) as scraper:
+    async with AsyncWebCrawler(
+        config=browser_config,
+    ) as crawler:
 
         # Track statistics
         stats = {"processed": 0, "errors": 0, "total_size": 0}
 
         try:
             # Use streaming mode
-            result_generator = await scraper.ascrape(
-                "https://techcrunch.com", stream=True
+            result_generator = await crawler.adeep_crawl(
+                "https://techcrunch.com",
+                strategy=bfs_strategy,
+                crawler_run_config=CrawlerRunConfig(
+                    scraping_strategy=LXMLWebScrapingStrategy()
+                ),
+                stream=True,
             )
-            async for page_result in result_generator:
-                result = page_result.result
-                score = page_result.score
-                depth = page_result.depth
+            async for result in result_generator:
                 stats["processed"] += 1
 
                 if result.success:
                     stats["total_size"] += len(result.html)
-                    logger.info(f"Processed at depth: {depth} with score: {score:.3f} : \n {result.url}")
+                    logger.info(
+                        f"Processed at depth: {result.depth} with score: {result.score:.3f} : \n {result.url}"
+                    )
                 else:
                     stats["errors"] += 1
                     logger.error(
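For contrast with the batch call in the basic example, the streaming call shape from the hunk above as a small helper (a sketch; crawler and bfs_strategy are constructed as in the examples):

    from crawl4ai.async_configs import CrawlerRunConfig
    from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy

    async def stream_crawl(crawler, bfs_strategy):
        # stream=True returns an async generator instead of a collected batch,
        # so each page can be processed as soon as it finishes.
        result_generator = await crawler.adeep_crawl(
            "https://techcrunch.com",
            strategy=bfs_strategy,
            crawler_run_config=CrawlerRunConfig(
                scraping_strategy=LXMLWebScrapingStrategy()
            ),
            stream=True,
        )
        async for result in result_generator:
            if result.success:
                # depth and score now live directly on each CrawlResult
                print(f"depth={result.depth} score={result.score:.3f} {result.url}")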
@@ -1,187 +0,0 @@
-# basic_scraper_example.py
-from crawl4ai.scraper import (
-    AsyncWebScraper,
-    BFSScraperStrategy,
-    FilterChain,
-    URLPatternFilter,
-    ContentTypeFilter,
-)
-from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
-import re
-
-browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
-
-
-async def basic_scraper_example():
-    """
-    Basic example: Scrape a blog site for articles
-    - Crawls only HTML pages
-    - Stays within the blog section
-    - Collects all results at once
-    """
-    # Create a simple filter chain
-    filter_chain = FilterChain(
-        [
-            # Only crawl pages within the blog section
-            URLPatternFilter("*/tutorial/*"),
-            # Only process HTML pages
-            ContentTypeFilter(["text/html"]),
-        ]
-    )
-
-    # Initialize the strategy with basic configuration
-    strategy = BFSScraperStrategy(
-        max_depth=2,  # Only go 2 levels deep
-        filter_chain=filter_chain,
-        url_scorer=None,  # Use default scoring
-        process_external_links=True,
-    )
-
-    # Create the crawler and scraper
-    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
-        scraper = AsyncWebScraper(crawler, strategy)
-        # Start scraping
-        try:
-            result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
-
-            # Process results
-            print(f"Crawled {len(result.crawled_urls)} pages:")
-            for url, data in result.extracted_data.items():
-                print(f"- {url}: {len(data.html)} bytes")
-
-        except Exception as e:
-            print(f"Error during scraping: {e}")
-
-
-# advanced_scraper_example.py
-import logging
-from crawl4ai.scraper import (
-    AsyncWebScraper,
-    BFSScraperStrategy,
-    FilterChain,
-    URLPatternFilter,
-    ContentTypeFilter,
-    DomainFilter,
-    KeywordRelevanceScorer,
-    PathDepthScorer,
-    FreshnessScorer,
-    CompositeScorer,
-)
-from crawl4ai.async_webcrawler import AsyncWebCrawler
-
-
-async def advanced_scraper_example():
-    """
-    Advanced example: Intelligent news site scraping
-    - Uses all filter types
-    - Implements sophisticated scoring
-    - Streams results
-    - Includes monitoring and logging
-    """
-    # Set up logging
-    logging.basicConfig(level=logging.INFO)
-    logger = logging.getLogger("advanced_scraper")
-
-    # Create sophisticated filter chain
-    filter_chain = FilterChain(
-        [
-            # Domain control
-            DomainFilter(
-                allowed_domains=["techcrunch.com"],
-                blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
-            ),
-            # URL patterns
-            URLPatternFilter(
-                [
-                    "*/article/*",
-                    "*/news/*",
-                    "*/blog/*",
-                    re.compile(r"\d{4}/\d{2}/.*"),  # Date-based URLs
-                ]
-            ),
-            # Content types
-            ContentTypeFilter(["text/html", "application/xhtml+xml"]),
-        ]
-    )
-
-    # Create composite scorer
-    scorer = CompositeScorer(
-        [
-            # Prioritize by keywords
-            KeywordRelevanceScorer(
-                keywords=["news", "breaking", "update", "latest"], weight=1.0
-            ),
-            # Prefer optimal URL structure
-            PathDepthScorer(optimal_depth=3, weight=0.7),
-            # Prioritize fresh content
-            FreshnessScorer(weight=0.9),
-        ]
-    )
-
-    # Initialize strategy with advanced configuration
-    strategy = BFSScraperStrategy(
-        max_depth=2, filter_chain=filter_chain, url_scorer=scorer
-    )
-
-    # Create crawler and scraper
-    async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
-        scraper = AsyncWebScraper(crawler, strategy)
-
-        # Track statistics
-        stats = {"processed": 0, "errors": 0, "total_size": 0}
-
-        try:
-            # Use streaming mode
-            result_generator = await scraper.ascrape(
-                "https://techcrunch.com", stream=True
-            )
-            async for result in result_generator:
-                stats["processed"] += 1
-
-                if result.success:
-                    stats["total_size"] += len(result.html)
-                    logger.info(f"Processed: {result.url}")
-                else:
-                    stats["errors"] += 1
-                    logger.error(
-                        f"Failed to process {result.url}: {result.error_message}"
-                    )
-
-                # Log progress regularly
-                if stats["processed"] % 10 == 0:
-                    logger.info(f"Progress: {stats['processed']} URLs processed")
-
-        except Exception as e:
-            logger.error(f"Scraping error: {e}")
-
-        finally:
-            # Print final statistics
-            logger.info("Scraping completed:")
-            logger.info(f"- URLs processed: {stats['processed']}")
-            logger.info(f"- Errors: {stats['errors']}")
-            logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
-
-            # Print filter statistics
-            for filter_ in filter_chain.filters:
-                logger.info(f"{filter_.name} stats:")
-                logger.info(f"- Passed: {filter_.stats.passed_urls}")
-                logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
-
-            # Print scorer statistics
-            logger.info("Scoring statistics:")
-            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
-            logger.info(
-                f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
-            )
-
-
-if __name__ == "__main__":
-    import asyncio
-
-    # Run basic example
-    print("Running basic scraper example...")
-    asyncio.run(basic_scraper_example())
-
-    # Run advanced example
-    print("\nRunning advanced scraper example...")
-    asyncio.run(advanced_scraper_example())