Refactor: Moved deep_crawl_strategy inside crawler run config

This commit is contained in:
Aravind Karnam
2025-01-30 16:18:15 +05:30
parent 858c18df39
commit ca3f0126d3
9 changed files with 79 additions and 57 deletions

View File

@@ -1,18 +1,25 @@
# basic_scraper_example.py
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.traversal import (
BFSTraversalStrategy,
from crawl4ai.deep_crawl import (
BFSDeepCrawlStrategy,
FilterChain,
URLPatternFilter,
ContentTypeFilter,
DomainFilter,
KeywordRelevanceScorer,
PathDepthScorer,
FreshnessScorer,
CompositeScorer,
)
from crawl4ai.async_webcrawler import AsyncWebCrawler
import re
import time
import logging
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
async def basic_scraper_example():
"""
Basic example: Scrape a blog site for articles
@@ -31,7 +38,7 @@ async def basic_scraper_example():
)
# Initialize the strategy with basic configuration
bfs_strategy = BFSTraversalStrategy(
bfs_strategy = BFSDeepCrawlStrategy(
max_depth=2, # Only go 2 levels deep
filter_chain=filter_chain,
url_scorer=None, # Use default scoring
@@ -44,8 +51,8 @@ async def basic_scraper_example():
) as crawler:
# Start scraping
try:
results = await crawler.adeep_crawl(
"https://crawl4ai.com/mkdocs", strategy=bfs_strategy
results = await crawler.arun(
"https://crawl4ai.com/mkdocs", CrawlerRunConfig(deep_crawl_strategy=bfs_strategy)
)
# Process results
print(f"Crawled {len(results)} pages:")
@@ -55,23 +62,6 @@ async def basic_scraper_example():
except Exception as e:
print(f"Error during scraping: {e}")
# advanced_scraper_example.py
import logging
from crawl4ai.traversal import (
BFSTraversalStrategy,
FilterChain,
URLPatternFilter,
ContentTypeFilter,
DomainFilter,
KeywordRelevanceScorer,
PathDepthScorer,
FreshnessScorer,
CompositeScorer,
)
async def advanced_scraper_example():
"""
Advanced example: Intelligent news site scraping
@@ -121,7 +111,7 @@ async def advanced_scraper_example():
)
# Initialize strategy with advanced configuration
bfs_strategy = BFSTraversalStrategy(
bfs_strategy = BFSDeepCrawlStrategy(
max_depth=2, filter_chain=filter_chain, url_scorer=scorer
)
@@ -136,13 +126,10 @@ async def advanced_scraper_example():
try:
# Use streaming mode
results = []
result_generator = await crawler.adeep_crawl(
result_generator = await crawler.arun(
"https://techcrunch.com",
strategy=bfs_strategy,
crawler_run_config=CrawlerRunConfig(
scraping_strategy=LXMLWebScrapingStrategy()
),
stream=True,
config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy,
stream=True)
)
async for result in result_generator:
stats["processed"] += 1