refactor(deep-crawling): reorganize deep crawling strategies and add new implementations

Split deep crawling code into separate strategy files for better organization and maintainability. Added new BFF (Best First) and DFS crawling strategies. Introduced base strategy class and common types. BREAKING CHANGE: Deep crawling implementation has been split into multiple files. Import paths for deep crawling strategies have changed.
2025-02-05 22:50:39 +08:00
parent c308a794e8
commit a9415aaaf6
10 changed files with 769 additions and 214 deletions
--- a/crawl4ai/deep_crawling/dfs_strategy.py
+++ b/crawl4ai/deep_crawling/dfs_strategy.py
@@ -0,0 +1,95 @@
+# dfs_strategy.py
+from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
+
+from ..models import CrawlResult
+from .bfs_strategy import BFSDeepCrawlStrategy  # Inherit common logic: can_process_url, link_discovery, etc.
+
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from ..async_configs import CrawlerRunConfig
+    from ..async_webcrawler import AsyncWebCrawler
+
+
+class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
+    """
+    Depth-First Search (DFS) deep crawling strategy.
+
+    Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
+    Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
+    """
+    async def _arun_batch(
+        self,
+        start_url: str,
+        crawler: "AsyncWebCrawler",
+        config: "CrawlerRunConfig",
+    ) -> List[CrawlResult]:
+        """
+        Batch (non-streaming) DFS mode.
+        Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
+        """
+        visited: Set[str] = set()
+        # Stack items: (url, parent_url, depth)
+        stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
+        depths: Dict[str, int] = {start_url: 0}
+        results: List[CrawlResult] = []
+
+        while stack and not self._cancel_event.is_set():
+            url, parent, depth = stack.pop()
+            if url in visited or depth > self.max_depth:
+                continue
+            visited.add(url)
+
+            # Clone config to disable recursive deep crawling.
+            batch_config = config.clone(deep_crawl_strategy=None, stream=False)
+            url_results = await crawler.arun_many(urls=[url], config=batch_config)
+            for result in url_results:
+                result.metadata = result.metadata or {}
+                result.metadata["depth"] = depth
+                result.metadata["parent_url"] = parent
+                if self.url_scorer:
+                    result.metadata["score"] = self.url_scorer.score(url)
+                results.append(result)
+
+                new_links: List[Tuple[str, Optional[str]]] = []
+                await self.link_discovery(result, url, depth, visited, new_links, depths)
+                # Push new links in reverse order so the first discovered is processed next.
+                for new_url, new_parent in reversed(new_links):
+                    new_depth = depths.get(new_url, depth + 1)
+                    stack.append((new_url, new_parent, new_depth))
+        return results
+
+    async def _arun_stream(
+        self,
+        start_url: str,
+        crawler: "AsyncWebCrawler",
+        config: "CrawlerRunConfig",
+    ) -> AsyncGenerator[CrawlResult, None]:
+        """
+        Streaming DFS mode.
+        Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
+        """
+        visited: Set[str] = set()
+        stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
+        depths: Dict[str, int] = {start_url: 0}
+
+        while stack and not self._cancel_event.is_set():
+            url, parent, depth = stack.pop()
+            if url in visited or depth > self.max_depth:
+                continue
+            visited.add(url)
+
+            stream_config = config.clone(deep_crawl_strategy=None, stream=True)
+            stream_gen = await crawler.arun_many(urls=[url], config=stream_config)
+            async for result in stream_gen:
+                result.metadata = result.metadata or {}
+                result.metadata["depth"] = depth
+                result.metadata["parent_url"] = parent
+                if self.url_scorer:
+                    result.metadata["score"] = self.url_scorer.score(url)
+                yield result
+
+                new_links: List[Tuple[str, Optional[str]]] = []
+                await self.link_discovery(result, url, depth, visited, new_links, depths)
+                for new_url, new_parent in reversed(new_links):
+                    new_depth = depths.get(new_url, depth + 1)
+                    stack.append((new_url, new_parent, new_depth))