#1510 : Add DFS deep crawler demonstration script and enhance DFS strategy with seen URL tracking

2025-11-12 17:44:43 +08:00
parent d56b0eb9a9
commit 1bd3de6a47
2 changed files with 120 additions and 0 deletions
--- a/crawl4ai/deep_crawling/dfs_strategy.py
+++ b/crawl4ai/deep_crawling/dfs_strategy.py
@@ -4,6 +4,7 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
 from ..models import CrawlResult
 from .bfs_strategy import BFSDeepCrawlStrategy  # noqa
 from ..types import AsyncWebCrawler, CrawlerRunConfig
+from ..utils import normalize_url_for_deep_crawl

 class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
    """
@@ -12,6 +13,14 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
    Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
    Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self._dfs_seen: Set[str] = set()  # URLs already accepted by link_discovery; replaced by _reset_seen
+
+    def _reset_seen(self, start_url: str) -> None:
+        self._dfs_seen = {start_url}  # fresh set per run; NOTE(review): start_url is stored un-normalized while link_discovery checks normalized URLs - confirm
+
    async def _arun_batch(
        self,
        start_url: str,
@@ -27,6 +36,7 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
        stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
        depths: Dict[str, int] = {start_url: 0}
        results: List[CrawlResult] = []
+        self._reset_seen(start_url)

        while stack and not self._cancel_event.is_set():
            url, parent, depth = stack.pop()
@@ -77,6 +87,7 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
        visited: Set[str] = set()
        stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
        depths: Dict[str, int] = {start_url: 0}
+        self._reset_seen(start_url)

        while stack and not self._cancel_event.is_set():
            url, parent, depth = stack.pop()
@@ -108,3 +119,73 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                    for new_url, new_parent in reversed(new_links):
                        new_depth = depths.get(new_url, depth + 1)
                        stack.append((new_url, new_parent, new_depth))
+
+    async def link_discovery(
+        self,
+        result: CrawlResult,
+        source_url: str,
+        current_depth: int,
+        _visited: Set[str],
+        next_level: List[Tuple[str, Optional[str]]],
+        depths: Dict[str, int],
+    ) -> None:
+        """
+        DFS-specific link discovery that avoids mutating the traversal
+        'visited' set, preventing premature skips.
+
+        _visited is left untouched; duplicates are tracked in self._dfs_seen instead.
+        Accepted links are appended to next_level as (url, source_url) and recorded in depths.
+        """
+        next_depth = current_depth + 1
+        if next_depth > self.max_depth:
+            return
+
+        remaining_capacity = self.max_pages - self._pages_crawled
+        if remaining_capacity <= 0:
+            self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery")
+            return
+
+        links = result.links.get("internal", [])
+        if self.include_external:
+            # Concatenate into a new list: "+=" would extend result.links["internal"] in place.
+            links = links + result.links.get("external", [])
+
+        seen = self._dfs_seen
+        valid_links: List[Tuple[str, float]] = []
+
+        for link in links:
+            raw_url = link.get("href")
+            if not raw_url:
+                continue
+
+            normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
+            if not normalized_url or normalized_url in seen:
+                continue
+
+            if not await self.can_process_url(raw_url, next_depth):
+                self.stats.urls_skipped += 1
+                continue
+
+            score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
+            if score < self.score_threshold:
+                self.logger.debug(
+                    f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
+                )
+                self.stats.urls_skipped += 1
+                continue
+
+            seen.add(normalized_url)
+            valid_links.append((normalized_url, score))
+
+        if len(valid_links) > remaining_capacity:
+            if self.url_scorer:
+                valid_links.sort(key=lambda x: x[1], reverse=True)
+            valid_links = valid_links[:remaining_capacity]
+            self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
+
+        for url, score in valid_links:
+            if score:
+                result.metadata = result.metadata or {}
+                result.metadata["score"] = score
+            next_level.append((url, source_url))
+            depths[url] = next_depth
--- a/docs/examples/dfs_crawl_demo.py
+++ b/docs/examples/dfs_crawl_demo.py
@@ -0,0 +1,39 @@
+"""
+Simple demonstration of the DFS deep crawler visiting multiple pages.
+
+Run with:  python docs/examples/dfs_crawl_demo.py
+"""
+import asyncio
+
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.cache_context import CacheMode
+from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+
+async def main() -> None:
+    """Run a streaming DFS deep crawl and print status, depth and URL for each result."""
+    # Bound the crawl: depth up to 3, at most 50 pages, internal links only.
+    dfs_strategy = DFSDeepCrawlStrategy(max_depth=3, max_pages=50, include_external=False)
+
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=dfs_strategy,
+        cache_mode=CacheMode.BYPASS,
+        markdown_generator=DefaultMarkdownGenerator(),
+        stream=True,
+    )
+
+    seed_url = "https://docs.python.org/3/"  # Plenty of internal links
+
+    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
+        # Each result is handled as soon as the async iterator yields it.
+        async for result in await crawler.arun(url=seed_url, config=config):
+            # Guard against metadata being None before calling .get().
+            depth = (result.metadata or {}).get("depth")
+            status = "SUCCESS" if result.success else "FAILED"
+            print(f"[{status}] depth={depth} url={result.url}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())  # start an event loop and run the demo only when executed as a script