#1510 : Add DFS deep crawler demonstration script and enhance DFS strategy with seen URL tracking

2025-11-12 17:44:43 +08:00
parent d56b0eb9a9
commit 1bd3de6a47
2 changed files with 120 additions and 0 deletions
--- a/crawl4ai/deep_crawling/dfs_strategy.py
+++ b/crawl4ai/deep_crawling/dfs_strategy.py
@@ -4,6 +4,7 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
 from ..models import CrawlResult
 from .bfs_strategy import BFSDeepCrawlStrategy  # noqa
 from ..types import AsyncWebCrawler, CrawlerRunConfig
 from ..utils import normalize_url_for_deep_crawl
 class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
    """
@@ -12,6 +13,14 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
    Inherits URL validation from BFSDeepCrawlStrategy.
    Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal,
    and link_discovery to de-duplicate URLs through a DFS-specific seen set.
    """
    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent strategy and create the DFS seen-set."""
        super().__init__(*args, **kwargs)
        # URLs already accepted by link_discovery; re-seeded by _reset_seen.
        seen_urls: Set[str] = set()
        self._dfs_seen = seen_urls
    def _reset_seen(self, start_url: str) -> None:
        """Discard previously seen URLs and seed the set with ``start_url``."""
        fresh_seen: Set[str] = set()
        fresh_seen.add(start_url)
        self._dfs_seen = fresh_seen
    async def _arun_batch(
        self,
        start_url: str,
@@ -27,6 +36,7 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
        stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
        depths: Dict[str, int] = {start_url: 0}
        results: List[CrawlResult] = []
        self._reset_seen(start_url)
        while stack and not self._cancel_event.is_set():
            url, parent, depth = stack.pop()
@@ -77,6 +87,7 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
        visited: Set[str] = set()
        stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
        depths: Dict[str, int] = {start_url: 0}
        self._reset_seen(start_url)
        while stack and not self._cancel_event.is_set():
            url, parent, depth = stack.pop()
@@ -108,3 +119,73 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                    for new_url, new_parent in reversed(new_links):
                        new_depth = depths.get(new_url, depth + 1)
                        stack.append((new_url, new_parent, new_depth))
    async def link_discovery(
        self,
        result: CrawlResult,
        source_url: str,
        current_depth: int,
        _visited: Set[str],
        next_level: List[Tuple[str, Optional[str]]],
        depths: Dict[str, int],
    ) -> None:
        """
        Collect the crawlable links of ``result`` for the DFS traversal.

        DFS-specific link discovery that avoids mutating the traversal
        'visited' set, preventing premature skips. De-duplication is done
        against ``self._dfs_seen`` instead.

        Args:
            result: Crawl result whose ``links`` mapping is scanned. The
                "internal" (and optionally "external") entries are read as
                lists of dicts carrying an "href" key.
            source_url: URL of the page that produced ``result``; used as the
                base for normalization and recorded as the parent of every
                discovered link.
            current_depth: Depth of ``source_url``; discovered links are
                assigned ``current_depth + 1``.
            _visited: Unused here; the traversal's visited set is deliberately
                left untouched.
            next_level: Output list; ``(url, source_url)`` pairs are appended.
            depths: Output mapping; every appended URL is recorded with its
                depth.
        """
        next_depth = current_depth + 1
        if next_depth > self.max_depth:
            return

        remaining_capacity = self.max_pages - self._pages_crawled
        if remaining_capacity <= 0:
            self.logger.info(
                f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
            )
            return

        # Copy before extending: `+=` on the list returned by .get() would
        # append the external links to result.links["internal"] itself and
        # corrupt the crawl result handed back to the caller.
        links = list(result.links.get("internal", []))
        if self.include_external:
            links.extend(result.links.get("external", []))

        seen = self._dfs_seen
        valid_links: List[Tuple[str, float]] = []
        for link in links:
            raw_url = link.get("href")
            if not raw_url:
                continue
            normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
            if not normalized_url or normalized_url in seen:
                continue
            if not await self.can_process_url(raw_url, next_depth):
                self.stats.urls_skipped += 1
                continue
            score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
            if score < self.score_threshold:
                self.logger.debug(
                    f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
                )
                self.stats.urls_skipped += 1
                continue
            # Marked as seen before the capacity cut below, so a link dropped
            # by that cut is not queued again if it is discovered later.
            seen.add(normalized_url)
            valid_links.append((normalized_url, score))

        if len(valid_links) > remaining_capacity:
            # Keep the best-scoring links when a scorer is configured;
            # otherwise keep the first ones in discovery order.
            if self.url_scorer:
                valid_links.sort(key=lambda x: x[1], reverse=True)
            valid_links = valid_links[:remaining_capacity]
            self.logger.info(
                f"Limiting to {remaining_capacity} URLs due to max_pages limit"
            )

        for url, score in valid_links:
            if score:
                # The score is written to the *source* result's metadata, so
                # the last non-zero score of this page's links wins.
                result.metadata = result.metadata or {}
                result.metadata["score"] = score
            next_level.append((url, source_url))
            depths[url] = next_depth
--- a/docs/examples/dfs_crawl_demo.py
+++ b/docs/examples/dfs_crawl_demo.py
@@ -0,0 +1,39 @@
 """
 Simple demonstration of the DFS deep crawler visiting multiple pages.
 Run with:  python docs/examples/dfs_crawl_demo.py
 """
 import asyncio
 from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
 from crawl4ai.async_webcrawler import AsyncWebCrawler
 from crawl4ai.cache_context import CacheMode
 from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 async def main() -> None:
    """Run a streaming DFS deep crawl from the Python docs and print one line per page."""
    dfs_strategy = DFSDeepCrawlStrategy(
        max_depth=3,
        max_pages=50,
        include_external=False,
    )
    config = CrawlerRunConfig(
        deep_crawl_strategy=dfs_strategy,
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(),
        stream=True,
    )
    seed_url = "https://docs.python.org/3/"  # Plenty of internal links
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        # arun() is awaited first and its return value is then iterated
        # asynchronously (presumably because stream=True is set above).
        async for result in await crawler.arun(url=seed_url, config=config):
            # metadata can be unset on a result; fall back to an empty dict
            # so the demo does not crash with AttributeError on None.
            depth = (result.metadata or {}).get("depth")
            status = "SUCCESS" if result.success else "FAILED"
            print(f"[{status}] depth={depth} url={result.url}")
 if __name__ == "__main__":
    # Start the demo crawl only when executed as a script, not on import.
    demo_coroutine = main()
    asyncio.run(demo_coroutine)