Enhance DFSDeepCrawlStrategy documentation for clarity and detail

#1510 : Add DFS deep crawler demonstration script and enhance DFS strategy with seen URL tracking
2025-11-13 16:39:08 +08:00 · 2025-11-12 17:44:43 +08:00
4 changed files with 169 additions and 200 deletions
--- a/crawl4ai/async_url_seeder.py
+++ b/crawl4ai/async_url_seeder.py
@@ -845,15 +845,6 @@ class AsyncUrlSeeder:
            return

        data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
-        base_url = str(r.url)
-
-        def _normalize_loc(raw: Optional[str]) -> Optional[str]:
-            if not raw:
-                return None
-            normalized = urljoin(base_url, raw.strip())
-            if not normalized:
-                return None
-            return normalized

        # Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
        is_sitemap_index = False
@@ -866,42 +857,25 @@ class AsyncUrlSeeder:
                # Use XML parser for sitemaps, not HTML parser
                parser = etree.XMLParser(recover=True)
                root = etree.fromstring(data, parser=parser)
-                # Namespace-agnostic lookups using local-name() so we honor custom or missing namespaces
-                sitemap_loc_nodes = root.xpath("//*[local-name()='sitemap']/*[local-name()='loc']")
-                url_loc_nodes = root.xpath("//*[local-name()='url']/*[local-name()='loc']")

-                self._log(
-                    "debug",
-                    "Parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
-                    params={
-                        "url": url,
-                        "sitemap_count": len(sitemap_loc_nodes),
-                        "url_count": len(url_loc_nodes),
-                    },
-                    tag="URL_SEED",
-                )
+                # Define namespace for sitemap
+                ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

                # Check for sitemap index entries
-                if sitemap_loc_nodes:
+                sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns)
+                if sitemap_locs:
                    is_sitemap_index = True
-                    for sitemap_elem in sitemap_loc_nodes:
-                        loc = _normalize_loc(sitemap_elem.text)
+                    for sitemap_elem in sitemap_locs:
+                        loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
                        if loc:
                            sub_sitemaps.append(loc)

                # If not a sitemap index, get regular URLs
                if not is_sitemap_index:
-                    for loc_elem in url_loc_nodes:
-                        loc = _normalize_loc(loc_elem.text)
+                    for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
+                        loc = loc_elem.text.strip() if loc_elem.text else ""
                        if loc:
                            regular_urls.append(loc)
-                    if not regular_urls:
-                        self._log(
-                            "warning",
-                            "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
-                            params={"url": url},
-                            tag="URL_SEED",
-                        )
            except Exception as e:
                self._log("error", "LXML parsing error for sitemap {url}: {error}",
                          params={"url": url, "error": str(e)}, tag="URL_SEED")
@@ -918,39 +892,19 @@ class AsyncUrlSeeder:

                # Check for sitemap index entries
                sitemaps = root.findall('.//sitemap')
-                url_entries = root.findall('.//url')
-                self._log(
-                    "debug",
-                    "ElementTree parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
-                    params={
-                        "url": url,
-                        "sitemap_count": len(sitemaps),
-                        "url_count": len(url_entries),
-                    },
-                    tag="URL_SEED",
-                )
                if sitemaps:
                    is_sitemap_index = True
                    for sitemap in sitemaps:
                        loc_elem = sitemap.find('loc')
-                        loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
-                        if loc:
-                            sub_sitemaps.append(loc)
+                        if loc_elem is not None and loc_elem.text:
+                            sub_sitemaps.append(loc_elem.text.strip())

                # If not a sitemap index, get regular URLs
                if not is_sitemap_index:
-                    for url_elem in url_entries:
+                    for url_elem in root.findall('.//url'):
                        loc_elem = url_elem.find('loc')
-                        loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
-                        if loc:
-                            regular_urls.append(loc)
-                    if not regular_urls:
-                        self._log(
-                            "warning",
-                            "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
-                            params={"url": url},
-                            tag="URL_SEED",
-                        )
+                        if loc_elem is not None and loc_elem.text:
+                            regular_urls.append(loc_elem.text.strip())
            except Exception as e:
                self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
                          params={"url": url, "error": str(e)}, tag="URL_SEED")
--- a/crawl4ai/deep_crawling/dfs_strategy.py
+++ b/crawl4ai/deep_crawling/dfs_strategy.py
@@ -4,14 +4,26 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
 from ..models import CrawlResult
 from .bfs_strategy import BFSDeepCrawlStrategy  # noqa
 from ..types import AsyncWebCrawler, CrawlerRunConfig
+from ..utils import normalize_url_for_deep_crawl

 class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
    """
-    Depth-First Search (DFS) deep crawling strategy.
+    Depth-first deep crawling with familiar BFS rules.

-    Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
-    Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
+    We reuse the same filters, scoring, and page limits from :class:`BFSDeepCrawlStrategy`,
+    but walk the graph with a stack so we fully explore one branch before hopping to the
+    next. DFS also keeps its own ``_dfs_seen`` set so we can drop duplicate links at
+    discovery time without accidentally marking them as “already crawled”.
    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._dfs_seen: Set[str] = set()
+
+    def _reset_seen(self, start_url: str) -> None:
+        """Start each crawl with a clean dedupe set seeded with the root URL."""
+        self._dfs_seen = {start_url}
+
    async def _arun_batch(
        self,
        start_url: str,
@@ -19,14 +31,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
        config: CrawlerRunConfig,
    ) -> List[CrawlResult]:
        """
-        Batch (non-streaming) DFS mode.
-        Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
+        Crawl depth-first, one branch at a time, and return all results at the end.
+
+        We keep a stack of ``(url, parent, depth)`` tuples, pop one at a time, and
+        hand it to ``crawler.arun_many`` with deep crawling disabled so we remain
+        in control of traversal. Every successful page bumps ``_pages_crawled`` and
+        seeds new stack items discovered via :meth:`link_discovery`.
        """
        visited: Set[str] = set()
        # Stack items: (url, parent_url, depth)
        stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
        depths: Dict[str, int] = {start_url: 0}
        results: List[CrawlResult] = []
+        self._reset_seen(start_url)

        while stack and not self._cancel_event.is_set():
            url, parent, depth = stack.pop()
@@ -71,12 +88,16 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
        config: CrawlerRunConfig,
    ) -> AsyncGenerator[CrawlResult, None]:
        """
-        Streaming DFS mode.
-        Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
+        Same traversal as :meth:`_arun_batch`, but yield pages immediately.
+
+        Each popped URL is crawled, its metadata annotated, then the result gets
+        yielded before we even look at the next stack entry. Successful crawls
+        still feed :meth:`link_discovery`, keeping DFS order intact.
        """
        visited: Set[str] = set()
        stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
        depths: Dict[str, int] = {start_url: 0}
+        self._reset_seen(start_url)

        while stack and not self._cancel_event.is_set():
            url, parent, depth = stack.pop()
@@ -108,3 +129,92 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                    for new_url, new_parent in reversed(new_links):
                        new_depth = depths.get(new_url, depth + 1)
                        stack.append((new_url, new_parent, new_depth))
+
+    async def link_discovery(
+        self,
+        result: CrawlResult,
+        source_url: str,
+        current_depth: int,
+        _visited: Set[str],
+        next_level: List[Tuple[str, Optional[str]]],
+        depths: Dict[str, int],
+    ) -> None:
+        """
+        Find the next URLs we should push onto the DFS stack.
+
+        Parameters
+        ----------
+        result : CrawlResult
+            Output of the page we just crawled; its ``links`` block is our raw material.
+        source_url : str
+            URL of the parent page; stored so callers can track ancestry.
+        current_depth : int
+            Depth of the parent; children naturally sit at ``current_depth + 1``.
+        _visited : Set[str]
+            Present to match the BFS signature, but we rely on ``_dfs_seen`` instead.
+        next_level : list of tuples
+            The stack buffer supplied by the caller; we append new ``(url, parent)`` items here.
+        depths : dict
+            Shared depth map so future metadata tagging knows how deep each URL lives.
+
+        Notes
+        -----
+        - ``_dfs_seen`` keeps us from pushing duplicates without touching the traversal guard.
+        - Validation, scoring, and capacity trimming mirror the BFS version so behaviour stays consistent.
+        """
+        next_depth = current_depth + 1
+        if next_depth > self.max_depth:
+            return
+
+        remaining_capacity = self.max_pages - self._pages_crawled
+        if remaining_capacity <= 0:
+            self.logger.info(
+                f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
+            )
+            return
+
+        links = result.links.get("internal", [])
+        if self.include_external:
+            links += result.links.get("external", [])
+
+        seen = self._dfs_seen
+        valid_links: List[Tuple[str, float]] = []
+
+        for link in links:
+            raw_url = link.get("href")
+            if not raw_url:
+                continue
+
+            normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
+            if not normalized_url or normalized_url in seen:
+                continue
+
+            if not await self.can_process_url(raw_url, next_depth):
+                self.stats.urls_skipped += 1
+                continue
+
+            score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
+            if score < self.score_threshold:
+                self.logger.debug(
+                    f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
+                )
+                self.stats.urls_skipped += 1
+                continue
+
+            seen.add(normalized_url)
+            valid_links.append((normalized_url, score))
+
+        if len(valid_links) > remaining_capacity:
+            if self.url_scorer:
+                valid_links.sort(key=lambda x: x[1], reverse=True)
+            valid_links = valid_links[:remaining_capacity]
+            self.logger.info(
+                f"Limiting to {remaining_capacity} URLs due to max_pages limit"
+            )
+
+        for url, score in valid_links:
+            if score:
+                result.metadata = result.metadata or {}
+                result.metadata["score"] = score
+            next_level.append((url, source_url))
+            depths[url] = next_depth
--- a/docs/examples/dfs_crawl_demo.py
+++ b/docs/examples/dfs_crawl_demo.py
@@ -0,0 +1,39 @@
+"""
+Simple demonstration of the DFS deep crawler visiting multiple pages.
+
+Run with:  python docs/examples/dfs_crawl_demo.py
+"""
+import asyncio
+
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.cache_context import CacheMode
+from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+
+async def main() -> None:
+    dfs_strategy = DFSDeepCrawlStrategy(
+        max_depth=3,
+        max_pages=50,
+        include_external=False,
+    )
+
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=dfs_strategy,
+        cache_mode=CacheMode.BYPASS,
+        markdown_generator=DefaultMarkdownGenerator(),
+        stream=True,
+    )
+
+    seed_url = "https://docs.python.org/3/"  # Plenty of internal links
+
+    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
+        async for result in await crawler.arun(url=seed_url, config=config):
+            depth = result.metadata.get("depth")
+            status = "SUCCESS" if result.success else "FAILED"
+            print(f"[{status}] depth={depth} url={result.url}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/tests/unit/test_sitemap_namespace_parsing.py
+++ b/tests/unit/test_sitemap_namespace_parsing.py
@@ -1,134 +0,0 @@
-import sys
-from types import SimpleNamespace
-
-import pytest
-
-# Provide a lightweight stub for rank_bm25 before importing the seeder to avoid
-# optional dependency issues (e.g., incompatible wheels in CI).
-class _FakeBM25:
-    def __init__(self, corpus):
-        self._scores = [1.0] * len(corpus)
-
-    def get_scores(self, tokens):
-        return self._scores
-
-
-sys.modules.setdefault("rank_bm25", SimpleNamespace(BM25Okapi=_FakeBM25))
-
-from crawl4ai.async_url_seeder import AsyncUrlSeeder
-
-
-class DummyResponse:
-    def __init__(self, request_url: str, text: str):
-        self.status_code = 200
-        self._content = text.encode("utf-8")
-        self.url = request_url
-
-    def raise_for_status(self):
-        return None
-
-    @property
-    def content(self):
-        return self._content
-
-    @property
-    def text(self):
-        return self._content.decode("utf-8")
-
-
-class DummyAsyncClient:
-    def __init__(self, response_map):
-        self._responses = response_map
-
-    async def get(self, url, **kwargs):
-        payload = self._responses[url]
-        if callable(payload):
-            payload = payload()
-        return DummyResponse(url, payload)
-
-
-@pytest.mark.asyncio
-async def test_iter_sitemap_handles_namespace_less_sitemaps():
-    xml = """<?xml version="1.0"?>
-    <urlset>
-        <url><loc>https://example.com/a</loc></url>
-        <url><loc>https://example.com/b</loc></url>
-    </urlset>
-    """
-    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/sitemap.xml": xml}))
-
-    urls = []
-    async for u in seeder._iter_sitemap("https://example.com/sitemap.xml"):
-        urls.append(u)
-
-    assert urls == ["https://example.com/a", "https://example.com/b"]
-
-
-@pytest.mark.asyncio
-async def test_iter_sitemap_handles_custom_namespace():
-    xml = """<?xml version="1.0"?>
-    <urlset xmlns="https://custom.namespace/schema">
-        <url><loc>https://example.com/ns</loc></url>
-    </urlset>
-    """
-    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/ns-sitemap.xml": xml}))
-
-    urls = []
-    async for u in seeder._iter_sitemap("https://example.com/ns-sitemap.xml"):
-        urls.append(u)
-
-    assert urls == ["https://example.com/ns"]
-
-
-@pytest.mark.asyncio
-async def test_iter_sitemap_handles_namespace_index_and_children():
-    index_xml = """<?xml version="1.0"?>
-    <sitemapindex xmlns="http://another.example/ns">
-        <sitemap>
-            <loc>https://example.com/child-1.xml</loc>
-        </sitemap>
-        <sitemap>
-            <loc>https://example.com/child-2.xml</loc>
-        </sitemap>
-    </sitemapindex>
-    """
-    child_xml = """<?xml version="1.0"?>
-    <urlset xmlns="http://irrelevant">
-        <url><loc>https://example.com/page-{n}</loc></url>
-    </urlset>
-    """
-    responses = {
-        "https://example.com/index.xml": index_xml,
-        "https://example.com/child-1.xml": child_xml.format(n=1),
-        "https://example.com/child-2.xml": child_xml.format(n=2),
-    }
-    seeder = AsyncUrlSeeder(client=DummyAsyncClient(responses))
-
-    urls = []
-    async for u in seeder._iter_sitemap("https://example.com/index.xml"):
-        urls.append(u)
-
-    assert sorted(urls) == [
-        "https://example.com/page-1",
-        "https://example.com/page-2",
-    ]
-
-
-@pytest.mark.asyncio
-async def test_iter_sitemap_normalizes_relative_locations():
-    xml = """<?xml version="1.0"?>
-    <urlset>
-        <url><loc>/relative-path</loc></url>
-        <url><loc>https://example.com/absolute</loc></url>
-    </urlset>
-    """
-    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/sitemap.xml": xml}))
-
-    urls = []
-    async for u in seeder._iter_sitemap("https://example.com/sitemap.xml"):
-        urls.append(u)
-
-    assert urls == [
-        "https://example.com/relative-path",
-        "https://example.com/absolute",
-    ]
Author	SHA1	Message	Date
AHMET YILMAZ	ceade853c3	Enhance DFSDeepCrawlStrategy documentation for clarity and detail	2025-11-13 16:39:08 +08:00
AHMET YILMAZ	1bd3de6a47	#1510 : Add DFS deep crawler demonstration script and enhance DFS strategy with seen URL tracking	2025-11-12 17:44:43 +08:00