Enhance DFSDeepCrawlStrategy documentation for clarity and detail

#1510 : Add DFS deep crawler demonstration script and enhance DFS strategy with seen URL tracking
2025-11-13 16:39:08 +08:00 · 2025-11-12 17:44:43 +08:00
3 changed files with 215 additions and 272 deletions
--- a/crawl4ai/deep_crawling/dfs_strategy.py
+++ b/crawl4ai/deep_crawling/dfs_strategy.py
@@ -4,14 +4,26 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
 from ..models import CrawlResult
 from .bfs_strategy import BFSDeepCrawlStrategy  # noqa
 from ..types import AsyncWebCrawler, CrawlerRunConfig
+from ..utils import normalize_url_for_deep_crawl

 class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
    """
-    Depth-First Search (DFS) deep crawling strategy.
+    Depth-first deep crawling with familiar BFS rules.

-    Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
-    Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
+    We reuse the same filters, scoring, and page limits from :class:`BFSDeepCrawlStrategy`,
+    but walk the graph with a stack so we fully explore one branch before hopping to the
+    next. DFS also keeps its own ``_dfs_seen`` set so we can drop duplicate links at
+    discovery time without accidentally marking them as “already crawled”.
    """
+
+    def __init__(self, *args, **kwargs):
+        """Forward all arguments to the parent strategy and create the DFS dedupe set."""
+        super().__init__(*args, **kwargs)
+        # URLs that link discovery has already queued; rebuilt by ``_reset_seen`` on every crawl.
+        self._dfs_seen: Set[str] = set()
+
+    def _reset_seen(self, start_url: str) -> None:
+        """Start each crawl with a clean dedupe set seeded with the root URL."""
+        # NOTE(review): the seed is the raw ``start_url``, while ``link_discovery``
+        # tests *normalized* URLs against this set — confirm the root cannot be
+        # re-queued under its normalized spelling.
+        self._dfs_seen = {start_url}
+
    async def _arun_batch(
        self,
        start_url: str,
@@ -19,14 +31,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
        config: CrawlerRunConfig,
    ) -> List[CrawlResult]:
        """
-        Batch (non-streaming) DFS mode.
-        Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
+        Crawl depth-first, one branch at a time, and return all results at the end.
+
+        We keep a stack of ``(url, parent, depth)`` tuples, pop one at a time, and
+        hand it to ``crawler.arun_many`` with deep crawling disabled so we remain
+        in control of traversal. Every successful page bumps ``_pages_crawled`` and
+        seeds new stack items discovered via :meth:`link_discovery`.
        """
        visited: Set[str] = set()
        # Stack items: (url, parent_url, depth)
        stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
        depths: Dict[str, int] = {start_url: 0}
        results: List[CrawlResult] = []
+        self._reset_seen(start_url)

        while stack and not self._cancel_event.is_set():
            url, parent, depth = stack.pop()
@@ -71,12 +88,16 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
        config: CrawlerRunConfig,
    ) -> AsyncGenerator[CrawlResult, None]:
        """
-        Streaming DFS mode.
-        Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
+        Same traversal as :meth:`_arun_batch`, but yield pages immediately.
+
+        Each popped URL is crawled, its metadata annotated, then the result gets
+        yielded before we even look at the next stack entry. Successful crawls
+        still feed :meth:`link_discovery`, keeping DFS order intact.
        """
        visited: Set[str] = set()
        stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
        depths: Dict[str, int] = {start_url: 0}
+        self._reset_seen(start_url)

        while stack and not self._cancel_event.is_set():
            url, parent, depth = stack.pop()
@@ -108,3 +129,92 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                    for new_url, new_parent in reversed(new_links):
                        new_depth = depths.get(new_url, depth + 1)
                        stack.append((new_url, new_parent, new_depth))
+
+    async def link_discovery(
+        self,
+        result: CrawlResult,
+        source_url: str,
+        current_depth: int,
+        _visited: Set[str],
+        next_level: List[Tuple[str, Optional[str]]],
+        depths: Dict[str, int],
+    ) -> None:
+        """
+        Find the next URLs we should push onto the DFS stack.
+
+        Parameters
+        ----------
+        result : CrawlResult
+            Output of the page we just crawled; its ``links`` block is our raw material.
+        source_url : str
+            URL of the parent page; stored so callers can track ancestry.
+        current_depth : int
+            Depth of the parent; children naturally sit at ``current_depth + 1``.
+        _visited : Set[str]
+            Present to match the BFS signature, but we rely on ``_dfs_seen`` instead.
+        next_level : list of tuples
+            The stack buffer supplied by the caller; we append new ``(url, parent)`` items here.
+        depths : dict
+            Shared depth map so future metadata tagging knows how deep each URL lives.
+
+        Notes
+        -----
+        - ``_dfs_seen`` keeps us from pushing duplicates without touching the traversal guard.
+        - Validation, scoring, and capacity trimming mirror the BFS version so behaviour stays consistent.
+        """
+        next_depth = current_depth + 1
+        if next_depth > self.max_depth:
+            return
+
+        remaining_capacity = self.max_pages - self._pages_crawled
+        if remaining_capacity <= 0:
+            self.logger.info(
+                f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
+            )
+            return
+
+        links = list(result.links.get("internal", []))  # copy: ``+=`` below must not mutate result.links
+        if self.include_external:
+            links += result.links.get("external", [])
+
+        seen = self._dfs_seen
+        valid_links: List[Tuple[str, float]] = []
+
+        for link in links:
+            raw_url = link.get("href")
+            if not raw_url:
+                continue
+
+            normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
+            if not normalized_url or normalized_url in seen:
+                continue
+
+            if not await self.can_process_url(raw_url, next_depth):  # validated on the raw href
+                self.stats.urls_skipped += 1
+                continue
+
+            score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
+            if score < self.score_threshold:
+                self.logger.debug(
+                    f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
+                )
+                self.stats.urls_skipped += 1
+                continue
+
+            seen.add(normalized_url)
+            valid_links.append((normalized_url, score))
+
+        if len(valid_links) > remaining_capacity:
+            if self.url_scorer:
+                valid_links.sort(key=lambda x: x[1], reverse=True)
+            valid_links = valid_links[:remaining_capacity]
+            self.logger.info(
+                f"Limiting to {remaining_capacity} URLs due to max_pages limit"
+            )
+
+        for url, score in valid_links:
+            if score:
+                result.metadata = result.metadata or {}
+                result.metadata["score"] = score
+            next_level.append((url, source_url))
+            depths[url] = next_depth
--- a/docs/examples/dfs_crawl_demo.py
+++ b/docs/examples/dfs_crawl_demo.py
@@ -0,0 +1,39 @@
+"""
+Simple demonstration of the DFS deep crawler visiting multiple pages.
+
+Run with:  python docs/examples/dfs_crawl_demo.py
+"""
+import asyncio
+
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.cache_context import CacheMode
+from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+
+async def main() -> None:
+    """Run a depth-first deep crawl from the Python docs and print one status line per result."""
+    dfs_strategy = DFSDeepCrawlStrategy(
+        max_depth=3,
+        max_pages=50,
+        include_external=False,
+    )
+
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=dfs_strategy,
+        cache_mode=CacheMode.BYPASS,
+        markdown_generator=DefaultMarkdownGenerator(),
+        stream=True,
+    )
+
+    seed_url = "https://docs.python.org/3/"  # Plenty of internal links
+
+    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
+        # ``arun`` is awaited first and its return value is then iterated asynchronously;
+        # presumably ``stream=True`` is what makes it yield results one by one — confirm.
+        async for result in await crawler.arun(url=seed_url, config=config):
+            depth = result.metadata.get("depth")
+            status = "SUCCESS" if result.success else "FAILED"
+            print(f"[{status}] depth={depth} url={result.url}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/md_v2/advanced/proxy-security.md
+++ b/docs/md_v2/advanced/proxy-security.md
@@ -1,304 +1,98 @@
-# Proxy & Security
-
-This guide covers proxy configuration and security features in Crawl4AI, including SSL certificate analysis and proxy rotation strategies.
-
-## Understanding Proxy Configuration
-
-Crawl4AI recommends configuring proxies per request through `CrawlerRunConfig.proxy_config`. This gives you precise control, enables rotation strategies, and keeps examples simple enough to copy, paste, and run.
+# Proxy 

 ## Basic Proxy Setup

-Configure proxies that apply to each crawl operation:
+Simple proxy configuration with `BrowserConfig`:

 ```python
-import asyncio
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, ProxyConfig
+from crawl4ai.async_configs import BrowserConfig

-run_config = CrawlerRunConfig(proxy_config=ProxyConfig(server="http://proxy.example.com:8080"))
-# run_config = CrawlerRunConfig(proxy_config={"server": "http://proxy.example.com:8080"})
-# run_config = CrawlerRunConfig(proxy_config="http://proxy.example.com:8080")
+# Using HTTP proxy
+browser_config = BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080"})
+async with AsyncWebCrawler(config=browser_config) as crawler:
+    result = await crawler.arun(url="https://example.com")

-
-async def main():
-    browser_config = BrowserConfig()
-    async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun(url="https://example.com", config=run_config)
-        print(f"Success: {result.success} -> {result.url}")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
+# Using SOCKS proxy
+browser_config = BrowserConfig(proxy_config={"server": "socks5://proxy.example.com:1080"})
+async with AsyncWebCrawler(config=browser_config) as crawler:
+    result = await crawler.arun(url="https://example.com")
 ```

-!!! note "Why request-level?"
-    `CrawlerRunConfig.proxy_config` keeps each request self-contained, so swapping proxies or rotation strategies is just a matter of building a new run configuration.
+## Authenticated Proxy

-## Supported Proxy Formats
-
-The `ProxyConfig.from_string()` method supports multiple formats:
+Use an authenticated proxy with `BrowserConfig`:

 ```python
-from crawl4ai import ProxyConfig
+from crawl4ai.async_configs import BrowserConfig

-# HTTP proxy with authentication
-proxy1 = ProxyConfig.from_string("http://user:pass@192.168.1.1:8080")
-
-# HTTPS proxy
-proxy2 = ProxyConfig.from_string("https://proxy.example.com:8080")
-
-# SOCKS5 proxy
-proxy3 = ProxyConfig.from_string("socks5://proxy.example.com:1080")
-
-# Simple IP:port format
-proxy4 = ProxyConfig.from_string("192.168.1.1:8080")
-
-# IP:port:user:pass format
-proxy5 = ProxyConfig.from_string("192.168.1.1:8080:user:pass")
+browser_config = BrowserConfig(proxy_config={
+    "server": "http://[host]:[port]",
+    "username": "[username]",
+    "password": "[password]",
+})
+async with AsyncWebCrawler(config=browser_config) as crawler:
+    result = await crawler.arun(url="https://example.com")
 ```

-## Authenticated Proxies

-For proxies requiring authentication:
+## Rotating Proxies
+
+Example that loads proxies from the `PROXIES` environment variable and rotates through them with `RoundRobinProxyStrategy`:

 ```python
-import asyncio
-from crawl4ai import AsyncWebCrawler,BrowserConfig, CrawlerRunConfig, ProxyConfig
-
-run_config = CrawlerRunConfig(
-    proxy_config=ProxyConfig(
-        server="http://proxy.example.com:8080",
-        username="your_username",
-        password="your_password",
-    )
-)
-# Or dictionary style:
-# run_config = CrawlerRunConfig(proxy_config={
-#     "server": "http://proxy.example.com:8080",
-#     "username": "your_username",
-#     "password": "your_password",
-# })
-
-
-async def main():
-    browser_config = BrowserConfig()
-    async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun(url="https://example.com", config=run_config)
-        print(f"Success: {result.success} -> {result.url}")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
-```
-
-## Environment Variable Configuration
-
-Load proxies from environment variables for easy configuration:
-
-```python
-import os
-from crawl4ai import ProxyConfig, CrawlerRunConfig
-
-# Set environment variable
-os.environ["PROXIES"] = "ip1:port1:user1:pass1,ip2:port2:user2:pass2,ip3:port3"
-
-# Load all proxies
-proxies = ProxyConfig.from_env()
-print(f"Loaded {len(proxies)} proxies")
-
-# Use first proxy
-if proxies:
-    run_config = CrawlerRunConfig(proxy_config=proxies[0])
-```
-
-## Rotating Proxies
-
-Crawl4AI supports automatic proxy rotation to distribute requests across multiple proxy servers. Rotation is applied per request using a rotation strategy on `CrawlerRunConfig`.
-
-### Proxy Rotation (recommended)
-```python
-import asyncio
 import re
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, ProxyConfig
-from crawl4ai.proxy_strategy import RoundRobinProxyStrategy
-
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    CacheMode,
+    RoundRobinProxyStrategy,
+)
+import asyncio
+from crawl4ai import ProxyConfig
 async def main():
-    # Load proxies from environment
+    # Load proxies and create rotation strategy
    proxies = ProxyConfig.from_env()
+    # e.g. export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
    if not proxies:
-        print("No proxies found! Set PROXIES environment variable.")
+        print("No proxies found in environment. Set PROXIES env variable!")
        return

-    # Create rotation strategy
    proxy_strategy = RoundRobinProxyStrategy(proxies)

-    # Configure per-request with proxy rotation
+    # Create configs
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
-        proxy_rotation_strategy=proxy_strategy,
+        proxy_rotation_strategy=proxy_strategy
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

-        print(f"🚀 Testing {len(proxies)} proxies with rotation...")
-        results = await crawler.arun_many(urls=urls, config=run_config)
+        print("\n📈 Initializing crawler with proxy rotation...")
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            print("\n🚀 Starting batch crawl with proxy rotation...")
+            results = await crawler.arun_many(
+                urls=urls,
+                config=run_config
+            )
+            for result in results:
+                if result.success:
+                    ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+                    current_proxy = run_config.proxy_config if run_config.proxy_config else None

-        for i, result in enumerate(results):
-            if result.success:
-                # Extract IP from response
-                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
-                if ip_match:
-                    detected_ip = ip_match.group(0)
-                    proxy_index = i % len(proxies)
-                    expected_ip = proxies[proxy_index].ip
+                    if current_proxy and ip_match:
+                        print(f"URL {result.url}")
+                        print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
+                        verified = ip_match.group(0) == current_proxy.ip
+                        if verified:
+                            print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
+                        else:
+                            print("❌ Proxy failed or IP mismatch!")
+                    print("---")

-                    print(f"✅ Request {i+1}: Proxy {proxy_index+1} -> IP {detected_ip}")
-                    if detected_ip == expected_ip:
-                        print("   🎯 IP matches proxy configuration")
-                    else:
-                        print(f"   ⚠️  IP mismatch (expected {expected_ip})")
-                else:
-                    print(f"❌ Request {i+1}: Could not extract IP from response")
-            else:
-                print(f"❌ Request {i+1}: Failed - {result.error_message}")
+asyncio.run(main())

-if __name__ == "__main__":
-    asyncio.run(main())
 ```

-## SSL Certificate Analysis
-
-Combine proxy usage with SSL certificate inspection for enhanced security analysis. SSL certificate fetching is configured per request via `CrawlerRunConfig`.
-
-### Per-Request SSL Certificate Analysis
-```python
-import asyncio
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-
-run_config = CrawlerRunConfig(
-    proxy_config={
-        "server": "http://proxy.example.com:8080",
-        "username": "user",
-        "password": "pass",
-    },
-    fetch_ssl_certificate=True,  # Enable SSL certificate analysis for this request
-)
-
-
-async def main():
-    browser_config = BrowserConfig()
-    async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun(url="https://example.com", config=run_config)
-
-        if result.success:
-            print(f"✅ Crawled via proxy: {result.url}")
-
-            # Analyze SSL certificate
-            if result.ssl_certificate:
-                cert = result.ssl_certificate
-                print("🔒 SSL Certificate Info:")
-                print(f"   Issuer: {cert.issuer}")
-                print(f"   Subject: {cert.subject}")
-                print(f"   Valid until: {cert.valid_until}")
-                print(f"   Fingerprint: {cert.fingerprint}")
-
-                # Export certificate
-                cert.to_json("certificate.json")
-                print("💾 Certificate exported to certificate.json")
-            else:
-                print("⚠️  No SSL certificate information available")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
-```
-
-## Security Best Practices
-
-### 1. Proxy Rotation for Anonymity
-```python
-from crawl4ai import CrawlerRunConfig, ProxyConfig
-from crawl4ai.proxy_strategy import RoundRobinProxyStrategy
-
-# Use multiple proxies to avoid IP blocking
-proxies = ProxyConfig.from_env("PROXIES")
-strategy = RoundRobinProxyStrategy(proxies)
-
-# Configure rotation per request (recommended)
-run_config = CrawlerRunConfig(proxy_rotation_strategy=strategy)
-
-# For a fixed proxy across all requests, just reuse the same run_config instance
-static_run_config = run_config
-```
-
-### 2. SSL Certificate Verification
-```python
-from crawl4ai import CrawlerRunConfig
-
-# Always verify SSL certificates when possible
-# Per-request (affects specific requests)
-run_config = CrawlerRunConfig(fetch_ssl_certificate=True)
-```
-
-### 3. Environment Variable Security
-```bash
-# Use environment variables for sensitive proxy credentials
-# Avoid hardcoding usernames/passwords in code
-export PROXIES="ip1:port1:user1:pass1,ip2:port2:user2:pass2"
-```
-
-### 4. SOCKS5 for Enhanced Security
-```python
-from crawl4ai import CrawlerRunConfig
-
-# Prefer SOCKS5 proxies for better protocol support
-run_config = CrawlerRunConfig(proxy_config="socks5://proxy.example.com:1080")
-```
-
-## Migration from Deprecated `proxy` Parameter
-
-!!! warning "Deprecation Notice"
-    The legacy `proxy` argument on `BrowserConfig` is deprecated. Configure proxies through `CrawlerRunConfig.proxy_config` so each request fully describes its network settings.
-
-```python
-# Old (deprecated) approach
-# from crawl4ai import BrowserConfig
-# browser_config = BrowserConfig(proxy="http://proxy.example.com:8080")
-
-# New (preferred) approach
-from crawl4ai import CrawlerRunConfig
-run_config = CrawlerRunConfig(proxy_config="http://proxy.example.com:8080")
-```
-
-### Safe Logging of Proxies
-```python
-from crawl4ai import ProxyConfig
-
-def safe_proxy_repr(proxy: ProxyConfig):
-    if getattr(proxy, "username", None):
-        return f"{proxy.server} (auth: ****)"
-    return proxy.server
-```
-
-## Troubleshooting
-
-### Common Issues
-
-???+ question "Proxy connection failed"
-    - Verify the proxy server is reachable from your network.
-    - Double-check authentication credentials.
-    - Ensure the protocol matches (`http`, `https`, or `socks5`).
-
-???+ question "SSL certificate errors"
-    - Some proxies break SSL inspection; switch proxies if you see repeated failures.
-    - Consider temporarily disabling certificate fetching to isolate the issue.
-
-???+ question "Environment variables not loading"
-    - Confirm `PROXIES` (or your custom env var) is set before running the script.
-    - Check formatting: `ip:port:user:pass,ip:port:user:pass`.
-
-???+ question "Proxy rotation not working"
-    - Ensure `ProxyConfig.from_env()` actually loaded entries (`len(proxies) > 0`).
-    - Attach `proxy_rotation_strategy` to `CrawlerRunConfig`.
-    - Validate the proxy definitions you pass into the strategy.
Author	SHA1	Message	Date
AHMET YILMAZ	ceade853c3	Enhance DFSDeepCrawlStrategy documentation for clarity and detail	2025-11-13 16:39:08 +08:00
AHMET YILMAZ	1bd3de6a47	#1510 : Add DFS deep crawler demonstration script and enhance DFS strategy with seen URL tracking	2025-11-12 17:44:43 +08:00