diff --git a/crawl4ai/deep_crawling/dfs_strategy.py b/crawl4ai/deep_crawling/dfs_strategy.py
index 0eca58e3..c710a2a5 100644
--- a/crawl4ai/deep_crawling/dfs_strategy.py
+++ b/crawl4ai/deep_crawling/dfs_strategy.py
@@ -4,14 +4,26 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from ..models import CrawlResult
from .bfs_strategy import BFSDeepCrawlStrategy # noqa
from ..types import AsyncWebCrawler, CrawlerRunConfig
+from ..utils import normalize_url_for_deep_crawl
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
"""
- Depth-First Search (DFS) deep crawling strategy.
+    Depth-first deep crawling that reuses the BFS strategy's rules.
- Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
- Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
+ We reuse the same filters, scoring, and page limits from :class:`BFSDeepCrawlStrategy`,
+ but walk the graph with a stack so we fully explore one branch before hopping to the
+ next. DFS also keeps its own ``_dfs_seen`` set so we can drop duplicate links at
+ discovery time without accidentally marking them as “already crawled”.
"""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._dfs_seen: Set[str] = set()
+
+ def _reset_seen(self, start_url: str) -> None:
+ """Start each crawl with a clean dedupe set seeded with the root URL."""
+ self._dfs_seen = {start_url}
+
async def _arun_batch(
self,
start_url: str,
@@ -19,14 +31,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
config: CrawlerRunConfig,
) -> List[CrawlResult]:
"""
- Batch (non-streaming) DFS mode.
- Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
+        Crawl depth-first (stack order) but emit all results only at the end.
+
+ We keep a stack of ``(url, parent, depth)`` tuples, pop one at a time, and
+ hand it to ``crawler.arun_many`` with deep crawling disabled so we remain
+ in control of traversal. Every successful page bumps ``_pages_crawled`` and
+ seeds new stack items discovered via :meth:`link_discovery`.
"""
visited: Set[str] = set()
# Stack items: (url, parent_url, depth)
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0}
results: List[CrawlResult] = []
+ self._reset_seen(start_url)
while stack and not self._cancel_event.is_set():
url, parent, depth = stack.pop()
@@ -71,12 +88,16 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlResult, None]:
"""
- Streaming DFS mode.
- Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
+ Same traversal as :meth:`_arun_batch`, but yield pages immediately.
+
+ Each popped URL is crawled, its metadata annotated, then the result gets
+ yielded before we even look at the next stack entry. Successful crawls
+ still feed :meth:`link_discovery`, keeping DFS order intact.
"""
visited: Set[str] = set()
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0}
+ self._reset_seen(start_url)
while stack and not self._cancel_event.is_set():
url, parent, depth = stack.pop()
@@ -108,3 +129,92 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
for new_url, new_parent in reversed(new_links):
new_depth = depths.get(new_url, depth + 1)
stack.append((new_url, new_parent, new_depth))
+
+ async def link_discovery(
+ self,
+ result: CrawlResult,
+ source_url: str,
+ current_depth: int,
+ _visited: Set[str],
+ next_level: List[Tuple[str, Optional[str]]],
+ depths: Dict[str, int],
+ ) -> None:
+ """
+ Find the next URLs we should push onto the DFS stack.
+
+ Parameters
+ ----------
+ result : CrawlResult
+ Output of the page we just crawled; its ``links`` block is our raw material.
+ source_url : str
+ URL of the parent page; stored so callers can track ancestry.
+ current_depth : int
+ Depth of the parent; children naturally sit at ``current_depth + 1``.
+ _visited : Set[str]
+ Present to match the BFS signature, but we rely on ``_dfs_seen`` instead.
+ next_level : list of tuples
+ The stack buffer supplied by the caller; we append new ``(url, parent)`` items here.
+ depths : dict
+ Shared depth map so future metadata tagging knows how deep each URL lives.
+
+ Notes
+ -----
+    - ``_dfs_seen`` keeps us from pushing duplicates without touching the traversal guard.
+    - Validation, scoring, and capacity trimming mirror BFS — including its quirks: ``can_process_url`` receives the raw href while dedupe uses the normalized URL, ``links += …`` extends ``result.links["internal"]`` in place, and ``result.metadata["score"]`` is overwritten for each scored link (confirm all are intended).
+ """
+ next_depth = current_depth + 1
+ if next_depth > self.max_depth:
+ return
+
+ remaining_capacity = self.max_pages - self._pages_crawled
+ if remaining_capacity <= 0:
+ self.logger.info(
+ f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
+ )
+ return
+
+ links = result.links.get("internal", [])
+ if self.include_external:
+ links += result.links.get("external", [])
+
+ seen = self._dfs_seen
+ valid_links: List[Tuple[str, float]] = []
+
+ for link in links:
+ raw_url = link.get("href")
+ if not raw_url:
+ continue
+
+ normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
+ if not normalized_url or normalized_url in seen:
+ continue
+
+ if not await self.can_process_url(raw_url, next_depth):
+ self.stats.urls_skipped += 1
+ continue
+
+ score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
+ if score < self.score_threshold:
+ self.logger.debug(
+ f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
+ )
+ self.stats.urls_skipped += 1
+ continue
+
+ seen.add(normalized_url)
+ valid_links.append((normalized_url, score))
+
+ if len(valid_links) > remaining_capacity:
+ if self.url_scorer:
+ valid_links.sort(key=lambda x: x[1], reverse=True)
+ valid_links = valid_links[:remaining_capacity]
+ self.logger.info(
+ f"Limiting to {remaining_capacity} URLs due to max_pages limit"
+ )
+
+ for url, score in valid_links:
+ if score:
+ result.metadata = result.metadata or {}
+ result.metadata["score"] = score
+ next_level.append((url, source_url))
+ depths[url] = next_depth
diff --git a/docs/examples/dfs_crawl_demo.py b/docs/examples/dfs_crawl_demo.py
new file mode 100644
index 00000000..321c4131
--- /dev/null
+++ b/docs/examples/dfs_crawl_demo.py
@@ -0,0 +1,39 @@
+"""
+Simple demonstration of the DFS deep crawler visiting multiple pages.
+
+Run with: python docs/examples/dfs_crawl_demo.py
+"""
+import asyncio
+
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.cache_context import CacheMode
+from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+
+async def main() -> None:
+ dfs_strategy = DFSDeepCrawlStrategy(
+ max_depth=3,
+ max_pages=50,
+ include_external=False,
+ )
+
+ config = CrawlerRunConfig(
+ deep_crawl_strategy=dfs_strategy,
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(),
+ stream=True,
+ )
+
+ seed_url = "https://docs.python.org/3/" # Plenty of internal links
+
+ async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
+ async for result in await crawler.arun(url=seed_url, config=config):
+ depth = result.metadata.get("depth")
+ status = "SUCCESS" if result.success else "FAILED"
+ print(f"[{status}] depth={depth} url={result.url}")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md
index 368f53fb..41984ba5 100644
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -21,21 +21,35 @@ browser_cfg = BrowserConfig(
|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
| **`browser_type`** | `"chromium"`, `"firefox"`, `"webkit"`
*(default: `"chromium"`)* | Which browser engine to use. `"chromium"` is typical for many sites, `"firefox"` or `"webkit"` for specialized tests. |
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
+| **`browser_mode`** | `str` (default: `"dedicated"`) | How the browser is initialized: `"dedicated"` (new instance), `"builtin"` (CDP background), `"custom"` (explicit CDP), `"docker"` (container). |
+| **`use_managed_browser`** | `bool` (default: `False`) | Launch browser via CDP for advanced control. Set automatically based on `browser_mode`. |
+| **`cdp_url`** | `str` (default: `None`) | Chrome DevTools Protocol endpoint URL (e.g., `"ws://localhost:9222/devtools/browser/"`). Set automatically based on `browser_mode`. |
+| **`debugging_port`** | `int` (default: `9222`) | Port for browser debugging protocol. |
+| **`host`** | `str` (default: `"localhost"`) | Host for browser connection. |
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
+| **`viewport`** | `dict` (default: `None`) | Viewport dimensions dict. If set, overrides `viewport_width` and `viewport_height`. |
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
-| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
+| **`proxy_config`** | `ProxyConfig or dict` (default: `None`)| For advanced or multi-proxy needs, specify `ProxyConfig` object or dict like `{"server": "...", "username": "...", "password": "..."}`. |
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
+| **`chrome_channel`** | `str` (default: `"chromium"`) | Chrome channel to launch (e.g., "chrome", "msedge"). Only for `browser_type="chromium"`. Auto-set to empty for Firefox/WebKit. |
+| **`channel`** | `str` (default: `"chromium"`) | Alias for `chrome_channel`. |
+| **`accept_downloads`** | `bool` (default: `False`) | Whether to allow file downloads. Requires `downloads_path` if `True`. |
+| **`downloads_path`** | `str or None` (default: `None`) | Directory to store downloaded files. |
+| **`storage_state`** | `str or dict or None` (default: `None`)| In-memory storage state (cookies, localStorage) to restore browser state. |
| **`ignore_https_errors`** | `bool` (default: `True`) | If `True`, continues despite invalid certificates (common in dev/staging). |
| **`java_script_enabled`** | `bool` (default: `True`) | Disable if you want no JS overhead, or if only static content is needed. |
+| **`sleep_on_close`** | `bool` (default: `False`) | Add a small delay when closing browser (can help with cleanup issues). |
| **`cookies`** | `list` (default: `[]`) | Pre-set cookies, each a dict like `{"name": "session", "value": "...", "url": "..."}`. |
| **`headers`** | `dict` (default: `{}`) | Extra HTTP headers for every request, e.g. `{"Accept-Language": "en-US"}`. |
-| **`user_agent`** | `str` (default: Chrome-based UA) | Your custom or random user agent. `user_agent_mode="random"` can shuffle it. |
-| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
+| **`user_agent`** | `str` (default: Chrome-based UA) | Your custom user agent string. |
+| **`user_agent_mode`** | `str` (default: `""`) | Set to `"random"` to randomize user agent from a pool (helps with bot detection). |
+| **`user_agent_generator_config`** | `dict` (default: `{}`) | Configuration dict for user agent generation when `user_agent_mode="random"`. |
| **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. |
-| **`use_managed_browser`** | `bool` (default: `False`) | For advanced “managed” interactions (debugging, CDP usage). Typically set automatically if persistent context is on. |
+| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
| **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. `["--disable-extensions"]`. |
+| **`enable_stealth`** | `bool` (default: `False`) | Enable playwright-stealth mode to bypass bot detection. Cannot be used with `browser_mode="builtin"`. |
**Tips**:
- Set `headless=False` to visually **debug** how pages load or how interactions proceed.
@@ -70,6 +84,7 @@ We group them by category.
|------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------|
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
+| **`chunking_strategy`** | `ChunkingStrategy` (default: RegexChunking()) | Strategy to chunk content before extraction. Can be customized for different chunking approaches. |
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). Can be customized with options such as `content_source` parameter to select the HTML input source ('cleaned_html', 'raw_html', or 'fit_html'). |
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
@@ -78,32 +93,50 @@ We group them by category.
| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |
| **`prettiify`** | `bool` (False) | If `True`, beautifies final HTML (slower, purely cosmetic). |
| **`keep_data_attributes`** | `bool` (False) | If `True`, preserve `data-*` attributes in cleaned HTML. |
+| **`keep_attrs`** | `list` (default: []) | List of HTML attributes to keep during processing (e.g., `["id", "class", "data-value"]`). |
| **`remove_forms`** | `bool` (False) | If `True`, remove all `