#1551 : Fix casing and variable name consistency for LLMConfig in documentation

2025-11-10 15:38:14 +08:00
3 changed files with 12 additions and 161 deletions
--- a/crawl4ai/deep_crawling/dfs_strategy.py
+++ b/crawl4ai/deep_crawling/dfs_strategy.py
@@ -4,26 +4,14 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
 from ..models import CrawlResult
 from .bfs_strategy import BFSDeepCrawlStrategy  # noqa
 from ..types import AsyncWebCrawler, CrawlerRunConfig
 from ..utils import normalize_url_for_deep_crawl
 class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
    """
-    Depth-first deep crawling with familiar BFS rules.
+    Depth-First Search (DFS) deep crawling strategy.
-    We reuse the same filters, scoring, and page limits from :class:`BFSDeepCrawlStrategy`,
+    Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
-    but walk the graph with a stack so we fully explore one branch before hopping to the
+    Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
    next. DFS also keeps its own ``_dfs_seen`` set so we can drop duplicate links at
    discovery time without accidentally marking them as “already crawled”.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._dfs_seen: Set[str] = set()
    def _reset_seen(self, start_url: str) -> None:
        """Start each crawl with a clean dedupe set seeded with the root URL."""
        self._dfs_seen = {start_url}
    async def _arun_batch(
        self,
        start_url: str,
@@ -31,19 +19,14 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
        config: CrawlerRunConfig,
    ) -> List[CrawlResult]:
        """
-        Crawl level-by-level but emit results at the end.
+        Batch (non-streaming) DFS mode.
-
+        Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
        We keep a stack of ``(url, parent, depth)`` tuples, pop one at a time, and
        hand it to ``crawler.arun_many`` with deep crawling disabled so we remain
        in control of traversal. Every successful page bumps ``_pages_crawled`` and
        seeds new stack items discovered via :meth:`link_discovery`.
        """
        visited: Set[str] = set()
        # Stack items: (url, parent_url, depth)
        stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
        depths: Dict[str, int] = {start_url: 0}
        results: List[CrawlResult] = []
        self._reset_seen(start_url)
        while stack and not self._cancel_event.is_set():
            url, parent, depth = stack.pop()
@@ -88,16 +71,12 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
        config: CrawlerRunConfig,
    ) -> AsyncGenerator[CrawlResult, None]:
        """
-        Same traversal as :meth:`_arun_batch`, but yield pages immediately.
+        Streaming DFS mode.
-
+        Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
        Each popped URL is crawled, its metadata annotated, then the result gets
        yielded before we even look at the next stack entry. Successful crawls
        still feed :meth:`link_discovery`, keeping DFS order intact.
        """
        visited: Set[str] = set()
        stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
        depths: Dict[str, int] = {start_url: 0}
        self._reset_seen(start_url)
        while stack and not self._cancel_event.is_set():
            url, parent, depth = stack.pop()
@@ -129,92 +108,3 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                    for new_url, new_parent in reversed(new_links):
                        new_depth = depths.get(new_url, depth + 1)
                        stack.append((new_url, new_parent, new_depth))
    async def link_discovery(
        self,
        result: CrawlResult,
        source_url: str,
        current_depth: int,
        _visited: Set[str],
        next_level: List[Tuple[str, Optional[str]]],
        depths: Dict[str, int],
    ) -> None:
        """
        Find the next URLs we should push onto the DFS stack.
        Parameters
        ----------
        result : CrawlResult
            Output of the page we just crawled; its ``links`` block is our raw material.
        source_url : str
            URL of the parent page; stored so callers can track ancestry.
        current_depth : int
            Depth of the parent; children naturally sit at ``current_depth + 1``.
        _visited : Set[str]
            Present to match the BFS signature, but we rely on ``_dfs_seen`` instead.
        next_level : list of tuples
            The stack buffer supplied by the caller; we append new ``(url, parent)`` items here.
        depths : dict
            Shared depth map so future metadata tagging knows how deep each URL lives.
        Notes
        -----
        - ``_dfs_seen`` keeps us from pushing duplicates without touching the traversal guard.
        - Validation, scoring, and capacity trimming mirror the BFS version so behaviour stays consistent.
        """
        next_depth = current_depth + 1
        if next_depth > self.max_depth:
            return
        remaining_capacity = self.max_pages - self._pages_crawled
        if remaining_capacity <= 0:
            self.logger.info(
                f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
            )
            return
        links = result.links.get("internal", [])
        if self.include_external:
            links += result.links.get("external", [])
        seen = self._dfs_seen
        valid_links: List[Tuple[str, float]] = []
        for link in links:
            raw_url = link.get("href")
            if not raw_url:
                continue
            normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
            if not normalized_url or normalized_url in seen:
                continue
            if not await self.can_process_url(raw_url, next_depth):
                self.stats.urls_skipped += 1
                continue
            score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
            if score < self.score_threshold:
                self.logger.debug(
                    f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
                )
                self.stats.urls_skipped += 1
                continue
            seen.add(normalized_url)
            valid_links.append((normalized_url, score))
        if len(valid_links) > remaining_capacity:
            if self.url_scorer:
                valid_links.sort(key=lambda x: x[1], reverse=True)
            valid_links = valid_links[:remaining_capacity]
            self.logger.info(
                f"Limiting to {remaining_capacity} URLs due to max_pages limit"
            )
        for url, score in valid_links:
            if score:
                result.metadata = result.metadata or {}
                result.metadata["score"] = score
            next_level.append((url, source_url))
            depths[url] = next_depth
--- a/docs/examples/dfs_crawl_demo.py
+++ b/docs/examples/dfs_crawl_demo.py
@@ -1,39 +0,0 @@
 """
 Simple demonstration of the DFS deep crawler visiting multiple pages.
 Run with:  python docs/examples/dfs_crawl_demo.py
 """
 import asyncio
 from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
 from crawl4ai.async_webcrawler import AsyncWebCrawler
 from crawl4ai.cache_context import CacheMode
 from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 async def main() -> None:
    dfs_strategy = DFSDeepCrawlStrategy(
        max_depth=3,
        max_pages=50,
        include_external=False,
    )
    config = CrawlerRunConfig(
        deep_crawl_strategy=dfs_strategy,
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(),
        stream=True,
    )
    seed_url = "https://docs.python.org/3/"  # Plenty of internal links
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        async for result in await crawler.arun(url=seed_url, config=config):
            depth = result.metadata.get("depth")
            status = "SUCCESS" if result.success else "FAILED"
            print(f"[{status}] depth={depth} url={result.url}")
 if __name__ == "__main__":
    asyncio.run(main())
--- a/docs/md_v2/extraction/llm-strategies.md
+++ b/docs/md_v2/extraction/llm-strategies.md
@@ -20,10 +20,10 @@ In some cases, you need to extract **complex or unstructured** information from
 ## 2. Provider-Agnostic via LiteLLM
-You can use LlmConfig, to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about LlmConfig [here](/api/parameters).
+You can use LLMConfig, to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about LLMConfig [here](/api/parameters).
 ```python
-llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
+llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
 ```
 Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide:
@@ -58,7 +58,7 @@ For structured data, `"schema"` is recommended. You provide `schema=YourPydantic
 Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`.
-1. **`llmConfig`** (LlmConfig): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.    
+1. **`llm_config`** (LLMConfig): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.
 2. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.  
 3. **`extraction_type`** (str): `"schema"` or `"block"`.  
 4. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”  
@@ -112,7 +112,7 @@ async def main():
    # 1. Define the LLM extraction strategy
    llm_strategy = LLMExtractionStrategy(
        llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
-        schema=Product.schema_json(), # Or use model_json_schema()
+        schema=Product.model_json_schema(), # Or use model_json_schema()
        extraction_type="schema",
        instruction="Extract all product objects with 'name' and 'price' from the content.",
        chunk_token_threshold=1000,
@@ -238,7 +238,7 @@ class KnowledgeGraph(BaseModel):
 async def main():
    # LLM extraction strategy
    llm_strat = LLMExtractionStrategy(
-        llmConfig = LLMConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
+        llm_config = LLMConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
        schema=KnowledgeGraph.model_json_schema(),
        extraction_type="schema",
        instruction="Extract entities and relationships from the content. Return valid JSON.",