Merge branch 'develop' into release/v0.7.7
This commit is contained in:
@@ -4,14 +4,26 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
|
|||||||
from ..models import CrawlResult
|
from ..models import CrawlResult
|
||||||
from .bfs_strategy import BFSDeepCrawlStrategy # noqa
|
from .bfs_strategy import BFSDeepCrawlStrategy # noqa
|
||||||
from ..types import AsyncWebCrawler, CrawlerRunConfig
|
from ..types import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
from ..utils import normalize_url_for_deep_crawl
|
||||||
|
|
||||||
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||||
"""
|
"""
|
||||||
Depth-First Search (DFS) deep crawling strategy.
|
Depth-first deep crawling with familiar BFS rules.
|
||||||
|
|
||||||
Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
|
We reuse the same filters, scoring, and page limits from :class:`BFSDeepCrawlStrategy`,
|
||||||
Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
|
but walk the graph with a stack so we fully explore one branch before hopping to the
|
||||||
|
next. DFS also keeps its own ``_dfs_seen`` set so we can drop duplicate links at
|
||||||
|
discovery time without accidentally marking them as “already crawled”.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self._dfs_seen: Set[str] = set()
|
||||||
|
|
||||||
|
def _reset_seen(self, start_url: str) -> None:
|
||||||
|
"""Start each crawl with a clean dedupe set seeded with the root URL."""
|
||||||
|
self._dfs_seen = {start_url}
|
||||||
|
|
||||||
async def _arun_batch(
|
async def _arun_batch(
|
||||||
self,
|
self,
|
||||||
start_url: str,
|
start_url: str,
|
||||||
@@ -19,14 +31,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
|||||||
config: CrawlerRunConfig,
|
config: CrawlerRunConfig,
|
||||||
) -> List[CrawlResult]:
|
) -> List[CrawlResult]:
|
||||||
"""
|
"""
|
||||||
Batch (non-streaming) DFS mode.
|
Crawl level-by-level but emit results at the end.
|
||||||
Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
|
|
||||||
|
We keep a stack of ``(url, parent, depth)`` tuples, pop one at a time, and
|
||||||
|
hand it to ``crawler.arun_many`` with deep crawling disabled so we remain
|
||||||
|
in control of traversal. Every successful page bumps ``_pages_crawled`` and
|
||||||
|
seeds new stack items discovered via :meth:`link_discovery`.
|
||||||
"""
|
"""
|
||||||
visited: Set[str] = set()
|
visited: Set[str] = set()
|
||||||
# Stack items: (url, parent_url, depth)
|
# Stack items: (url, parent_url, depth)
|
||||||
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
|
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
|
||||||
depths: Dict[str, int] = {start_url: 0}
|
depths: Dict[str, int] = {start_url: 0}
|
||||||
results: List[CrawlResult] = []
|
results: List[CrawlResult] = []
|
||||||
|
self._reset_seen(start_url)
|
||||||
|
|
||||||
while stack and not self._cancel_event.is_set():
|
while stack and not self._cancel_event.is_set():
|
||||||
url, parent, depth = stack.pop()
|
url, parent, depth = stack.pop()
|
||||||
@@ -71,12 +88,16 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
|||||||
config: CrawlerRunConfig,
|
config: CrawlerRunConfig,
|
||||||
) -> AsyncGenerator[CrawlResult, None]:
|
) -> AsyncGenerator[CrawlResult, None]:
|
||||||
"""
|
"""
|
||||||
Streaming DFS mode.
|
Same traversal as :meth:`_arun_batch`, but yield pages immediately.
|
||||||
Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
|
|
||||||
|
Each popped URL is crawled, its metadata annotated, then the result gets
|
||||||
|
yielded before we even look at the next stack entry. Successful crawls
|
||||||
|
still feed :meth:`link_discovery`, keeping DFS order intact.
|
||||||
"""
|
"""
|
||||||
visited: Set[str] = set()
|
visited: Set[str] = set()
|
||||||
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
|
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
|
||||||
depths: Dict[str, int] = {start_url: 0}
|
depths: Dict[str, int] = {start_url: 0}
|
||||||
|
self._reset_seen(start_url)
|
||||||
|
|
||||||
while stack and not self._cancel_event.is_set():
|
while stack and not self._cancel_event.is_set():
|
||||||
url, parent, depth = stack.pop()
|
url, parent, depth = stack.pop()
|
||||||
@@ -108,3 +129,92 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
|||||||
for new_url, new_parent in reversed(new_links):
|
for new_url, new_parent in reversed(new_links):
|
||||||
new_depth = depths.get(new_url, depth + 1)
|
new_depth = depths.get(new_url, depth + 1)
|
||||||
stack.append((new_url, new_parent, new_depth))
|
stack.append((new_url, new_parent, new_depth))
|
||||||
|
|
||||||
|
async def link_discovery(
|
||||||
|
self,
|
||||||
|
result: CrawlResult,
|
||||||
|
source_url: str,
|
||||||
|
current_depth: int,
|
||||||
|
_visited: Set[str],
|
||||||
|
next_level: List[Tuple[str, Optional[str]]],
|
||||||
|
depths: Dict[str, int],
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Find the next URLs we should push onto the DFS stack.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
result : CrawlResult
|
||||||
|
Output of the page we just crawled; its ``links`` block is our raw material.
|
||||||
|
source_url : str
|
||||||
|
URL of the parent page; stored so callers can track ancestry.
|
||||||
|
current_depth : int
|
||||||
|
Depth of the parent; children naturally sit at ``current_depth + 1``.
|
||||||
|
_visited : Set[str]
|
||||||
|
Present to match the BFS signature, but we rely on ``_dfs_seen`` instead.
|
||||||
|
next_level : list of tuples
|
||||||
|
The stack buffer supplied by the caller; we append new ``(url, parent)`` items here.
|
||||||
|
depths : dict
|
||||||
|
Shared depth map so future metadata tagging knows how deep each URL lives.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
- ``_dfs_seen`` keeps us from pushing duplicates without touching the traversal guard.
|
||||||
|
- Validation, scoring, and capacity trimming mirror the BFS version so behaviour stays consistent.
|
||||||
|
"""
|
||||||
|
next_depth = current_depth + 1
|
||||||
|
if next_depth > self.max_depth:
|
||||||
|
return
|
||||||
|
|
||||||
|
remaining_capacity = self.max_pages - self._pages_crawled
|
||||||
|
if remaining_capacity <= 0:
|
||||||
|
self.logger.info(
|
||||||
|
f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
links = result.links.get("internal", [])
|
||||||
|
if self.include_external:
|
||||||
|
links += result.links.get("external", [])
|
||||||
|
|
||||||
|
seen = self._dfs_seen
|
||||||
|
valid_links: List[Tuple[str, float]] = []
|
||||||
|
|
||||||
|
for link in links:
|
||||||
|
raw_url = link.get("href")
|
||||||
|
if not raw_url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
|
||||||
|
if not normalized_url or normalized_url in seen:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not await self.can_process_url(raw_url, next_depth):
|
||||||
|
self.stats.urls_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
|
||||||
|
if score < self.score_threshold:
|
||||||
|
self.logger.debug(
|
||||||
|
f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
|
||||||
|
)
|
||||||
|
self.stats.urls_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
seen.add(normalized_url)
|
||||||
|
valid_links.append((normalized_url, score))
|
||||||
|
|
||||||
|
if len(valid_links) > remaining_capacity:
|
||||||
|
if self.url_scorer:
|
||||||
|
valid_links.sort(key=lambda x: x[1], reverse=True)
|
||||||
|
valid_links = valid_links[:remaining_capacity]
|
||||||
|
self.logger.info(
|
||||||
|
f"Limiting to {remaining_capacity} URLs due to max_pages limit"
|
||||||
|
)
|
||||||
|
|
||||||
|
for url, score in valid_links:
|
||||||
|
if score:
|
||||||
|
result.metadata = result.metadata or {}
|
||||||
|
result.metadata["score"] = score
|
||||||
|
next_level.append((url, source_url))
|
||||||
|
depths[url] = next_depth
|
||||||
|
|||||||
39
docs/examples/dfs_crawl_demo.py
Normal file
39
docs/examples/dfs_crawl_demo.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
"""
|
||||||
|
Simple demonstration of the DFS deep crawler visiting multiple pages.
|
||||||
|
|
||||||
|
Run with: python docs/examples/dfs_crawl_demo.py
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||||
|
from crawl4ai.cache_context import CacheMode
|
||||||
|
from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
|
||||||
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
|
||||||
|
dfs_strategy = DFSDeepCrawlStrategy(
|
||||||
|
max_depth=3,
|
||||||
|
max_pages=50,
|
||||||
|
include_external=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
deep_crawl_strategy=dfs_strategy,
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
markdown_generator=DefaultMarkdownGenerator(),
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
seed_url = "https://docs.python.org/3/" # Plenty of internal links
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
|
||||||
|
async for result in await crawler.arun(url=seed_url, config=config):
|
||||||
|
depth = result.metadata.get("depth")
|
||||||
|
status = "SUCCESS" if result.success else "FAILED"
|
||||||
|
print(f"[{status}] depth={depth} url={result.url}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
@@ -21,21 +21,35 @@ browser_cfg = BrowserConfig(
|
|||||||
|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
|
|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| **`browser_type`** | `"chromium"`, `"firefox"`, `"webkit"`<br/>*(default: `"chromium"`)* | Which browser engine to use. `"chromium"` is typical for many sites, `"firefox"` or `"webkit"` for specialized tests. |
|
| **`browser_type`** | `"chromium"`, `"firefox"`, `"webkit"`<br/>*(default: `"chromium"`)* | Which browser engine to use. `"chromium"` is typical for many sites, `"firefox"` or `"webkit"` for specialized tests. |
|
||||||
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
|
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
|
||||||
|
| **`browser_mode`** | `str` (default: `"dedicated"`) | How browser is initialized: `"dedicated"` (new instance), `"builtin"` (CDP background), `"custom"` (explicit CDP), `"docker"` (container). |
|
||||||
|
| **`use_managed_browser`** | `bool` (default: `False`) | Launch browser via CDP for advanced control. Set automatically based on `browser_mode`. |
|
||||||
|
| **`cdp_url`** | `str` (default: `None`) | Chrome DevTools Protocol endpoint URL (e.g., `"ws://localhost:9222/devtools/browser/"`). Set automatically based on `browser_mode`. |
|
||||||
|
| **`debugging_port`** | `int` (default: `9222`) | Port for browser debugging protocol. |
|
||||||
|
| **`host`** | `str` (default: `"localhost"`) | Host for browser connection. |
|
||||||
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
||||||
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
||||||
|
| **`viewport`** | `dict` (default: `None`) | Viewport dimensions dict. If set, overrides `viewport_width` and `viewport_height`. |
|
||||||
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
|
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
|
||||||
| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
|
| **`proxy_config`** | `ProxyConfig or dict` (default: `None`)| For advanced or multi-proxy needs, specify `ProxyConfig` object or dict like `{"server": "...", "username": "...", "password": "..."}`. |
|
||||||
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
||||||
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
|
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
|
||||||
|
| **`chrome_channel`** | `str` (default: `"chromium"`) | Chrome channel to launch (e.g., "chrome", "msedge"). Only for `browser_type="chromium"`. Auto-set to empty for Firefox/WebKit. |
|
||||||
|
| **`channel`** | `str` (default: `"chromium"`) | Alias for `chrome_channel`. |
|
||||||
|
| **`accept_downloads`** | `bool` (default: `False`) | Whether to allow file downloads. Requires `downloads_path` if `True`. |
|
||||||
|
| **`downloads_path`** | `str or None` (default: `None`) | Directory to store downloaded files. |
|
||||||
|
| **`storage_state`** | `str or dict or None` (default: `None`)| In-memory storage state (cookies, localStorage) to restore browser state. |
|
||||||
| **`ignore_https_errors`** | `bool` (default: `True`) | If `True`, continues despite invalid certificates (common in dev/staging). |
|
| **`ignore_https_errors`** | `bool` (default: `True`) | If `True`, continues despite invalid certificates (common in dev/staging). |
|
||||||
| **`java_script_enabled`** | `bool` (default: `True`) | Disable if you want no JS overhead, or if only static content is needed. |
|
| **`java_script_enabled`** | `bool` (default: `True`) | Disable if you want no JS overhead, or if only static content is needed. |
|
||||||
|
| **`sleep_on_close`** | `bool` (default: `False`) | Add a small delay when closing browser (can help with cleanup issues). |
|
||||||
| **`cookies`** | `list` (default: `[]`) | Pre-set cookies, each a dict like `{"name": "session", "value": "...", "url": "..."}`. |
|
| **`cookies`** | `list` (default: `[]`) | Pre-set cookies, each a dict like `{"name": "session", "value": "...", "url": "..."}`. |
|
||||||
| **`headers`** | `dict` (default: `{}`) | Extra HTTP headers for every request, e.g. `{"Accept-Language": "en-US"}`. |
|
| **`headers`** | `dict` (default: `{}`) | Extra HTTP headers for every request, e.g. `{"Accept-Language": "en-US"}`. |
|
||||||
| **`user_agent`** | `str` (default: Chrome-based UA) | Your custom or random user agent. `user_agent_mode="random"` can shuffle it. |
|
| **`user_agent`** | `str` (default: Chrome-based UA) | Your custom user agent string. |
|
||||||
| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
|
| **`user_agent_mode`** | `str` (default: `""`) | Set to `"random"` to randomize user agent from a pool (helps with bot detection). |
|
||||||
|
| **`user_agent_generator_config`** | `dict` (default: `{}`) | Configuration dict for user agent generation when `user_agent_mode="random"`. |
|
||||||
| **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. |
|
| **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. |
|
||||||
| **`use_managed_browser`** | `bool` (default: `False`) | For advanced “managed” interactions (debugging, CDP usage). Typically set automatically if persistent context is on. |
|
| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
|
||||||
| **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. `["--disable-extensions"]`. |
|
| **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. `["--disable-extensions"]`. |
|
||||||
|
| **`enable_stealth`** | `bool` (default: `False`) | Enable playwright-stealth mode to bypass bot detection. Cannot be used with `browser_mode="builtin"`. |
|
||||||
|
|
||||||
**Tips**:
|
**Tips**:
|
||||||
- Set `headless=False` to visually **debug** how pages load or how interactions proceed.
|
- Set `headless=False` to visually **debug** how pages load or how interactions proceed.
|
||||||
@@ -70,6 +84,7 @@ We group them by category.
|
|||||||
|------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------|
|
|------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------|
|
||||||
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
|
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
|
||||||
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
|
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
|
||||||
|
| **`chunking_strategy`** | `ChunkingStrategy` (default: RegexChunking()) | Strategy to chunk content before extraction. Can be customized for different chunking approaches. |
|
||||||
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). Can be customized with options such as `content_source` parameter to select the HTML input source ('cleaned_html', 'raw_html', or 'fit_html'). |
|
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). Can be customized with options such as `content_source` parameter to select the HTML input source ('cleaned_html', 'raw_html', or 'fit_html'). |
|
||||||
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
|
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
|
||||||
| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
|
| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
|
||||||
@@ -78,32 +93,50 @@ We group them by category.
|
|||||||
| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |
|
| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |
|
||||||
| **`prettiify`** | `bool` (False) | If `True`, beautifies final HTML (slower, purely cosmetic). |
|
| **`prettiify`** | `bool` (False) | If `True`, beautifies final HTML (slower, purely cosmetic). |
|
||||||
| **`keep_data_attributes`** | `bool` (False) | If `True`, preserve `data-*` attributes in cleaned HTML. |
|
| **`keep_data_attributes`** | `bool` (False) | If `True`, preserve `data-*` attributes in cleaned HTML. |
|
||||||
|
| **`keep_attrs`** | `list` (default: []) | List of HTML attributes to keep during processing (e.g., `["id", "class", "data-value"]`). |
|
||||||
| **`remove_forms`** | `bool` (False) | If `True`, remove all `<form>` elements. |
|
| **`remove_forms`** | `bool` (False) | If `True`, remove all `<form>` elements. |
|
||||||
|
| **`parser_type`** | `str` (default: "lxml") | HTML parser to use (e.g., "lxml", "html.parser"). |
|
||||||
|
| **`scraping_strategy`** | `ContentScrapingStrategy` (default: LXMLWebScrapingStrategy()) | Strategy to use for content scraping. Can be customized for different scraping needs (e.g., PDF extraction). |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### B) **Caching & Session**
|
### B) **Browser Location and Identity**
|
||||||
|
|
||||||
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|
|------------------------|---------------------------|--------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`locale`** | `str or None` (None) | Browser's locale (e.g., "en-US", "fr-FR") for language preferences. |
|
||||||
|
| **`timezone_id`** | `str or None` (None) | Browser's timezone (e.g., "America/New_York", "Europe/Paris"). |
|
||||||
|
| **`geolocation`** | `GeolocationConfig or None` (None) | GPS coordinates configuration. Use `GeolocationConfig(latitude=..., longitude=..., accuracy=...)`. |
|
||||||
|
| **`fetch_ssl_certificate`** | `bool` (False) | If `True`, fetches and includes SSL certificate information in the result. |
|
||||||
|
| **`proxy_config`** | `ProxyConfig or dict or None` (None) | Proxy configuration for this specific crawl. Can override browser-level proxy settings. |
|
||||||
|
| **`proxy_rotation_strategy`** | `ProxyRotationStrategy` (None) | Strategy for rotating proxies during crawl operations. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### C) **Caching & Session**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|-------------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------|
|
|-------------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| **`cache_mode`** | `CacheMode or None` | Controls how caching is handled (`ENABLED`, `BYPASS`, `DISABLED`, etc.). If `None`, typically defaults to `ENABLED`. |
|
| **`cache_mode`** | `CacheMode or None` | Controls how caching is handled (`ENABLED`, `BYPASS`, `DISABLED`, etc.). If `None`, typically defaults to `ENABLED`. |
|
||||||
| **`session_id`** | `str or None` | Assign a unique ID to reuse a single browser session across multiple `arun()` calls. |
|
| **`session_id`** | `str or None` | Assign a unique ID to reuse a single browser session across multiple `arun()` calls. |
|
||||||
| **`bypass_cache`** | `bool` (False) | If `True`, acts like `CacheMode.BYPASS`. |
|
| **`bypass_cache`** | `bool` (False) | **Deprecated.** If `True`, acts like `CacheMode.BYPASS`. Use `cache_mode` instead. |
|
||||||
| **`disable_cache`** | `bool` (False) | If `True`, acts like `CacheMode.DISABLED`. |
|
| **`disable_cache`** | `bool` (False) | **Deprecated.** If `True`, acts like `CacheMode.DISABLED`. Use `cache_mode` instead. |
|
||||||
| **`no_cache_read`** | `bool` (False) | If `True`, acts like `CacheMode.WRITE_ONLY` (writes cache but never reads). |
|
| **`no_cache_read`** | `bool` (False) | **Deprecated.** If `True`, acts like `CacheMode.WRITE_ONLY` (writes cache but never reads). Use `cache_mode` instead. |
|
||||||
| **`no_cache_write`** | `bool` (False) | If `True`, acts like `CacheMode.READ_ONLY` (reads cache but never writes). |
|
| **`no_cache_write`** | `bool` (False) | **Deprecated.** If `True`, acts like `CacheMode.READ_ONLY` (reads cache but never writes). Use `cache_mode` instead. |
|
||||||
|
| **`shared_data`** | `dict or None` (None) | Shared data to be passed between hooks and accessible across crawl operations. |
|
||||||
|
|
||||||
Use these for controlling whether you read or write from a local content cache. Handy for large batch crawls or repeated site visits.
|
Use these for controlling whether you read or write from a local content cache. Handy for large batch crawls or repeated site visits.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### C) **Page Navigation & Timing**
|
### D) **Page Navigation & Timing**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|----------------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
|
|----------------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
|
||||||
| **`wait_until`** | `str` (domcontentloaded)| Condition for navigation to “complete”. Often `"networkidle"` or `"domcontentloaded"`. |
|
| **`wait_until`** | `str` (domcontentloaded)| Condition for navigation to "complete". Often `"networkidle"` or `"domcontentloaded"`. |
|
||||||
| **`page_timeout`** | `int` (60000 ms) | Timeout for page navigation or JS steps. Increase for slow sites. |
|
| **`page_timeout`** | `int` (60000 ms) | Timeout for page navigation or JS steps. Increase for slow sites. |
|
||||||
| **`wait_for`** | `str or None` | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction. |
|
| **`wait_for`** | `str or None` | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction. |
|
||||||
|
| **`wait_for_timeout`** | `int or None` (None) | Specific timeout in ms for the `wait_for` condition. If None, uses `page_timeout`. |
|
||||||
| **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. |
|
| **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. |
|
||||||
| **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. |
|
| **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. |
|
||||||
| **`check_robots_txt`** | `bool` (False) | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency. |
|
| **`check_robots_txt`** | `bool` (False) | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency. |
|
||||||
@@ -112,15 +145,17 @@ Use these for controlling whether you read or write from a local content cache.
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### D) **Page Interaction**
|
### E) **Page Interaction**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|----------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
|
|----------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| **`js_code`** | `str or list[str]` (None) | JavaScript to run after load. E.g. `"document.querySelector('button')?.click();"`. |
|
| **`js_code`** | `str or list[str]` (None) | JavaScript to run after load. E.g. `"document.querySelector('button')?.click();"`. |
|
||||||
| **`js_only`** | `bool` (False) | If `True`, indicates we’re reusing an existing session and only applying JS. No full reload. |
|
| **`c4a_script`** | `str or list[str]` (None) | C4A script that compiles to JavaScript. Alternative to writing raw JS. |
|
||||||
|
| **`js_only`** | `bool` (False) | If `True`, indicates we're reusing an existing session and only applying JS. No full reload. |
|
||||||
| **`ignore_body_visibility`** | `bool` (True) | Skip checking if `<body>` is visible. Usually best to keep `True`. |
|
| **`ignore_body_visibility`** | `bool` (True) | Skip checking if `<body>` is visible. Usually best to keep `True`. |
|
||||||
| **`scan_full_page`** | `bool` (False) | If `True`, auto-scroll the page to load dynamic content (infinite scroll). |
|
| **`scan_full_page`** | `bool` (False) | If `True`, auto-scroll the page to load dynamic content (infinite scroll). |
|
||||||
| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
|
| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
|
||||||
|
| **`max_scroll_steps`** | `int or None` (None) | Maximum number of scroll steps during full page scan. If None, scrolls until entire page is loaded. |
|
||||||
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
|
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
|
||||||
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
|
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
|
||||||
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
|
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
|
||||||
@@ -132,7 +167,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### E) **Media Handling**
|
### F) **Media Handling**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|--------------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------|
|
|--------------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------|
|
||||||
@@ -141,13 +176,16 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
|
|||||||
| **`screenshot_height_threshold`** | `int` (~20000) | If the page is taller than this, alternate screenshot strategies are used. |
|
| **`screenshot_height_threshold`** | `int` (~20000) | If the page is taller than this, alternate screenshot strategies are used. |
|
||||||
| **`pdf`** | `bool` (False) | If `True`, returns a PDF in `result.pdf`. |
|
| **`pdf`** | `bool` (False) | If `True`, returns a PDF in `result.pdf`. |
|
||||||
| **`capture_mhtml`** | `bool` (False) | If `True`, captures an MHTML snapshot of the page in `result.mhtml`. MHTML includes all page resources (CSS, images, etc.) in a single file. |
|
| **`capture_mhtml`** | `bool` (False) | If `True`, captures an MHTML snapshot of the page in `result.mhtml`. MHTML includes all page resources (CSS, images, etc.) in a single file. |
|
||||||
| **`image_description_min_word_threshold`** | `int` (~50) | Minimum words for an image’s alt text or description to be considered valid. |
|
| **`image_description_min_word_threshold`** | `int` (~50) | Minimum words for an image's alt text or description to be considered valid. |
|
||||||
| **`image_score_threshold`** | `int` (~3) | Filter out low-scoring images. The crawler scores images by relevance (size, context, etc.). |
|
| **`image_score_threshold`** | `int` (~3) | Filter out low-scoring images. The crawler scores images by relevance (size, context, etc.). |
|
||||||
| **`exclude_external_images`** | `bool` (False) | Exclude images from other domains. |
|
| **`exclude_external_images`** | `bool` (False) | Exclude images from other domains. |
|
||||||
|
| **`exclude_all_images`** | `bool` (False) | If `True`, excludes all images from processing (both internal and external). |
|
||||||
|
| **`table_score_threshold`** | `int` (7) | Minimum score threshold for processing a table. Lower values include more tables. |
|
||||||
|
| **`table_extraction`** | `TableExtractionStrategy` (DefaultTableExtraction) | Strategy for table extraction. Defaults to DefaultTableExtraction with configured threshold. |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### F) **Link/Domain Handling**
|
### G) **Link/Domain Handling**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|------------------------------|-------------------------|-----------------------------------------------------------------------------------------------------------------------------|
|
|------------------------------|-------------------------|-----------------------------------------------------------------------------------------------------------------------------|
|
||||||
@@ -155,23 +193,39 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
|
|||||||
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
|
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
|
||||||
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
|
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
|
||||||
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
|
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
|
||||||
|
| **`exclude_internal_links`** | `bool` (False) | If `True`, excludes internal links from the results. |
|
||||||
|
| **`score_links`** | `bool` (False) | If `True`, calculates intrinsic quality scores for all links using URL structure, text quality, and contextual metrics. |
|
||||||
| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |
|
| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |
|
||||||
|
|
||||||
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
|
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### G) **Debug & Logging**
|
### H) **Debug, Logging & Network Monitoring**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|----------------|--------------------|---------------------------------------------------------------------------|
|
|----------------|--------------------|---------------------------------------------------------------------------|
|
||||||
| **`verbose`** | `bool` (True) | Prints logs detailing each step of crawling, interactions, or errors. |
|
| **`verbose`** | `bool` (True) | Prints logs detailing each step of crawling, interactions, or errors. |
|
||||||
| **`log_console`** | `bool` (False) | Logs the page’s JavaScript console output if you want deeper JS debugging.|
|
| **`log_console`** | `bool` (False) | Logs the page's JavaScript console output if you want deeper JS debugging.|
|
||||||
|
| **`capture_network_requests`** | `bool` (False) | If `True`, captures network requests made by the page in `result.captured_requests`. |
|
||||||
|
| **`capture_console_messages`** | `bool` (False) | If `True`, captures console messages from the page in `result.console_messages`. |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
### I) **Connection & HTTP Parameters**
|
||||||
|
|
||||||
### H) **Virtual Scroll Configuration**
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|
|-----------------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`method`** | `str` ("GET") | HTTP method to use when using AsyncHTTPCrawlerStrategy (e.g., "GET", "POST"). |
|
||||||
|
| **`stream`** | `bool` (False) | If `True`, enables streaming mode for `arun_many()` to process URLs as they complete rather than waiting for all. |
|
||||||
|
| **`url`** | `str or None` (None) | URL for this specific config. Not typically set directly but used internally for URL-specific configurations. |
|
||||||
|
| **`user_agent`** | `str or None` (None) | Custom User-Agent string for this crawl. Can override browser-level user agent. |
|
||||||
|
| **`user_agent_mode`** | `str or None` (None) | Set to `"random"` to randomize user agent. Can override browser-level setting. |
|
||||||
|
| **`user_agent_generator_config`** | `dict` ({}) | Configuration for user agent generation when `user_agent_mode="random"`. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### J) **Virtual Scroll Configuration**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|------------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
|------------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
@@ -211,7 +265,7 @@ See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detaile
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### I) **URL Matching Configuration**
|
### K) **URL Matching Configuration**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
@@ -274,7 +328,25 @@ default_config = CrawlerRunConfig() # No url_matcher = matches everything
|
|||||||
- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found"
|
- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found"
|
||||||
- Always include a default config as the last item if you want to handle all URLs
|
- Always include a default config as the last item if you want to handle all URLs
|
||||||
|
|
||||||
---

## 2.2 Helper Methods
|
---
|
||||||
|
|
||||||
|
### L) **Advanced Crawling Features**
|
||||||
|
|
||||||
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|
|-----------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`deep_crawl_strategy`** | `DeepCrawlStrategy or None` (None) | Strategy for deep/recursive crawling. Enables automatic link following and multi-level site crawling. |
|
||||||
|
| **`link_preview_config`** | `LinkPreviewConfig or dict or None` (None) | Configuration for link head extraction and scoring. Fetches and scores link metadata without full page loads. |
|
||||||
|
| **`experimental`** | `dict or None` (None) | Dictionary for experimental/beta features not yet integrated into main parameters. Use with caution. |
|
||||||
|
|
||||||
|
**Deep Crawl Strategy** enables automatic site exploration by following links according to defined rules. Useful for sitemap generation or comprehensive site archiving.
|
||||||
|
|
||||||
|
**Link Preview Config** allows efficient link discovery and scoring by fetching only the `<head>` section of linked pages, enabling smart crawl prioritization without the overhead of full page loads.
|
||||||
|
|
||||||
|
**Experimental** parameters are features in beta testing. They may change or be removed in future versions. Check documentation for currently available experimental features.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2.2 Helper Methods
|
||||||
|
|
||||||
Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:
|
Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,11 @@ class BrowserConfig:
|
|||||||
def __init__(
|
def __init__(
|
||||||
browser_type="chromium",
|
browser_type="chromium",
|
||||||
headless=True,
|
headless=True,
|
||||||
|
browser_mode="dedicated",
|
||||||
|
use_managed_browser=False,
|
||||||
|
cdp_url=None,
|
||||||
|
debugging_port=9222,
|
||||||
|
host="localhost",
|
||||||
proxy_config=None,
|
proxy_config=None,
|
||||||
viewport_width=1080,
|
viewport_width=1080,
|
||||||
viewport_height=600,
|
viewport_height=600,
|
||||||
@@ -25,7 +30,13 @@ class BrowserConfig:
|
|||||||
user_data_dir=None,
|
user_data_dir=None,
|
||||||
cookies=None,
|
cookies=None,
|
||||||
headers=None,
|
headers=None,
|
||||||
user_agent=None,
|
user_agent=(
|
||||||
|
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
|
||||||
|
# "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||||
|
# "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36"
|
||||||
|
),
|
||||||
|
user_agent_mode="",
|
||||||
text_mode=False,
|
text_mode=False,
|
||||||
light_mode=False,
|
light_mode=False,
|
||||||
extra_args=None,
|
extra_args=None,
|
||||||
@@ -37,17 +48,33 @@ class BrowserConfig:
|
|||||||
|
|
||||||
### Key Fields to Note
|
### Key Fields to Note
|
||||||
|
|
||||||
1. **`browser_type`**
|
1.⠀**`browser_type`**
|
||||||
- Options: `"chromium"`, `"firefox"`, or `"webkit"`.
|
- Options: `"chromium"`, `"firefox"`, or `"webkit"`.
|
||||||
- Defaults to `"chromium"`.
|
- Defaults to `"chromium"`.
|
||||||
- If you need a different engine, specify it here.
|
- If you need a different engine, specify it here.
|
||||||
|
|
||||||
2. **`headless`**
|
2.⠀**`headless`**
|
||||||
- `True`: Runs the browser in headless mode (invisible browser).
|
- `True`: Runs the browser in headless mode (invisible browser).
|
||||||
- `False`: Runs the browser in visible mode, which helps with debugging.
|
- `False`: Runs the browser in visible mode, which helps with debugging.
|
||||||
|
|
||||||
3. **`proxy_config`**
|
3.⠀**`browser_mode`**
|
||||||
- A dictionary with fields like:
|
- Determines how the browser should be initialized:
|
||||||
|
- `"dedicated"` (default): Creates a new browser instance each time
|
||||||
|
- `"builtin"`: Uses the builtin CDP browser running in background
|
||||||
|
- `"custom"`: Uses explicit CDP settings provided in `cdp_url`
|
||||||
|
- `"docker"`: Runs browser in Docker container with isolation
|
||||||
|
|
||||||
|
4.⠀**`use_managed_browser`** & **`cdp_url`**
|
||||||
|
- `use_managed_browser=True`: Launch browser using Chrome DevTools Protocol (CDP) for advanced control
|
||||||
|
- `cdp_url`: URL for CDP endpoint (e.g., `"ws://localhost:9222/devtools/browser/"`)
|
||||||
|
- Automatically set based on `browser_mode`
|
||||||
|
|
||||||
|
5.⠀**`debugging_port`** & **`host`**
|
||||||
|
- `debugging_port`: Port for browser debugging protocol (default: 9222)
|
||||||
|
- `host`: Host for browser connection (default: "localhost")
|
||||||
|
|
||||||
|
6.⠀**`proxy_config`**
|
||||||
|
- A `ProxyConfig` object or dictionary with fields like:
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"server": "http://proxy.example.com:8080",
|
"server": "http://proxy.example.com:8080",
|
||||||
@@ -57,35 +84,35 @@ class BrowserConfig:
|
|||||||
```
|
```
|
||||||
- Leave as `None` if a proxy is not required.
|
- Leave as `None` if a proxy is not required.
|
||||||
|
|
||||||
4. **`viewport_width` & `viewport_height`**:
|
7.⠀**`viewport_width` & `viewport_height`**
|
||||||
- The initial window size.
|
- The initial window size.
|
||||||
- Some sites behave differently with smaller or bigger viewports.
|
- Some sites behave differently with smaller or bigger viewports.
|
||||||
|
|
||||||
5. **`verbose`**:
|
8.⠀**`verbose`**
|
||||||
- If `True`, prints extra logs.
|
- If `True`, prints extra logs.
|
||||||
- Handy for debugging.
|
- Handy for debugging.
|
||||||
|
|
||||||
6. **`use_persistent_context`**:
|
9.⠀**`use_persistent_context`**
|
||||||
- If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs.
|
- If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs.
|
||||||
- Typically also set `user_data_dir` to point to a folder.
|
- Typically also set `user_data_dir` to point to a folder.
|
||||||
|
|
||||||
7. **`cookies`** & **`headers`**:
|
10.⠀**`cookies`** & **`headers`**
|
||||||
- If you want to start with specific cookies or add universal HTTP headers, set them here.
|
- If you want to start with specific cookies or add universal HTTP headers to the browser context, set them here.
|
||||||
- E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`.
|
- E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`.
|
||||||
|
|
||||||
8. **`user_agent`**:
|
11.⠀**`user_agent`** & **`user_agent_mode`**
|
||||||
- Custom User-Agent string. If `None`, a default is used.
|
- `user_agent`: Custom User-Agent string. If `None`, a default is used.
|
||||||
- You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection).
|
- `user_agent_mode`: Set to `"random"` for randomization (helps fight bot detection).
|
||||||
|
|
||||||
9. **`text_mode`** & **`light_mode`**:
|
12.⠀**`text_mode`** & **`light_mode`**
|
||||||
- `text_mode=True` disables images, possibly speeding up text-only crawls.
|
- `text_mode=True` disables images, possibly speeding up text-only crawls.
|
||||||
- `light_mode=True` turns off certain background features for performance.
|
- `light_mode=True` turns off certain background features for performance.
|
||||||
|
|
||||||
10. **`extra_args`**:
|
13.⠀**`extra_args`**
|
||||||
- Additional flags for the underlying browser.
|
- Additional flags for the underlying browser.
|
||||||
- E.g. `["--disable-extensions"]`.
|
- E.g. `["--disable-extensions"]`.
|
||||||
|
|
||||||
11. **`enable_stealth`**:
|
14.⠀**`enable_stealth`**
|
||||||
- If `True`, enables stealth mode using playwright-stealth.
|
- If `True`, enables stealth mode using playwright-stealth.
|
||||||
- Modifies browser fingerprints to avoid basic bot detection.
|
- Modifies browser fingerprints to avoid basic bot detection.
|
||||||
- Default is `False`. Recommended for sites with bot protection.
|
- Default is `False`. Recommended for sites with bot protection.
|
||||||
@@ -134,9 +161,11 @@ class CrawlerRunConfig:
|
|||||||
def __init__(
|
def __init__(
|
||||||
word_count_threshold=200,
|
word_count_threshold=200,
|
||||||
extraction_strategy=None,
|
extraction_strategy=None,
|
||||||
|
chunking_strategy=RegexChunking(),
|
||||||
markdown_generator=None,
|
markdown_generator=None,
|
||||||
cache_mode=None,
|
cache_mode=CacheMode.BYPASS,
|
||||||
js_code=None,
|
js_code=None,
|
||||||
|
c4a_script=None,
|
||||||
wait_for=None,
|
wait_for=None,
|
||||||
screenshot=False,
|
screenshot=False,
|
||||||
pdf=False,
|
pdf=False,
|
||||||
@@ -145,13 +174,18 @@ class CrawlerRunConfig:
|
|||||||
locale=None, # e.g. "en-US", "fr-FR"
|
locale=None, # e.g. "en-US", "fr-FR"
|
||||||
timezone_id=None, # e.g. "America/New_York"
|
timezone_id=None, # e.g. "America/New_York"
|
||||||
geolocation=None, # GeolocationConfig object
|
geolocation=None, # GeolocationConfig object
|
||||||
# Resource Management
|
# Proxy Configuration
|
||||||
enable_rate_limiting=False,
|
proxy_config=None,
|
||||||
rate_limit_config=None,
|
proxy_rotation_strategy=None,
|
||||||
memory_threshold_percent=70.0,
|
# Page Interaction Parameters
|
||||||
check_interval=1.0,
|
scan_full_page=False,
|
||||||
max_session_permit=20,
|
scroll_delay=0.2,
|
||||||
display_mode=None,
|
wait_until="domcontentloaded",
|
||||||
|
page_timeout=60000,
|
||||||
|
delay_before_return_html=0.1,
|
||||||
|
# URL Matching Parameters
|
||||||
|
url_matcher=None, # For URL-specific configurations
|
||||||
|
match_mode=MatchMode.OR,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
stream=False, # Enable streaming for arun_many()
|
stream=False, # Enable streaming for arun_many()
|
||||||
# ... other advanced parameters omitted
|
# ... other advanced parameters omitted
|
||||||
@@ -161,69 +195,68 @@ class CrawlerRunConfig:
|
|||||||
|
|
||||||
### Key Fields to Note
|
### Key Fields to Note
|
||||||
|
|
||||||
1. **`word_count_threshold`**:
|
1.⠀**`word_count_threshold`**:
|
||||||
- The minimum word count before a block is considered.
|
- The minimum word count before a block is considered.
|
||||||
- If your site has lots of short paragraphs or items, you can lower it.
|
- If your site has lots of short paragraphs or items, you can lower it.
|
||||||
|
|
||||||
2. **`extraction_strategy`**:
|
2.⠀**`extraction_strategy`**:
|
||||||
- Where you plug in JSON-based extraction (CSS, LLM, etc.).
|
- Where you plug in JSON-based extraction (CSS, LLM, etc.).
|
||||||
- If `None`, no structured extraction is done (only raw/cleaned HTML + markdown).
|
- If `None`, no structured extraction is done (only raw/cleaned HTML + markdown).
|
||||||
|
|
||||||
3. **`markdown_generator`**:
|
3.⠀**`chunking_strategy`**:
|
||||||
|
- Strategy to chunk content before extraction.
|
||||||
|
- Defaults to `RegexChunking()`. Can be customized for different chunking approaches.
|
||||||
|
|
||||||
|
4.⠀**`markdown_generator`**:
|
||||||
- E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done.
|
- E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done.
|
||||||
- If `None`, a default approach is used.
|
- If `None`, a default approach is used.
|
||||||
|
|
||||||
4. **`cache_mode`**:
|
5.⠀**`cache_mode`**:
|
||||||
- Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.).
|
- Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.).
|
||||||
- If `None`, defaults to some level of caching or you can specify `CacheMode.ENABLED`.
|
- Defaults to `CacheMode.BYPASS`.
|
||||||
|
|
||||||
5. **`js_code`**:
|
6.⠀**`js_code`** & **`c4a_script`**:
|
||||||
- A string or list of JS strings to execute.
|
- `js_code`: A string or list of JavaScript strings to execute.
|
||||||
|
- `c4a_script`: C4A script that compiles to JavaScript.
|
||||||
- Great for "Load More" buttons or user interactions.
|
- Great for "Load More" buttons or user interactions.
|
||||||
|
|
||||||
6. **`wait_for`**:
|
7.⠀**`wait_for`**:
|
||||||
- A CSS or JS expression to wait for before extracting content.
|
- A CSS or JS expression to wait for before extracting content.
|
||||||
- Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`.
|
- Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`.
|
||||||
|
|
||||||
7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
|
8.⠀**`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
|
||||||
- If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.
|
- If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.
|
||||||
- The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string).
|
- The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string).
|
||||||
|
|
||||||
8. **Location Parameters**:
|
9.⠀**Location Parameters**:
|
||||||
- **`locale`**: Browser's locale (e.g., `"en-US"`, `"fr-FR"`) for language preferences
|
- **`locale`**: Browser's locale (e.g., `"en-US"`, `"fr-FR"`) for language preferences
|
||||||
- **`timezone_id`**: Browser's timezone (e.g., `"America/New_York"`, `"Europe/Paris"`)
|
- **`timezone_id`**: Browser's timezone (e.g., `"America/New_York"`, `"Europe/Paris"`)
|
||||||
- **`geolocation`**: GPS coordinates via `GeolocationConfig(latitude=48.8566, longitude=2.3522)`
|
- **`geolocation`**: GPS coordinates via `GeolocationConfig(latitude=48.8566, longitude=2.3522)`
|
||||||
- See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control)
|
- See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control)
|
||||||
|
|
||||||
9. **`verbose`**:
|
10.⠀**Proxy Configuration**:
|
||||||
- Logs additional runtime details.
|
- **`proxy_config`**: Proxy server configuration (ProxyConfig object or dict), e.g. `{"server": "...", "username": "...", "password": "..."}`
|
||||||
- Overlaps with the browser's verbosity if also set to `True` in `BrowserConfig`.
|
- **`proxy_rotation_strategy`**: Strategy for rotating proxies during crawls
|
||||||
|
|
||||||
10. **`enable_rate_limiting`**:
|
11.⠀**Page Interaction Parameters**:
|
||||||
- If `True`, enables rate limiting for batch processing.
|
- **`scan_full_page`**: If `True`, scroll through the entire page to load all content
|
||||||
- Requires `rate_limit_config` to be set.
|
- **`wait_until`**: Condition to wait for when navigating (e.g., "domcontentloaded", "networkidle")
|
||||||
|
- **`page_timeout`**: Timeout in milliseconds for page operations (default: 60000)
|
||||||
|
- **`delay_before_return_html`**: Delay in seconds before retrieving final HTML.
|
||||||
|
|
||||||
11. **`memory_threshold_percent`**:
|
12.⠀**`url_matcher`** & **`match_mode`**:
|
||||||
- The memory threshold (as a percentage) to monitor.
|
|
||||||
- If exceeded, the crawler will pause or slow down.
|
|
||||||
|
|
||||||
12. **`check_interval`**:
|
|
||||||
- The interval (in seconds) to check system resources.
|
|
||||||
- Affects how often memory and CPU usage are monitored.
|
|
||||||
|
|
||||||
13. **`max_session_permit`**:
|
|
||||||
- The maximum number of concurrent crawl sessions.
|
|
||||||
- Helps prevent overwhelming the system.
|
|
||||||
|
|
||||||
14. **`url_matcher`** & **`match_mode`**:
|
|
||||||
- Enable URL-specific configurations when used with `arun_many()`.
|
- Enable URL-specific configurations when used with `arun_many()`.
|
||||||
- Set `url_matcher` to patterns (glob, function, or list) to match specific URLs.
|
- Set `url_matcher` to patterns (glob, function, or list) to match specific URLs.
|
||||||
- Use `match_mode` (OR/AND) to control how multiple patterns combine.
|
- Use `match_mode` (OR/AND) to control how multiple patterns combine.
|
||||||
- See [URL-Specific Configurations](../api/arun_many.md#url-specific-configurations) for examples.
|
- See [URL-Specific Configurations](../api/arun_many.md#url-specific-configurations) for examples.
|
||||||
|
|
||||||
15. **`display_mode`**:
|
13.⠀**`verbose`**:
|
||||||
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
- Logs additional runtime details.
|
||||||
- Affects how much information is printed during the crawl.
|
- Overlaps with the browser's verbosity if also set to `True` in `BrowserConfig`.
|
||||||
|
|
||||||
|
14.⠀**`stream`**:
|
||||||
|
- If `True`, enables streaming mode for `arun_many()` to process URLs as they complete.
|
||||||
|
- Allows handling results incrementally instead of waiting for all URLs to finish.
|
||||||
|
|
||||||
|
|
||||||
### Helper Methods
|
### Helper Methods
|
||||||
@@ -263,16 +296,16 @@ The `clone()` method:
|
|||||||
|
|
||||||
### Key fields to note
|
### Key fields to note
|
||||||
|
|
||||||
1. **`provider`**:
|
1.⠀**`provider`**:
|
||||||
- Which LLM provider to use.
|
- Which LLM provider to use.
|
||||||
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*
|
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*
|
||||||
|
|
||||||
2. **`api_token`**:
|
2.⠀**`api_token`**:
|
||||||
- Optional. When not provided explicitly, `api_token` will be read from environment variables based on the provider. For example, if a Gemini model is passed as the provider, then `"GEMINI_API_KEY"` will be read from environment variables
|
- Optional. When not provided explicitly, `api_token` will be read from environment variables based on the provider. For example, if a Gemini model is passed as the provider, then `"GEMINI_API_KEY"` will be read from environment variables
|
||||||
- API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`
|
- API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`
|
||||||
- Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"`
|
- Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"`
|
||||||
|
|
||||||
3. **`base_url`**:
|
3.⠀**`base_url`**:
|
||||||
- If your provider has a custom endpoint
|
- If your provider has a custom endpoint
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
|||||||
Reference in New Issue
Block a user