Merge branch 'develop' into release/v0.7.7

This commit is contained in:
ntohidi
2025-11-13 14:54:05 +01:00
4 changed files with 347 additions and 93 deletions

View File

@@ -4,14 +4,26 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from ..models import CrawlResult from ..models import CrawlResult
from .bfs_strategy import BFSDeepCrawlStrategy # noqa from .bfs_strategy import BFSDeepCrawlStrategy # noqa
from ..types import AsyncWebCrawler, CrawlerRunConfig from ..types import AsyncWebCrawler, CrawlerRunConfig
from ..utils import normalize_url_for_deep_crawl
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
""" """
Depth-First Search (DFS) deep crawling strategy. Depth-first deep crawling with familiar BFS rules.
Inherits URL validation and link discovery from BFSDeepCrawlStrategy. We reuse the same filters, scoring, and page limits from :class:`BFSDeepCrawlStrategy`,
Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal. but walk the graph with a stack so we fully explore one branch before hopping to the
next. DFS also keeps its own ``_dfs_seen`` set so we can drop duplicate links at
discovery time without accidentally marking them as “already crawled”.
""" """
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._dfs_seen: Set[str] = set()
def _reset_seen(self, start_url: str) -> None:
"""Start each crawl with a clean dedupe set seeded with the root URL."""
self._dfs_seen = {start_url}
async def _arun_batch( async def _arun_batch(
self, self,
start_url: str, start_url: str,
@@ -19,14 +31,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
config: CrawlerRunConfig, config: CrawlerRunConfig,
) -> List[CrawlResult]: ) -> List[CrawlResult]:
""" """
Batch (non-streaming) DFS mode. Crawl level-by-level but emit results at the end.
Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
We keep a stack of ``(url, parent, depth)`` tuples, pop one at a time, and
hand it to ``crawler.arun_many`` with deep crawling disabled so we remain
in control of traversal. Every successful page bumps ``_pages_crawled`` and
seeds new stack items discovered via :meth:`link_discovery`.
""" """
visited: Set[str] = set() visited: Set[str] = set()
# Stack items: (url, parent_url, depth) # Stack items: (url, parent_url, depth)
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)] stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0} depths: Dict[str, int] = {start_url: 0}
results: List[CrawlResult] = [] results: List[CrawlResult] = []
self._reset_seen(start_url)
while stack and not self._cancel_event.is_set(): while stack and not self._cancel_event.is_set():
url, parent, depth = stack.pop() url, parent, depth = stack.pop()
@@ -71,12 +88,16 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
config: CrawlerRunConfig, config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlResult, None]: ) -> AsyncGenerator[CrawlResult, None]:
""" """
Streaming DFS mode. Same traversal as :meth:`_arun_batch`, but yield pages immediately.
Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
Each popped URL is crawled, its metadata annotated, then the result gets
yielded before we even look at the next stack entry. Successful crawls
still feed :meth:`link_discovery`, keeping DFS order intact.
""" """
visited: Set[str] = set() visited: Set[str] = set()
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)] stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0} depths: Dict[str, int] = {start_url: 0}
self._reset_seen(start_url)
while stack and not self._cancel_event.is_set(): while stack and not self._cancel_event.is_set():
url, parent, depth = stack.pop() url, parent, depth = stack.pop()
@@ -108,3 +129,92 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
for new_url, new_parent in reversed(new_links): for new_url, new_parent in reversed(new_links):
new_depth = depths.get(new_url, depth + 1) new_depth = depths.get(new_url, depth + 1)
stack.append((new_url, new_parent, new_depth)) stack.append((new_url, new_parent, new_depth))
async def link_discovery(
self,
result: CrawlResult,
source_url: str,
current_depth: int,
_visited: Set[str],
next_level: List[Tuple[str, Optional[str]]],
depths: Dict[str, int],
) -> None:
"""
Find the next URLs we should push onto the DFS stack.
Parameters
----------
result : CrawlResult
Output of the page we just crawled; its ``links`` block is our raw material.
source_url : str
URL of the parent page; stored so callers can track ancestry.
current_depth : int
Depth of the parent; children naturally sit at ``current_depth + 1``.
_visited : Set[str]
Present to match the BFS signature, but we rely on ``_dfs_seen`` instead.
next_level : list of tuples
The stack buffer supplied by the caller; we append new ``(url, parent)`` items here.
depths : dict
Shared depth map so future metadata tagging knows how deep each URL lives.
Notes
-----
- ``_dfs_seen`` keeps us from pushing duplicates without touching the traversal guard.
- Validation, scoring, and capacity trimming mirror the BFS version so behaviour stays consistent.
"""
next_depth = current_depth + 1
if next_depth > self.max_depth:
return
remaining_capacity = self.max_pages - self._pages_crawled
if remaining_capacity <= 0:
self.logger.info(
f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
)
return
links = result.links.get("internal", [])
if self.include_external:
links += result.links.get("external", [])
seen = self._dfs_seen
valid_links: List[Tuple[str, float]] = []
for link in links:
raw_url = link.get("href")
if not raw_url:
continue
normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
if not normalized_url or normalized_url in seen:
continue
if not await self.can_process_url(raw_url, next_depth):
self.stats.urls_skipped += 1
continue
score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
if score < self.score_threshold:
self.logger.debug(
f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
)
self.stats.urls_skipped += 1
continue
seen.add(normalized_url)
valid_links.append((normalized_url, score))
if len(valid_links) > remaining_capacity:
if self.url_scorer:
valid_links.sort(key=lambda x: x[1], reverse=True)
valid_links = valid_links[:remaining_capacity]
self.logger.info(
f"Limiting to {remaining_capacity} URLs due to max_pages limit"
)
for url, score in valid_links:
if score:
result.metadata = result.metadata or {}
result.metadata["score"] = score
next_level.append((url, source_url))
depths[url] = next_depth

View File

@@ -0,0 +1,39 @@
"""
Simple demonstration of the DFS deep crawler visiting multiple pages.
Run with: python docs/examples/dfs_crawl_demo.py
"""
import asyncio
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.cache_context import CacheMode
from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
async def main() -> None:
dfs_strategy = DFSDeepCrawlStrategy(
max_depth=3,
max_pages=50,
include_external=False,
)
config = CrawlerRunConfig(
deep_crawl_strategy=dfs_strategy,
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(),
stream=True,
)
seed_url = "https://docs.python.org/3/" # Plenty of internal links
async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
async for result in await crawler.arun(url=seed_url, config=config):
depth = result.metadata.get("depth")
status = "SUCCESS" if result.success else "FAILED"
print(f"[{status}] depth={depth} url={result.url}")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -21,21 +21,35 @@ browser_cfg = BrowserConfig(
|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------| |-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
| **`browser_type`** | `"chromium"`, `"firefox"`, `"webkit"`<br/>*(default: `"chromium"`)* | Which browser engine to use. `"chromium"` is typical for many sites, `"firefox"` or `"webkit"` for specialized tests. | | **`browser_type`** | `"chromium"`, `"firefox"`, `"webkit"`<br/>*(default: `"chromium"`)* | Which browser engine to use. `"chromium"` is typical for many sites, `"firefox"` or `"webkit"` for specialized tests. |
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. | | **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
| **`browser_mode`** | `str` (default: `"dedicated"`) | How browser is initialized: `"dedicated"` (new instance), `"builtin"` (CDP background), `"custom"` (explicit CDP), `"docker"` (container). |
| **`use_managed_browser`** | `bool` (default: `False`) | Launch browser via CDP for advanced control. Set automatically based on `browser_mode`. |
| **`cdp_url`** | `str` (default: `None`) | Chrome DevTools Protocol endpoint URL (e.g., `"ws://localhost:9222/devtools/browser/"`). Set automatically based on `browser_mode`. |
| **`debugging_port`** | `int` (default: `9222`) | Port for browser debugging protocol. |
| **`host`** | `str` (default: `"localhost"`) | Host for browser connection. |
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. | | **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). | | **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
| **`viewport`** | `dict` (default: `None`) | Viewport dimensions dict. If set, overrides `viewport_width` and `viewport_height`. |
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. | | **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. | | **`proxy_config`** | `ProxyConfig or dict` (default: `None`)| For advanced or multi-proxy needs, specify `ProxyConfig` object or dict like `{"server": "...", "username": "...", "password": "..."}`. |
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. | | **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. | | **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
| **`chrome_channel`** | `str` (default: `"chromium"`) | Chrome channel to launch (e.g., "chrome", "msedge"). Only for `browser_type="chromium"`. Auto-set to empty for Firefox/WebKit. |
| **`channel`** | `str` (default: `"chromium"`) | Alias for `chrome_channel`. |
| **`accept_downloads`** | `bool` (default: `False`) | Whether to allow file downloads. Requires `downloads_path` if `True`. |
| **`downloads_path`** | `str or None` (default: `None`) | Directory to store downloaded files. |
| **`storage_state`** | `str or dict or None` (default: `None`)| In-memory storage state (cookies, localStorage) to restore browser state. |
| **`ignore_https_errors`** | `bool` (default: `True`) | If `True`, continues despite invalid certificates (common in dev/staging). | | **`ignore_https_errors`** | `bool` (default: `True`) | If `True`, continues despite invalid certificates (common in dev/staging). |
| **`java_script_enabled`** | `bool` (default: `True`) | Disable if you want no JS overhead, or if only static content is needed. | | **`java_script_enabled`** | `bool` (default: `True`) | Disable if you want no JS overhead, or if only static content is needed. |
| **`sleep_on_close`** | `bool` (default: `False`) | Add a small delay when closing browser (can help with cleanup issues). |
| **`cookies`** | `list` (default: `[]`) | Pre-set cookies, each a dict like `{"name": "session", "value": "...", "url": "..."}`. | | **`cookies`** | `list` (default: `[]`) | Pre-set cookies, each a dict like `{"name": "session", "value": "...", "url": "..."}`. |
| **`headers`** | `dict` (default: `{}`) | Extra HTTP headers for every request, e.g. `{"Accept-Language": "en-US"}`. | | **`headers`** | `dict` (default: `{}`) | Extra HTTP headers for every request, e.g. `{"Accept-Language": "en-US"}`. |
| **`user_agent`** | `str` (default: Chrome-based UA) | Your custom or random user agent. `user_agent_mode="random"` can shuffle it. | | **`user_agent`** | `str` (default: Chrome-based UA) | Your custom user agent string. |
| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. | | **`user_agent_mode`** | `str` (default: `""`) | Set to `"random"` to randomize user agent from a pool (helps with bot detection). |
| **`user_agent_generator_config`** | `dict` (default: `{}`) | Configuration dict for user agent generation when `user_agent_mode="random"`. |
| **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. | | **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. |
| **`use_managed_browser`** | `bool` (default: `False`) | For advanced “managed” interactions (debugging, CDP usage). Typically set automatically if persistent context is on. | | **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
| **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. `["--disable-extensions"]`. | | **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. `["--disable-extensions"]`. |
| **`enable_stealth`** | `bool` (default: `False`) | Enable playwright-stealth mode to bypass bot detection. Cannot be used with `browser_mode="builtin"`. |
**Tips**: **Tips**:
- Set `headless=False` to visually **debug** how pages load or how interactions proceed. - Set `headless=False` to visually **debug** how pages load or how interactions proceed.
@@ -70,6 +84,7 @@ We group them by category.
|------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------| |------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------|
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. | | **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). | | **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
| **`chunking_strategy`** | `ChunkingStrategy` (default: RegexChunking()) | Strategy to chunk content before extraction. Can be customized for different chunking approaches. |
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). Can be customized with options such as `content_source` parameter to select the HTML input source ('cleaned_html', 'raw_html', or 'fit_html'). | | **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). Can be customized with options such as `content_source` parameter to select the HTML input source ('cleaned_html', 'raw_html', or 'fit_html'). |
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. | | **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. | | **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
@@ -78,32 +93,50 @@ We group them by category.
| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. | | **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |
| **`prettiify`** | `bool` (False) | If `True`, beautifies final HTML (slower, purely cosmetic). | | **`prettiify`** | `bool` (False) | If `True`, beautifies final HTML (slower, purely cosmetic). |
| **`keep_data_attributes`** | `bool` (False) | If `True`, preserve `data-*` attributes in cleaned HTML. | | **`keep_data_attributes`** | `bool` (False) | If `True`, preserve `data-*` attributes in cleaned HTML. |
| **`keep_attrs`** | `list` (default: []) | List of HTML attributes to keep during processing (e.g., `["id", "class", "data-value"]`). |
| **`remove_forms`** | `bool` (False) | If `True`, remove all `<form>` elements. | | **`remove_forms`** | `bool` (False) | If `True`, remove all `<form>` elements. |
| **`parser_type`** | `str` (default: "lxml") | HTML parser to use (e.g., "lxml", "html.parser"). |
| **`scraping_strategy`** | `ContentScrapingStrategy` (default: LXMLWebScrapingStrategy()) | Strategy to use for content scraping. Can be customized for different scraping needs (e.g., PDF extraction). |
--- ---
### B) **Caching & Session** ### B) **Browser Location and Identity**
| **Parameter** | **Type / Default** | **What It Does** |
|------------------------|---------------------------|--------------------------------------------------------------------------------------------------------|
| **`locale`** | `str or None` (None) | Browser's locale (e.g., "en-US", "fr-FR") for language preferences. |
| **`timezone_id`** | `str or None` (None) | Browser's timezone (e.g., "America/New_York", "Europe/Paris"). |
| **`geolocation`** | `GeolocationConfig or None` (None) | GPS coordinates configuration. Use `GeolocationConfig(latitude=..., longitude=..., accuracy=...)`. |
| **`fetch_ssl_certificate`** | `bool` (False) | If `True`, fetches and includes SSL certificate information in the result. |
| **`proxy_config`** | `ProxyConfig or dict or None` (None) | Proxy configuration for this specific crawl. Can override browser-level proxy settings. |
| **`proxy_rotation_strategy`** | `ProxyRotationStrategy` (None) | Strategy for rotating proxies during crawl operations. |
---
### C) **Caching & Session**
| **Parameter** | **Type / Default** | **What It Does** | | **Parameter** | **Type / Default** | **What It Does** |
|-------------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------| |-------------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------|
| **`cache_mode`** | `CacheMode or None` | Controls how caching is handled (`ENABLED`, `BYPASS`, `DISABLED`, etc.). If `None`, typically defaults to `ENABLED`. | | **`cache_mode`** | `CacheMode or None` | Controls how caching is handled (`ENABLED`, `BYPASS`, `DISABLED`, etc.). If `None`, typically defaults to `ENABLED`. |
| **`session_id`** | `str or None` | Assign a unique ID to reuse a single browser session across multiple `arun()` calls. | | **`session_id`** | `str or None` | Assign a unique ID to reuse a single browser session across multiple `arun()` calls. |
| **`bypass_cache`** | `bool` (False) | If `True`, acts like `CacheMode.BYPASS`. | | **`bypass_cache`** | `bool` (False) | **Deprecated.** If `True`, acts like `CacheMode.BYPASS`. Use `cache_mode` instead. |
| **`disable_cache`** | `bool` (False) | If `True`, acts like `CacheMode.DISABLED`. | | **`disable_cache`** | `bool` (False) | **Deprecated.** If `True`, acts like `CacheMode.DISABLED`. Use `cache_mode` instead. |
| **`no_cache_read`** | `bool` (False) | If `True`, acts like `CacheMode.WRITE_ONLY` (writes cache but never reads). | | **`no_cache_read`** | `bool` (False) | **Deprecated.** If `True`, acts like `CacheMode.WRITE_ONLY` (writes cache but never reads). Use `cache_mode` instead. |
| **`no_cache_write`** | `bool` (False) | If `True`, acts like `CacheMode.READ_ONLY` (reads cache but never writes). | | **`no_cache_write`** | `bool` (False) | **Deprecated.** If `True`, acts like `CacheMode.READ_ONLY` (reads cache but never writes). Use `cache_mode` instead. |
| **`shared_data`** | `dict or None` (None) | Shared data to be passed between hooks and accessible across crawl operations. |
Use these for controlling whether you read or write from a local content cache. Handy for large batch crawls or repeated site visits. Use these for controlling whether you read or write from a local content cache. Handy for large batch crawls or repeated site visits.
--- ---
### C) **Page Navigation & Timing** ### D) **Page Navigation & Timing**
| **Parameter** | **Type / Default** | **What It Does** | | **Parameter** | **Type / Default** | **What It Does** |
|----------------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------| |----------------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
| **`wait_until`** | `str` (domcontentloaded)| Condition for navigation to complete. Often `"networkidle"` or `"domcontentloaded"`. | | **`wait_until`** | `str` (domcontentloaded)| Condition for navigation to "complete". Often `"networkidle"` or `"domcontentloaded"`. |
| **`page_timeout`** | `int` (60000 ms) | Timeout for page navigation or JS steps. Increase for slow sites. | | **`page_timeout`** | `int` (60000 ms) | Timeout for page navigation or JS steps. Increase for slow sites. |
| **`wait_for`** | `str or None` | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction. | | **`wait_for`** | `str or None` | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction. |
| **`wait_for_timeout`** | `int or None` (None) | Specific timeout in ms for the `wait_for` condition. If None, uses `page_timeout`. |
| **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. | | **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. |
| **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. | | **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. |
| **`check_robots_txt`** | `bool` (False) | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency. | | **`check_robots_txt`** | `bool` (False) | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency. |
@@ -112,15 +145,17 @@ Use these for controlling whether you read or write from a local content cache.
--- ---
### D) **Page Interaction** ### E) **Page Interaction**
| **Parameter** | **Type / Default** | **What It Does** | | **Parameter** | **Type / Default** | **What It Does** |
|----------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------| |----------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
| **`js_code`** | `str or list[str]` (None) | JavaScript to run after load. E.g. `"document.querySelector('button')?.click();"`. | | **`js_code`** | `str or list[str]` (None) | JavaScript to run after load. E.g. `"document.querySelector('button')?.click();"`. |
| **`js_only`** | `bool` (False) | If `True`, indicates were reusing an existing session and only applying JS. No full reload. | | **`c4a_script`** | `str or list[str]` (None) | C4A script that compiles to JavaScript. Alternative to writing raw JS. |
| **`js_only`** | `bool` (False) | If `True`, indicates we're reusing an existing session and only applying JS. No full reload. |
| **`ignore_body_visibility`** | `bool` (True) | Skip checking if `<body>` is visible. Usually best to keep `True`. | | **`ignore_body_visibility`** | `bool` (True) | Skip checking if `<body>` is visible. Usually best to keep `True`. |
| **`scan_full_page`** | `bool` (False) | If `True`, auto-scroll the page to load dynamic content (infinite scroll). | | **`scan_full_page`** | `bool` (False) | If `True`, auto-scroll the page to load dynamic content (infinite scroll). |
| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. | | **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
| **`max_scroll_steps`** | `int or None` (None) | Maximum number of scroll steps during full page scan. If None, scrolls until entire page is loaded. |
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. | | **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. | | **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. | | **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
@@ -132,7 +167,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
--- ---
### E) **Media Handling** ### F) **Media Handling**
| **Parameter** | **Type / Default** | **What It Does** | | **Parameter** | **Type / Default** | **What It Does** |
|--------------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------| |--------------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------|
@@ -141,13 +176,16 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
| **`screenshot_height_threshold`** | `int` (~20000) | If the page is taller than this, alternate screenshot strategies are used. | | **`screenshot_height_threshold`** | `int` (~20000) | If the page is taller than this, alternate screenshot strategies are used. |
| **`pdf`** | `bool` (False) | If `True`, returns a PDF in `result.pdf`. | | **`pdf`** | `bool` (False) | If `True`, returns a PDF in `result.pdf`. |
| **`capture_mhtml`** | `bool` (False) | If `True`, captures an MHTML snapshot of the page in `result.mhtml`. MHTML includes all page resources (CSS, images, etc.) in a single file. | | **`capture_mhtml`** | `bool` (False) | If `True`, captures an MHTML snapshot of the page in `result.mhtml`. MHTML includes all page resources (CSS, images, etc.) in a single file. |
| **`image_description_min_word_threshold`** | `int` (~50) | Minimum words for an images alt text or description to be considered valid. | | **`image_description_min_word_threshold`** | `int` (~50) | Minimum words for an image's alt text or description to be considered valid. |
| **`image_score_threshold`** | `int` (~3) | Filter out low-scoring images. The crawler scores images by relevance (size, context, etc.). | | **`image_score_threshold`** | `int` (~3) | Filter out low-scoring images. The crawler scores images by relevance (size, context, etc.). |
| **`exclude_external_images`** | `bool` (False) | Exclude images from other domains. | | **`exclude_external_images`** | `bool` (False) | Exclude images from other domains. |
| **`exclude_all_images`** | `bool` (False) | If `True`, excludes all images from processing (both internal and external). |
| **`table_score_threshold`** | `int` (7) | Minimum score threshold for processing a table. Lower values include more tables. |
| **`table_extraction`** | `TableExtractionStrategy` (DefaultTableExtraction) | Strategy for table extraction. Defaults to DefaultTableExtraction with configured threshold. |
--- ---
### F) **Link/Domain Handling** ### G) **Link/Domain Handling**
| **Parameter** | **Type / Default** | **What It Does** | | **Parameter** | **Type / Default** | **What It Does** |
|------------------------------|-------------------------|-----------------------------------------------------------------------------------------------------------------------------| |------------------------------|-------------------------|-----------------------------------------------------------------------------------------------------------------------------|
@@ -155,23 +193,39 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. | | **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). | | **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). | | **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
| **`exclude_internal_links`** | `bool` (False) | If `True`, excludes internal links from the results. |
| **`score_links`** | `bool` (False) | If `True`, calculates intrinsic quality scores for all links using URL structure, text quality, and contextual metrics. |
| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. | | **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains). Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
--- ---
### G) **Debug & Logging** ### H) **Debug, Logging & Network Monitoring**
| **Parameter** | **Type / Default** | **What It Does** | | **Parameter** | **Type / Default** | **What It Does** |
|----------------|--------------------|---------------------------------------------------------------------------| |----------------|--------------------|---------------------------------------------------------------------------|
| **`verbose`** | `bool` (True) | Prints logs detailing each step of crawling, interactions, or errors. | | **`verbose`** | `bool` (True) | Prints logs detailing each step of crawling, interactions, or errors. |
| **`log_console`** | `bool` (False) | Logs the pages JavaScript console output if you want deeper JS debugging.| | **`log_console`** | `bool` (False) | Logs the page's JavaScript console output if you want deeper JS debugging.|
| **`capture_network_requests`** | `bool` (False) | If `True`, captures network requests made by the page in `result.captured_requests`. |
| **`capture_console_messages`** | `bool` (False) | If `True`, captures console messages from the page in `result.console_messages`. |
--- ---
### I) **Connection & HTTP Parameters**
### H) **Virtual Scroll Configuration** | **Parameter** | **Type / Default** | **What It Does** |
|-----------------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
| **`method`** | `str` ("GET") | HTTP method to use when using AsyncHTTPCrawlerStrategy (e.g., "GET", "POST"). |
| **`stream`** | `bool` (False) | If `True`, enables streaming mode for `arun_many()` to process URLs as they complete rather than waiting for all. |
| **`url`** | `str or None` (None) | URL for this specific config. Not typically set directly but used internally for URL-specific configurations. |
| **`user_agent`** | `str or None` (None) | Custom User-Agent string for this crawl. Can override browser-level user agent. |
| **`user_agent_mode`** | `str or None` (None) | Set to `"random"` to randomize user agent. Can override browser-level setting. |
| **`user_agent_generator_config`** | `dict` ({}) | Configuration for user agent generation when `user_agent_mode="random"`. |
---
### J) **Virtual Scroll Configuration**
| **Parameter** | **Type / Default** | **What It Does** | | **Parameter** | **Type / Default** | **What It Does** |
|------------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| |------------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
@@ -211,7 +265,7 @@ See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detaile
--- ---
### I) **URL Matching Configuration** ### K) **URL Matching Configuration**
| **Parameter** | **Type / Default** | **What It Does** | | **Parameter** | **Type / Default** | **What It Does** |
|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| |------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
@@ -274,7 +328,25 @@ default_config = CrawlerRunConfig() # No url_matcher = matches everything
- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found" - If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found"
- Always include a default config as the last item if you want to handle all URLs - Always include a default config as the last item if you want to handle all URLs
---## 2.2 Helper Methods ---
### L) **Advanced Crawling Features**
| **Parameter** | **Type / Default** | **What It Does** |
|-----------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
| **`deep_crawl_strategy`** | `DeepCrawlStrategy or None` (None) | Strategy for deep/recursive crawling. Enables automatic link following and multi-level site crawling. |
| **`link_preview_config`** | `LinkPreviewConfig or dict or None` (None) | Configuration for link head extraction and scoring. Fetches and scores link metadata without full page loads. |
| **`experimental`** | `dict or None` (None) | Dictionary for experimental/beta features not yet integrated into main parameters. Use with caution. |
**Deep Crawl Strategy** enables automatic site exploration by following links according to defined rules. Useful for sitemap generation or comprehensive site archiving.
**Link Preview Config** allows efficient link discovery and scoring by fetching only the `<head>` section of linked pages, enabling smart crawl prioritization without the overhead of full page loads.
**Experimental** parameters are features in beta testing. They may change or be removed in future versions. Check documentation for currently available experimental features.
---
## 2.2 Helper Methods
Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies: Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:

View File

@@ -17,6 +17,11 @@ class BrowserConfig:
def __init__( def __init__(
browser_type="chromium", browser_type="chromium",
headless=True, headless=True,
browser_mode="dedicated",
use_managed_browser=False,
cdp_url=None,
debugging_port=9222,
host="localhost",
proxy_config=None, proxy_config=None,
viewport_width=1080, viewport_width=1080,
viewport_height=600, viewport_height=600,
@@ -25,7 +30,13 @@ class BrowserConfig:
user_data_dir=None, user_data_dir=None,
cookies=None, cookies=None,
headers=None, headers=None,
user_agent=None, user_agent=(
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
# "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
# "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36"
),
user_agent_mode="",
text_mode=False, text_mode=False,
light_mode=False, light_mode=False,
extra_args=None, extra_args=None,
@@ -37,17 +48,33 @@ class BrowserConfig:
### Key Fields to Note ### Key Fields to Note
1. **`browser_type`** 1.**`browser_type`**
- Options: `"chromium"`, `"firefox"`, or `"webkit"`. - Options: `"chromium"`, `"firefox"`, or `"webkit"`.
- Defaults to `"chromium"`. - Defaults to `"chromium"`.
- If you need a different engine, specify it here. - If you need a different engine, specify it here.
2. **`headless`** 2.**`headless`**
- `True`: Runs the browser in headless mode (invisible browser). - `True`: Runs the browser in headless mode (invisible browser).
- `False`: Runs the browser in visible mode, which helps with debugging. - `False`: Runs the browser in visible mode, which helps with debugging.
3. **`proxy_config`** 3.**`browser_mode`**
- A dictionary with fields like: - Determines how the browser should be initialized:
- `"dedicated"` (default): Creates a new browser instance each time
- `"builtin"`: Uses the builtin CDP browser running in background
- `"custom"`: Uses explicit CDP settings provided in `cdp_url`
- `"docker"`: Runs browser in Docker container with isolation
4.**`use_managed_browser`** & **`cdp_url`**
- `use_managed_browser=True`: Launch browser using Chrome DevTools Protocol (CDP) for advanced control
- `cdp_url`: URL for CDP endpoint (e.g., `"ws://localhost:9222/devtools/browser/"`)
- Automatically set based on `browser_mode`
5.**`debugging_port`** & **`host`**
- `debugging_port`: Port for browser debugging protocol (default: 9222)
- `host`: Host for browser connection (default: "localhost")
6.**`proxy_config`**
- A `ProxyConfig` object or dictionary with fields like:
```json ```json
{ {
"server": "http://proxy.example.com:8080", "server": "http://proxy.example.com:8080",
@@ -57,35 +84,35 @@ class BrowserConfig:
``` ```
- Leave as `None` if a proxy is not required. - Leave as `None` if a proxy is not required.
4. **`viewport_width` & `viewport_height`**: 7.**`viewport_width` & `viewport_height`**
- The initial window size. - The initial window size.
- Some sites behave differently with smaller or bigger viewports. - Some sites behave differently with smaller or bigger viewports.
5. **`verbose`**: 8.**`verbose`**
- If `True`, prints extra logs. - If `True`, prints extra logs.
- Handy for debugging. - Handy for debugging.
6. **`use_persistent_context`**: 9.**`use_persistent_context`**
- If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs. - If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs.
- Typically also set `user_data_dir` to point to a folder. - Typically also set `user_data_dir` to point to a folder.
7. **`cookies`** & **`headers`**: 10.**`cookies`** & **`headers`**
- If you want to start with specific cookies or add universal HTTP headers, set them here. - If you want to start with specific cookies or add universal HTTP headers to the browser context, set them here.
- E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`. - E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`.
8. **`user_agent`**: 11.**`user_agent`** & **`user_agent_mode`**
- Custom User-Agent string. If `None`, a default is used. - `user_agent`: Custom User-Agent string. If `None`, a default is used.
- You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection). - `user_agent_mode`: Set to `"random"` for randomization (helps fight bot detection).
9. **`text_mode`** & **`light_mode`**: 12.**`text_mode`** & **`light_mode`**
- `text_mode=True` disables images, possibly speeding up text-only crawls. - `text_mode=True` disables images, possibly speeding up text-only crawls.
- `light_mode=True` turns off certain background features for performance. - `light_mode=True` turns off certain background features for performance.
10. **`extra_args`**: 13.**`extra_args`**
- Additional flags for the underlying browser. - Additional flags for the underlying browser.
- E.g. `["--disable-extensions"]`. - E.g. `["--disable-extensions"]`.
11. **`enable_stealth`**: 14.**`enable_stealth`**
- If `True`, enables stealth mode using playwright-stealth. - If `True`, enables stealth mode using playwright-stealth.
- Modifies browser fingerprints to avoid basic bot detection. - Modifies browser fingerprints to avoid basic bot detection.
- Default is `False`. Recommended for sites with bot protection. - Default is `False`. Recommended for sites with bot protection.
@@ -134,9 +161,11 @@ class CrawlerRunConfig:
def __init__( def __init__(
word_count_threshold=200, word_count_threshold=200,
extraction_strategy=None, extraction_strategy=None,
chunking_strategy=RegexChunking(),
markdown_generator=None, markdown_generator=None,
cache_mode=None, cache_mode=CacheMode.BYPASS,
js_code=None, js_code=None,
c4a_script=None,
wait_for=None, wait_for=None,
screenshot=False, screenshot=False,
pdf=False, pdf=False,
@@ -145,13 +174,18 @@ class CrawlerRunConfig:
locale=None, # e.g. "en-US", "fr-FR" locale=None, # e.g. "en-US", "fr-FR"
timezone_id=None, # e.g. "America/New_York" timezone_id=None, # e.g. "America/New_York"
geolocation=None, # GeolocationConfig object geolocation=None, # GeolocationConfig object
# Resource Management # Proxy Configuration
enable_rate_limiting=False, proxy_config=None,
rate_limit_config=None, proxy_rotation_strategy=None,
memory_threshold_percent=70.0, # Page Interaction Parameters
check_interval=1.0, scan_full_page=False,
max_session_permit=20, scroll_delay=0.2,
display_mode=None, wait_until="domcontentloaded",
page_timeout=60000,
delay_before_return_html=0.1,
# URL Matching Parameters
url_matcher=None, # For URL-specific configurations
match_mode=MatchMode.OR,
verbose=True, verbose=True,
stream=False, # Enable streaming for arun_many() stream=False, # Enable streaming for arun_many()
# ... other advanced parameters omitted # ... other advanced parameters omitted
@@ -161,69 +195,68 @@ class CrawlerRunConfig:
### Key Fields to Note ### Key Fields to Note
1. **`word_count_threshold`**: 1.**`word_count_threshold`**:
- The minimum word count before a block is considered. - The minimum word count before a block is considered.
- If your site has lots of short paragraphs or items, you can lower it. - If your site has lots of short paragraphs or items, you can lower it.
2. **`extraction_strategy`**: 2.**`extraction_strategy`**:
- Where you plug in JSON-based extraction (CSS, LLM, etc.). - Where you plug in JSON-based extraction (CSS, LLM, etc.).
- If `None`, no structured extraction is done (only raw/cleaned HTML + markdown). - If `None`, no structured extraction is done (only raw/cleaned HTML + markdown).
3. **`markdown_generator`**: 3.**`chunking_strategy`**:
- Strategy to chunk content before extraction.
- Defaults to `RegexChunking()`. Can be customized for different chunking approaches.
4.**`markdown_generator`**:
- E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done. - E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done.
- If `None`, a default approach is used. - If `None`, a default approach is used.
4. **`cache_mode`**: 5.**`cache_mode`**:
- Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.). - Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.).
- If `None`, defaults to some level of caching or you can specify `CacheMode.ENABLED`. - Defaults to `CacheMode.BYPASS`.
5. **`js_code`**: 6.**`js_code`** & **`c4a_script`**:
- A string or list of JS strings to execute. - `js_code`: A string or list of JavaScript strings to execute.
- `c4a_script`: C4A script that compiles to JavaScript.
- Great for "Load More" buttons or user interactions. - Great for "Load More" buttons or user interactions.
6. **`wait_for`**: 7.**`wait_for`**:
- A CSS or JS expression to wait for before extracting content. - A CSS or JS expression to wait for before extracting content.
- Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`. - Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`.
7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**: 8.**`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
- If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded. - If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.
- The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string). - The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string).
8. **Location Parameters**: 9.**Location Parameters**:
- **`locale`**: Browser's locale (e.g., `"en-US"`, `"fr-FR"`) for language preferences - **`locale`**: Browser's locale (e.g., `"en-US"`, `"fr-FR"`) for language preferences
- **`timezone_id`**: Browser's timezone (e.g., `"America/New_York"`, `"Europe/Paris"`) - **`timezone_id`**: Browser's timezone (e.g., `"America/New_York"`, `"Europe/Paris"`)
- **`geolocation`**: GPS coordinates via `GeolocationConfig(latitude=48.8566, longitude=2.3522)` - **`geolocation`**: GPS coordinates via `GeolocationConfig(latitude=48.8566, longitude=2.3522)`
- See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control) - See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control)
9. **`verbose`**: 10.**Proxy Configuration**:
- Logs additional runtime details. - **`proxy_config`**: Proxy server configuration (ProxyConfig object or dict) e.g. {"server": "...", "username": "...", "password"}
- Overlaps with the browser's verbosity if also set to `True` in `BrowserConfig`. - **`proxy_rotation_strategy`**: Strategy for rotating proxies during crawls
10. **`enable_rate_limiting`**: 11.**Page Interaction Parameters**:
- If `True`, enables rate limiting for batch processing. - **`scan_full_page`**: If `True`, scroll through the entire page to load all content
- Requires `rate_limit_config` to be set. - **`wait_until`**: Condition to wait for when navigating (e.g., "domcontentloaded", "networkidle")
- **`page_timeout`**: Timeout in milliseconds for page operations (default: 60000)
- **`delay_before_return_html`**: Delay in seconds before retrieving final HTML.
11. **`memory_threshold_percent`**: 12.**`url_matcher`** & **`match_mode`**:
- The memory threshold (as a percentage) to monitor.
- If exceeded, the crawler will pause or slow down.
12. **`check_interval`**:
- The interval (in seconds) to check system resources.
- Affects how often memory and CPU usage are monitored.
13. **`max_session_permit`**:
- The maximum number of concurrent crawl sessions.
- Helps prevent overwhelming the system.
14. **`url_matcher`** & **`match_mode`**:
- Enable URL-specific configurations when used with `arun_many()`. - Enable URL-specific configurations when used with `arun_many()`.
- Set `url_matcher` to patterns (glob, function, or list) to match specific URLs. - Set `url_matcher` to patterns (glob, function, or list) to match specific URLs.
- Use `match_mode` (OR/AND) to control how multiple patterns combine. - Use `match_mode` (OR/AND) to control how multiple patterns combine.
- See [URL-Specific Configurations](../api/arun_many.md#url-specific-configurations) for examples. - See [URL-Specific Configurations](../api/arun_many.md#url-specific-configurations) for examples.
15. **`display_mode`**: 13.**`verbose`**:
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.). - Logs additional runtime details.
- Affects how much information is printed during the crawl. - Overlaps with the browser's verbosity if also set to `True` in `BrowserConfig`.
14.**`stream`**:
- If `True`, enables streaming mode for `arun_many()` to process URLs as they complete.
- Allows handling results incrementally instead of waiting for all URLs to finish.
### Helper Methods ### Helper Methods
@@ -263,16 +296,16 @@ The `clone()` method:
### Key fields to note ### Key fields to note
1. **`provider`**: 1.**`provider`**:
- Which LLM provider to use. - Which LLM provider to use.
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)* - Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*
2. **`api_token`**: 2.**`api_token`**:
- Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then,`"GEMINI_API_KEY"` will be read from environment variables - Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then,`"GEMINI_API_KEY"` will be read from environment variables
- API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"` - API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`
- Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"` - Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"`
3. **`base_url`**: 3.**`base_url`**:
- If your provider has a custom endpoint - If your provider has a custom endpoint
```python ```python