Merge branch 'develop' into release/v0.7.7
This commit is contained in:
@@ -4,14 +4,26 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
|
|||||||
from ..models import CrawlResult
|
from ..models import CrawlResult
|
||||||
from .bfs_strategy import BFSDeepCrawlStrategy # noqa
|
from .bfs_strategy import BFSDeepCrawlStrategy # noqa
|
||||||
from ..types import AsyncWebCrawler, CrawlerRunConfig
|
from ..types import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
from ..utils import normalize_url_for_deep_crawl
|
||||||
|
|
||||||
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||||
"""
|
"""
|
||||||
Depth-First Search (DFS) deep crawling strategy.
|
Depth-first deep crawling with familiar BFS rules.
|
||||||
|
|
||||||
Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
|
We reuse the same filters, scoring, and page limits from :class:`BFSDeepCrawlStrategy`,
|
||||||
Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
|
but walk the graph with a stack so we fully explore one branch before hopping to the
|
||||||
|
next. DFS also keeps its own ``_dfs_seen`` set so we can drop duplicate links at
|
||||||
|
discovery time without accidentally marking them as “already crawled”.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self._dfs_seen: Set[str] = set()
|
||||||
|
|
||||||
|
def _reset_seen(self, start_url: str) -> None:
|
||||||
|
"""Start each crawl with a clean dedupe set seeded with the root URL."""
|
||||||
|
self._dfs_seen = {start_url}
|
||||||
|
|
||||||
async def _arun_batch(
|
async def _arun_batch(
|
||||||
self,
|
self,
|
||||||
start_url: str,
|
start_url: str,
|
||||||
@@ -19,14 +31,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
|||||||
config: CrawlerRunConfig,
|
config: CrawlerRunConfig,
|
||||||
) -> List[CrawlResult]:
|
) -> List[CrawlResult]:
|
||||||
"""
|
"""
|
||||||
Batch (non-streaming) DFS mode.
|
Crawl level-by-level but emit results at the end.
|
||||||
Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
|
|
||||||
|
We keep a stack of ``(url, parent, depth)`` tuples, pop one at a time, and
|
||||||
|
hand it to ``crawler.arun_many`` with deep crawling disabled so we remain
|
||||||
|
in control of traversal. Every successful page bumps ``_pages_crawled`` and
|
||||||
|
seeds new stack items discovered via :meth:`link_discovery`.
|
||||||
"""
|
"""
|
||||||
visited: Set[str] = set()
|
visited: Set[str] = set()
|
||||||
# Stack items: (url, parent_url, depth)
|
# Stack items: (url, parent_url, depth)
|
||||||
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
|
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
|
||||||
depths: Dict[str, int] = {start_url: 0}
|
depths: Dict[str, int] = {start_url: 0}
|
||||||
results: List[CrawlResult] = []
|
results: List[CrawlResult] = []
|
||||||
|
self._reset_seen(start_url)
|
||||||
|
|
||||||
while stack and not self._cancel_event.is_set():
|
while stack and not self._cancel_event.is_set():
|
||||||
url, parent, depth = stack.pop()
|
url, parent, depth = stack.pop()
|
||||||
@@ -71,12 +88,16 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
|||||||
config: CrawlerRunConfig,
|
config: CrawlerRunConfig,
|
||||||
) -> AsyncGenerator[CrawlResult, None]:
|
) -> AsyncGenerator[CrawlResult, None]:
|
||||||
"""
|
"""
|
||||||
Streaming DFS mode.
|
Same traversal as :meth:`_arun_batch`, but yield pages immediately.
|
||||||
Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
|
|
||||||
|
Each popped URL is crawled, its metadata annotated, then the result gets
|
||||||
|
yielded before we even look at the next stack entry. Successful crawls
|
||||||
|
still feed :meth:`link_discovery`, keeping DFS order intact.
|
||||||
"""
|
"""
|
||||||
visited: Set[str] = set()
|
visited: Set[str] = set()
|
||||||
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
|
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
|
||||||
depths: Dict[str, int] = {start_url: 0}
|
depths: Dict[str, int] = {start_url: 0}
|
||||||
|
self._reset_seen(start_url)
|
||||||
|
|
||||||
while stack and not self._cancel_event.is_set():
|
while stack and not self._cancel_event.is_set():
|
||||||
url, parent, depth = stack.pop()
|
url, parent, depth = stack.pop()
|
||||||
@@ -108,3 +129,92 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
|||||||
for new_url, new_parent in reversed(new_links):
|
for new_url, new_parent in reversed(new_links):
|
||||||
new_depth = depths.get(new_url, depth + 1)
|
new_depth = depths.get(new_url, depth + 1)
|
||||||
stack.append((new_url, new_parent, new_depth))
|
stack.append((new_url, new_parent, new_depth))
|
||||||
|
|
||||||
|
async def link_discovery(
|
||||||
|
self,
|
||||||
|
result: CrawlResult,
|
||||||
|
source_url: str,
|
||||||
|
current_depth: int,
|
||||||
|
_visited: Set[str],
|
||||||
|
next_level: List[Tuple[str, Optional[str]]],
|
||||||
|
depths: Dict[str, int],
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Find the next URLs we should push onto the DFS stack.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
result : CrawlResult
|
||||||
|
Output of the page we just crawled; its ``links`` block is our raw material.
|
||||||
|
source_url : str
|
||||||
|
URL of the parent page; stored so callers can track ancestry.
|
||||||
|
current_depth : int
|
||||||
|
Depth of the parent; children naturally sit at ``current_depth + 1``.
|
||||||
|
_visited : Set[str]
|
||||||
|
Present to match the BFS signature, but we rely on ``_dfs_seen`` instead.
|
||||||
|
next_level : list of tuples
|
||||||
|
The stack buffer supplied by the caller; we append new ``(url, parent)`` items here.
|
||||||
|
depths : dict
|
||||||
|
Shared depth map so future metadata tagging knows how deep each URL lives.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
- ``_dfs_seen`` keeps us from pushing duplicates without touching the traversal guard.
|
||||||
|
- Validation, scoring, and capacity trimming mirror the BFS version so behaviour stays consistent.
|
||||||
|
"""
|
||||||
|
next_depth = current_depth + 1
|
||||||
|
if next_depth > self.max_depth:
|
||||||
|
return
|
||||||
|
|
||||||
|
remaining_capacity = self.max_pages - self._pages_crawled
|
||||||
|
if remaining_capacity <= 0:
|
||||||
|
self.logger.info(
|
||||||
|
f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
links = result.links.get("internal", [])
|
||||||
|
if self.include_external:
|
||||||
|
links += result.links.get("external", [])
|
||||||
|
|
||||||
|
seen = self._dfs_seen
|
||||||
|
valid_links: List[Tuple[str, float]] = []
|
||||||
|
|
||||||
|
for link in links:
|
||||||
|
raw_url = link.get("href")
|
||||||
|
if not raw_url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
|
||||||
|
if not normalized_url or normalized_url in seen:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not await self.can_process_url(raw_url, next_depth):
|
||||||
|
self.stats.urls_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
|
||||||
|
if score < self.score_threshold:
|
||||||
|
self.logger.debug(
|
||||||
|
f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
|
||||||
|
)
|
||||||
|
self.stats.urls_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
seen.add(normalized_url)
|
||||||
|
valid_links.append((normalized_url, score))
|
||||||
|
|
||||||
|
if len(valid_links) > remaining_capacity:
|
||||||
|
if self.url_scorer:
|
||||||
|
valid_links.sort(key=lambda x: x[1], reverse=True)
|
||||||
|
valid_links = valid_links[:remaining_capacity]
|
||||||
|
self.logger.info(
|
||||||
|
f"Limiting to {remaining_capacity} URLs due to max_pages limit"
|
||||||
|
)
|
||||||
|
|
||||||
|
for url, score in valid_links:
|
||||||
|
if score:
|
||||||
|
result.metadata = result.metadata or {}
|
||||||
|
result.metadata["score"] = score
|
||||||
|
next_level.append((url, source_url))
|
||||||
|
depths[url] = next_depth
|
||||||
|
|||||||
39
docs/examples/dfs_crawl_demo.py
Normal file
39
docs/examples/dfs_crawl_demo.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
"""
|
||||||
|
Simple demonstration of the DFS deep crawler visiting multiple pages.
|
||||||
|
|
||||||
|
Run with: python docs/examples/dfs_crawl_demo.py
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||||
|
from crawl4ai.cache_context import CacheMode
|
||||||
|
from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
|
||||||
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
|
||||||
|
dfs_strategy = DFSDeepCrawlStrategy(
|
||||||
|
max_depth=3,
|
||||||
|
max_pages=50,
|
||||||
|
include_external=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
deep_crawl_strategy=dfs_strategy,
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
markdown_generator=DefaultMarkdownGenerator(),
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
seed_url = "https://docs.python.org/3/" # Plenty of internal links
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
|
||||||
|
async for result in await crawler.arun(url=seed_url, config=config):
|
||||||
|
depth = result.metadata.get("depth")
|
||||||
|
status = "SUCCESS" if result.success else "FAILED"
|
||||||
|
print(f"[{status}] depth={depth} url={result.url}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
@@ -21,21 +21,35 @@ browser_cfg = BrowserConfig(
|
|||||||
|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
|
|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| **`browser_type`** | `"chromium"`, `"firefox"`, `"webkit"`<br/>*(default: `"chromium"`)* | Which browser engine to use. `"chromium"` is typical for many sites, `"firefox"` or `"webkit"` for specialized tests. |
|
| **`browser_type`** | `"chromium"`, `"firefox"`, `"webkit"`<br/>*(default: `"chromium"`)* | Which browser engine to use. `"chromium"` is typical for many sites, `"firefox"` or `"webkit"` for specialized tests. |
|
||||||
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
|
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
|
||||||
|
| **`browser_mode`** | `str` (default: `"dedicated"`) | How browser is initialized: `"dedicated"` (new instance), `"builtin"` (CDP background), `"custom"` (explicit CDP), `"docker"` (container). |
|
||||||
|
| **`use_managed_browser`** | `bool` (default: `False`) | Launch browser via CDP for advanced control. Set automatically based on `browser_mode`. |
|
||||||
|
| **`cdp_url`** | `str` (default: `None`) | Chrome DevTools Protocol endpoint URL (e.g., `"ws://localhost:9222/devtools/browser/"`). Set automatically based on `browser_mode`. |
|
||||||
|
| **`debugging_port`** | `int` (default: `9222`) | Port for browser debugging protocol. |
|
||||||
|
| **`host`** | `str` (default: `"localhost"`) | Host for browser connection. |
|
||||||
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
||||||
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
||||||
|
| **`viewport`** | `dict` (default: `None`) | Viewport dimensions dict. If set, overrides `viewport_width` and `viewport_height`. |
|
||||||
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
|
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
|
||||||
| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
|
| **`proxy_config`** | `ProxyConfig or dict` (default: `None`)| For advanced or multi-proxy needs, specify `ProxyConfig` object or dict like `{"server": "...", "username": "...", "password": "..."}`. |
|
||||||
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
||||||
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
|
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
|
||||||
|
| **`chrome_channel`** | `str` (default: `"chromium"`) | Chrome channel to launch (e.g., "chrome", "msedge"). Only for `browser_type="chromium"`. Auto-set to empty for Firefox/WebKit. |
|
||||||
|
| **`channel`** | `str` (default: `"chromium"`) | Alias for `chrome_channel`. |
|
||||||
|
| **`accept_downloads`** | `bool` (default: `False`) | Whether to allow file downloads. Requires `downloads_path` if `True`. |
|
||||||
|
| **`downloads_path`** | `str or None` (default: `None`) | Directory to store downloaded files. |
|
||||||
|
| **`storage_state`** | `str or dict or None` (default: `None`)| In-memory storage state (cookies, localStorage) to restore browser state. |
|
||||||
| **`ignore_https_errors`** | `bool` (default: `True`) | If `True`, continues despite invalid certificates (common in dev/staging). |
|
| **`ignore_https_errors`** | `bool` (default: `True`) | If `True`, continues despite invalid certificates (common in dev/staging). |
|
||||||
| **`java_script_enabled`** | `bool` (default: `True`) | Disable if you want no JS overhead, or if only static content is needed. |
|
| **`java_script_enabled`** | `bool` (default: `True`) | Disable if you want no JS overhead, or if only static content is needed. |
|
||||||
|
| **`sleep_on_close`** | `bool` (default: `False`) | Add a small delay when closing browser (can help with cleanup issues). |
|
||||||
| **`cookies`** | `list` (default: `[]`) | Pre-set cookies, each a dict like `{"name": "session", "value": "...", "url": "..."}`. |
|
| **`cookies`** | `list` (default: `[]`) | Pre-set cookies, each a dict like `{"name": "session", "value": "...", "url": "..."}`. |
|
||||||
| **`headers`** | `dict` (default: `{}`) | Extra HTTP headers for every request, e.g. `{"Accept-Language": "en-US"}`. |
|
| **`headers`** | `dict` (default: `{}`) | Extra HTTP headers for every request, e.g. `{"Accept-Language": "en-US"}`. |
|
||||||
| **`user_agent`** | `str` (default: Chrome-based UA) | Your custom or random user agent. `user_agent_mode="random"` can shuffle it. |
|
| **`user_agent`** | `str` (default: Chrome-based UA) | Your custom user agent string. |
|
||||||
| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
|
| **`user_agent_mode`** | `str` (default: `""`) | Set to `"random"` to randomize user agent from a pool (helps with bot detection). |
|
||||||
|
| **`user_agent_generator_config`** | `dict` (default: `{}`) | Configuration dict for user agent generation when `user_agent_mode="random"`. |
|
||||||
| **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. |
|
| **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. |
|
||||||
| **`use_managed_browser`** | `bool` (default: `False`) | For advanced “managed” interactions (debugging, CDP usage). Typically set automatically if persistent context is on. |
|
| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
|
||||||
| **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. `["--disable-extensions"]`. |
|
| **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. `["--disable-extensions"]`. |
|
||||||
|
| **`enable_stealth`** | `bool` (default: `False`) | Enable playwright-stealth mode to bypass bot detection. Cannot be used with `browser_mode="builtin"`. |
|
||||||
|
|
||||||
**Tips**:
|
**Tips**:
|
||||||
- Set `headless=False` to visually **debug** how pages load or how interactions proceed.
|
- Set `headless=False` to visually **debug** how pages load or how interactions proceed.
|
||||||
@@ -70,6 +84,7 @@ We group them by category.
|
|||||||
|------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------|
|
|------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------|
|
||||||
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
|
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
|
||||||
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
|
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
|
||||||
|
| **`chunking_strategy`** | `ChunkingStrategy` (default: RegexChunking()) | Strategy to chunk content before extraction. Can be customized for different chunking approaches. |
|
||||||
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). Can be customized with options such as `content_source` parameter to select the HTML input source ('cleaned_html', 'raw_html', or 'fit_html'). |
|
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). Can be customized with options such as `content_source` parameter to select the HTML input source ('cleaned_html', 'raw_html', or 'fit_html'). |
|
||||||
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
|
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
|
||||||
| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
|
| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
|
||||||
@@ -78,32 +93,50 @@ We group them by category.
|
|||||||
| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |
|
| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |
|
||||||
| **`prettiify`** | `bool` (False) | If `True`, beautifies final HTML (slower, purely cosmetic). |
|
| **`prettiify`** | `bool` (False) | If `True`, beautifies final HTML (slower, purely cosmetic). |
|
||||||
| **`keep_data_attributes`** | `bool` (False) | If `True`, preserve `data-*` attributes in cleaned HTML. |
|
| **`keep_data_attributes`** | `bool` (False) | If `True`, preserve `data-*` attributes in cleaned HTML. |
|
||||||
|
| **`keep_attrs`** | `list` (default: []) | List of HTML attributes to keep during processing (e.g., `["id", "class", "data-value"]`). |
|
||||||
| **`remove_forms`** | `bool` (False) | If `True`, remove all `<form>` elements. |
|
| **`remove_forms`** | `bool` (False) | If `True`, remove all `<form>` elements. |
|
||||||
|
| **`parser_type`** | `str` (default: "lxml") | HTML parser to use (e.g., "lxml", "html.parser"). |
|
||||||
|
| **`scraping_strategy`** | `ContentScrapingStrategy` (default: LXMLWebScrapingStrategy()) | Strategy to use for content scraping. Can be customized for different scraping needs (e.g., PDF extraction). |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### B) **Caching & Session**
|
### B) **Browser Location and Identity**
|
||||||
|
|
||||||
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|
|------------------------|---------------------------|--------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`locale`** | `str or None` (None) | Browser's locale (e.g., "en-US", "fr-FR") for language preferences. |
|
||||||
|
| **`timezone_id`** | `str or None` (None) | Browser's timezone (e.g., "America/New_York", "Europe/Paris"). |
|
||||||
|
| **`geolocation`** | `GeolocationConfig or None` (None) | GPS coordinates configuration. Use `GeolocationConfig(latitude=..., longitude=..., accuracy=...)`. |
|
||||||
|
| **`fetch_ssl_certificate`** | `bool` (False) | If `True`, fetches and includes SSL certificate information in the result. |
|
||||||
|
| **`proxy_config`** | `ProxyConfig or dict or None` (None) | Proxy configuration for this specific crawl. Can override browser-level proxy settings. |
|
||||||
|
| **`proxy_rotation_strategy`** | `ProxyRotationStrategy` (None) | Strategy for rotating proxies during crawl operations. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### C) **Caching & Session**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|-------------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------|
|
|-------------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| **`cache_mode`** | `CacheMode or None` | Controls how caching is handled (`ENABLED`, `BYPASS`, `DISABLED`, etc.). If `None`, typically defaults to `ENABLED`. |
|
| **`cache_mode`** | `CacheMode or None` | Controls how caching is handled (`ENABLED`, `BYPASS`, `DISABLED`, etc.). If `None`, typically defaults to `ENABLED`. |
|
||||||
| **`session_id`** | `str or None` | Assign a unique ID to reuse a single browser session across multiple `arun()` calls. |
|
| **`session_id`** | `str or None` | Assign a unique ID to reuse a single browser session across multiple `arun()` calls. |
|
||||||
| **`bypass_cache`** | `bool` (False) | If `True`, acts like `CacheMode.BYPASS`. |
|
| **`bypass_cache`** | `bool` (False) | **Deprecated.** If `True`, acts like `CacheMode.BYPASS`. Use `cache_mode` instead. |
|
||||||
| **`disable_cache`** | `bool` (False) | If `True`, acts like `CacheMode.DISABLED`. |
|
| **`disable_cache`** | `bool` (False) | **Deprecated.** If `True`, acts like `CacheMode.DISABLED`. Use `cache_mode` instead. |
|
||||||
| **`no_cache_read`** | `bool` (False) | If `True`, acts like `CacheMode.WRITE_ONLY` (writes cache but never reads). |
|
| **`no_cache_read`** | `bool` (False) | **Deprecated.** If `True`, acts like `CacheMode.WRITE_ONLY` (writes cache but never reads). Use `cache_mode` instead. |
|
||||||
| **`no_cache_write`** | `bool` (False) | If `True`, acts like `CacheMode.READ_ONLY` (reads cache but never writes). |
|
| **`no_cache_write`** | `bool` (False) | **Deprecated.** If `True`, acts like `CacheMode.READ_ONLY` (reads cache but never writes). Use `cache_mode` instead. |
|
||||||
|
| **`shared_data`** | `dict or None` (None) | Shared data to be passed between hooks and accessible across crawl operations. |
|
||||||
|
|
||||||
Use these for controlling whether you read or write from a local content cache. Handy for large batch crawls or repeated site visits.
|
Use these for controlling whether you read or write from a local content cache. Handy for large batch crawls or repeated site visits.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### C) **Page Navigation & Timing**
|
### D) **Page Navigation & Timing**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|----------------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
|
|----------------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
|
||||||
| **`wait_until`** | `str` (domcontentloaded)| Condition for navigation to “complete”. Often `"networkidle"` or `"domcontentloaded"`. |
|
| **`wait_until`** | `str` (domcontentloaded)| Condition for navigation to "complete". Often `"networkidle"` or `"domcontentloaded"`. |
|
||||||
| **`page_timeout`** | `int` (60000 ms) | Timeout for page navigation or JS steps. Increase for slow sites. |
|
| **`page_timeout`** | `int` (60000 ms) | Timeout for page navigation or JS steps. Increase for slow sites. |
|
||||||
| **`wait_for`** | `str or None` | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction. |
|
| **`wait_for`** | `str or None` | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction. |
|
||||||
|
| **`wait_for_timeout`** | `int or None` (None) | Specific timeout in ms for the `wait_for` condition. If None, uses `page_timeout`. |
|
||||||
| **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. |
|
| **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. |
|
||||||
| **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. |
|
| **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. |
|
||||||
| **`check_robots_txt`** | `bool` (False) | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency. |
|
| **`check_robots_txt`** | `bool` (False) | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency. |
|
||||||
@@ -112,15 +145,17 @@ Use these for controlling whether you read or write from a local content cache.
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### D) **Page Interaction**
|
### E) **Page Interaction**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|----------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
|
|----------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| **`js_code`** | `str or list[str]` (None) | JavaScript to run after load. E.g. `"document.querySelector('button')?.click();"`. |
|
| **`js_code`** | `str or list[str]` (None) | JavaScript to run after load. E.g. `"document.querySelector('button')?.click();"`. |
|
||||||
| **`js_only`** | `bool` (False) | If `True`, indicates we’re reusing an existing session and only applying JS. No full reload. |
|
| **`c4a_script`** | `str or list[str]` (None) | C4A script that compiles to JavaScript. Alternative to writing raw JS. |
|
||||||
|
| **`js_only`** | `bool` (False) | If `True`, indicates we're reusing an existing session and only applying JS. No full reload. |
|
||||||
| **`ignore_body_visibility`** | `bool` (True) | Skip checking if `<body>` is visible. Usually best to keep `True`. |
|
| **`ignore_body_visibility`** | `bool` (True) | Skip checking if `<body>` is visible. Usually best to keep `True`. |
|
||||||
| **`scan_full_page`** | `bool` (False) | If `True`, auto-scroll the page to load dynamic content (infinite scroll). |
|
| **`scan_full_page`** | `bool` (False) | If `True`, auto-scroll the page to load dynamic content (infinite scroll). |
|
||||||
| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
|
| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
|
||||||
|
| **`max_scroll_steps`** | `int or None` (None) | Maximum number of scroll steps during full page scan. If None, scrolls until entire page is loaded. |
|
||||||
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
|
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
|
||||||
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
|
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
|
||||||
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
|
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
|
||||||
@@ -132,7 +167,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### E) **Media Handling**
|
### F) **Media Handling**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|--------------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------|
|
|--------------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------|
|
||||||
@@ -141,13 +176,16 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
|
|||||||
| **`screenshot_height_threshold`** | `int` (~20000) | If the page is taller than this, alternate screenshot strategies are used. |
|
| **`screenshot_height_threshold`** | `int` (~20000) | If the page is taller than this, alternate screenshot strategies are used. |
|
||||||
| **`pdf`** | `bool` (False) | If `True`, returns a PDF in `result.pdf`. |
|
| **`pdf`** | `bool` (False) | If `True`, returns a PDF in `result.pdf`. |
|
||||||
| **`capture_mhtml`** | `bool` (False) | If `True`, captures an MHTML snapshot of the page in `result.mhtml`. MHTML includes all page resources (CSS, images, etc.) in a single file. |
|
| **`capture_mhtml`** | `bool` (False) | If `True`, captures an MHTML snapshot of the page in `result.mhtml`. MHTML includes all page resources (CSS, images, etc.) in a single file. |
|
||||||
| **`image_description_min_word_threshold`** | `int` (~50) | Minimum words for an image’s alt text or description to be considered valid. |
|
| **`image_description_min_word_threshold`** | `int` (~50) | Minimum words for an image's alt text or description to be considered valid. |
|
||||||
| **`image_score_threshold`** | `int` (~3) | Filter out low-scoring images. The crawler scores images by relevance (size, context, etc.). |
|
| **`image_score_threshold`** | `int` (~3) | Filter out low-scoring images. The crawler scores images by relevance (size, context, etc.). |
|
||||||
| **`exclude_external_images`** | `bool` (False) | Exclude images from other domains. |
|
| **`exclude_external_images`** | `bool` (False) | Exclude images from other domains. |
|
||||||
|
| **`exclude_all_images`** | `bool` (False) | If `True`, excludes all images from processing (both internal and external). |
|
||||||
|
| **`table_score_threshold`** | `int` (7) | Minimum score threshold for processing a table. Lower values include more tables. |
|
||||||
|
| **`table_extraction`** | `TableExtractionStrategy` (DefaultTableExtraction) | Strategy for table extraction. Defaults to DefaultTableExtraction with configured threshold. |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### F) **Link/Domain Handling**
|
### G) **Link/Domain Handling**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|------------------------------|-------------------------|-----------------------------------------------------------------------------------------------------------------------------|
|
|------------------------------|-------------------------|-----------------------------------------------------------------------------------------------------------------------------|
|
||||||
@@ -155,23 +193,39 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
|
|||||||
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
|
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
|
||||||
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
|
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
|
||||||
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
|
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
|
||||||
|
| **`exclude_internal_links`** | `bool` (False) | If `True`, excludes internal links from the results. |
|
||||||
|
| **`score_links`** | `bool` (False) | If `True`, calculates intrinsic quality scores for all links using URL structure, text quality, and contextual metrics. |
|
||||||
| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |
|
| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |
|
||||||
|
|
||||||
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
|
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### G) **Debug & Logging**
|
### H) **Debug, Logging & Network Monitoring**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|----------------|--------------------|---------------------------------------------------------------------------|
|
|----------------|--------------------|---------------------------------------------------------------------------|
|
||||||
| **`verbose`** | `bool` (True) | Prints logs detailing each step of crawling, interactions, or errors. |
|
| **`verbose`** | `bool` (True) | Prints logs detailing each step of crawling, interactions, or errors. |
|
||||||
| **`log_console`** | `bool` (False) | Logs the page’s JavaScript console output if you want deeper JS debugging.|
|
| **`log_console`** | `bool` (False) | Logs the page's JavaScript console output if you want deeper JS debugging.|
|
||||||
|
| **`capture_network_requests`** | `bool` (False) | If `True`, captures network requests made by the page in `result.captured_requests`. |
|
||||||
|
| **`capture_console_messages`** | `bool` (False) | If `True`, captures console messages from the page in `result.console_messages`. |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
### I) **Connection & HTTP Parameters**
|
||||||
|
|
||||||
### H) **Virtual Scroll Configuration**
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|
|-----------------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`method`** | `str` ("GET") | HTTP method to use when using AsyncHTTPCrawlerStrategy (e.g., "GET", "POST"). |
|
||||||
|
| **`stream`** | `bool` (False) | If `True`, enables streaming mode for `arun_many()` to process URLs as they complete rather than waiting for all. |
|
||||||
|
| **`url`** | `str or None` (None) | URL for this specific config. Not typically set directly but used internally for URL-specific configurations. |
|
||||||
|
| **`user_agent`** | `str or None` (None) | Custom User-Agent string for this crawl. Can override browser-level user agent. |
|
||||||
|
| **`user_agent_mode`** | `str or None` (None) | Set to `"random"` to randomize user agent. Can override browser-level setting. |
|
||||||
|
| **`user_agent_generator_config`** | `dict` ({}) | Configuration for user agent generation when `user_agent_mode="random"`. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### J) **Virtual Scroll Configuration**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|------------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
|------------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
@@ -211,7 +265,7 @@ See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detaile
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### I) **URL Matching Configuration**
|
### K) **URL Matching Configuration**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
@@ -274,7 +328,25 @@ default_config = CrawlerRunConfig() # No url_matcher = matches everything
|
|||||||
- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found"
|
- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found"
|
||||||
- Always include a default config as the last item if you want to handle all URLs
|
- Always include a default config as the last item if you want to handle all URLs
|
||||||
|
|
||||||
---

## 2.2 Helper Methods
|
---
|
||||||
|
|
||||||
|
### L) **Advanced Crawling Features**
|
||||||
|
|
||||||
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|
|-----------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`deep_crawl_strategy`** | `DeepCrawlStrategy or None` (None) | Strategy for deep/recursive crawling. Enables automatic link following and multi-level site crawling. |
|
||||||
|
| **`link_preview_config`** | `LinkPreviewConfig or dict or None` (None) | Configuration for link head extraction and scoring. Fetches and scores link metadata without full page loads. |
|
||||||
|
| **`experimental`** | `dict or None` (None) | Dictionary for experimental/beta features not yet integrated into main parameters. Use with caution. |
|
||||||
|
|
||||||
|
**Deep Crawl Strategy** enables automatic site exploration by following links according to defined rules. Useful for sitemap generation or comprehensive site archiving.
|
||||||
|
|
||||||
|
**Link Preview Config** allows efficient link discovery and scoring by fetching only the `<head>` section of linked pages, enabling smart crawl prioritization without the overhead of full page loads.
|
||||||
|
|
||||||
|
**Experimental** parameters are features in beta testing. They may change or be removed in future versions. Check documentation for currently available experimental features.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2.2 Helper Methods
|
||||||
|
|
||||||
Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:
|
Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,11 @@ class BrowserConfig:
|
|||||||
def __init__(
|
def __init__(
|
||||||
browser_type="chromium",
|
browser_type="chromium",
|
||||||
headless=True,
|
headless=True,
|
||||||
|
browser_mode="dedicated",
|
||||||
|
use_managed_browser=False,
|
||||||
|
cdp_url=None,
|
||||||
|
debugging_port=9222,
|
||||||
|
host="localhost",
|
||||||
proxy_config=None,
|
proxy_config=None,
|
||||||
viewport_width=1080,
|
viewport_width=1080,
|
||||||
viewport_height=600,
|
viewport_height=600,
|
||||||
@@ -25,7 +30,13 @@ class BrowserConfig:
|
|||||||
user_data_dir=None,
|
user_data_dir=None,
|
||||||
cookies=None,
|
cookies=None,
|
||||||
headers=None,
|
headers=None,
|
||||||
user_agent=None,
|
user_agent=(
|
||||||
|
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
|
||||||
|
# "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||||
|
# "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36"
|
||||||
|
),
|
||||||
|
user_agent_mode="",
|
||||||
text_mode=False,
|
text_mode=False,
|
||||||
light_mode=False,
|
light_mode=False,
|
||||||
extra_args=None,
|
extra_args=None,
|
||||||
@@ -37,17 +48,33 @@ class BrowserConfig:
|
|||||||
|
|
||||||
### Key Fields to Note
|
### Key Fields to Note
|
||||||
|
|
||||||
1. **`browser_type`**
|
1.⠀**`browser_type`**
|
||||||
- Options: `"chromium"`, `"firefox"`, or `"webkit"`.
|
- Options: `"chromium"`, `"firefox"`, or `"webkit"`.
|
||||||
- Defaults to `"chromium"`.
|
- Defaults to `"chromium"`.
|
||||||
- If you need a different engine, specify it here.
|
- If you need a different engine, specify it here.
|
||||||
|
|
||||||
2. **`headless`**
|
2.⠀**`headless`**
|
||||||
- `True`: Runs the browser in headless mode (invisible browser).
|
- `True`: Runs the browser in headless mode (invisible browser).
|
||||||
- `False`: Runs the browser in visible mode, which helps with debugging.
|
- `False`: Runs the browser in visible mode, which helps with debugging.
|
||||||
|
|
||||||
3. **`proxy_config`**
|
3.⠀**`browser_mode`**
|
||||||
- A dictionary with fields like:
|
- Determines how the browser should be initialized:
|
||||||
|
- `"dedicated"` (default): Creates a new browser instance each time
|
||||||
|
- `"builtin"`: Uses the builtin CDP browser running in background
|
||||||
|
- `"custom"`: Uses explicit CDP settings provided in `cdp_url`
|
||||||
|
- `"docker"`: Runs browser in Docker container with isolation
|
||||||
|
|
||||||
|
4.⠀**`use_managed_browser`** & **`cdp_url`**
|
||||||
|
- `use_managed_browser=True`: Launch browser using Chrome DevTools Protocol (CDP) for advanced control
|
||||||
|
- `cdp_url`: URL for CDP endpoint (e.g., `"ws://localhost:9222/devtools/browser/"`)
|
||||||
|
- Automatically set based on `browser_mode`
|
||||||
|
|
||||||
|
5.⠀**`debugging_port`** & **`host`**
|
||||||
|
- `debugging_port`: Port for browser debugging protocol (default: 9222)
|
||||||
|
- `host`: Host for browser connection (default: "localhost")
|
||||||
|
|
||||||
|
6.⠀**`proxy_config`**
|
||||||
|
- A `ProxyConfig` object or dictionary with fields like:
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"server": "http://proxy.example.com:8080",
|
"server": "http://proxy.example.com:8080",
|
||||||
@@ -57,35 +84,35 @@ class BrowserConfig:
|
|||||||
```
|
```
|
||||||
- Leave as `None` if a proxy is not required.
|
- Leave as `None` if a proxy is not required.
|
||||||
|
|
||||||
4. **`viewport_width` & `viewport_height`**:
|
7.⠀**`viewport_width` & `viewport_height`**
|
||||||
- The initial window size.
|
- The initial window size.
|
||||||
- Some sites behave differently with smaller or bigger viewports.
|
- Some sites behave differently with smaller or bigger viewports.
|
||||||
|
|
||||||
5. **`verbose`**:
|
8.⠀**`verbose`**
|
||||||
- If `True`, prints extra logs.
|
- If `True`, prints extra logs.
|
||||||
- Handy for debugging.
|
- Handy for debugging.
|
||||||
|
|
||||||
6. **`use_persistent_context`**:
|
9.⠀**`use_persistent_context`**
|
||||||
- If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs.
|
- If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs.
|
||||||
- Typically also set `user_data_dir` to point to a folder.
|
- Typically also set `user_data_dir` to point to a folder.
|
||||||
|
|
||||||
7. **`cookies`** & **`headers`**:
|
10.⠀**`cookies`** & **`headers`**
|
||||||
- If you want to start with specific cookies or add universal HTTP headers, set them here.
|
- If you want to start with specific cookies or add universal HTTP headers to the browser context, set them here.
|
||||||
- E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`.
|
- E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`.
|
||||||
|
|
||||||
8. **`user_agent`**:
|
11.⠀**`user_agent`** & **`user_agent_mode`**
|
||||||
- Custom User-Agent string. If `None`, a default is used.
|
- `user_agent`: Custom User-Agent string. If `None`, a default is used.
|
||||||
- You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection).
|
- `user_agent_mode`: Set to `"random"` for randomization (helps fight bot detection).
|
||||||
|
|
||||||
9. **`text_mode`** & **`light_mode`**:
|
12.⠀**`text_mode`** & **`light_mode`**
|
||||||
- `text_mode=True` disables images, possibly speeding up text-only crawls.
|
- `text_mode=True` disables images, possibly speeding up text-only crawls.
|
||||||
- `light_mode=True` turns off certain background features for performance.
|
- `light_mode=True` turns off certain background features for performance.
|
||||||
|
|
||||||
10. **`extra_args`**:
|
13.⠀**`extra_args`**
|
||||||
- Additional flags for the underlying browser.
|
- Additional flags for the underlying browser.
|
||||||
- E.g. `["--disable-extensions"]`.
|
- E.g. `["--disable-extensions"]`.
|
||||||
|
|
||||||
11. **`enable_stealth`**:
|
14.⠀**`enable_stealth`**
|
||||||
- If `True`, enables stealth mode using playwright-stealth.
|
- If `True`, enables stealth mode using playwright-stealth.
|
||||||
- Modifies browser fingerprints to avoid basic bot detection.
|
- Modifies browser fingerprints to avoid basic bot detection.
|
||||||
- Default is `False`. Recommended for sites with bot protection.
|
- Default is `False`. Recommended for sites with bot protection.
|
||||||
@@ -134,9 +161,11 @@ class CrawlerRunConfig:
|
|||||||
def __init__(
|
def __init__(
|
||||||
word_count_threshold=200,
|
word_count_threshold=200,
|
||||||
extraction_strategy=None,
|
extraction_strategy=None,
|
||||||
|
chunking_strategy=RegexChunking(),
|
||||||
markdown_generator=None,
|
markdown_generator=None,
|
||||||
cache_mode=None,
|
cache_mode=CacheMode.BYPASS,
|
||||||
js_code=None,
|
js_code=None,
|
||||||
|
c4a_script=None,
|
||||||
wait_for=None,
|
wait_for=None,
|
||||||
screenshot=False,
|
screenshot=False,
|
||||||
pdf=False,
|
pdf=False,
|
||||||
@@ -145,13 +174,18 @@ class CrawlerRunConfig:
|
|||||||
locale=None, # e.g. "en-US", "fr-FR"
|
locale=None, # e.g. "en-US", "fr-FR"
|
||||||
timezone_id=None, # e.g. "America/New_York"
|
timezone_id=None, # e.g. "America/New_York"
|
||||||
geolocation=None, # GeolocationConfig object
|
geolocation=None, # GeolocationConfig object
|
||||||
# Resource Management
|
# Proxy Configuration
|
||||||
enable_rate_limiting=False,
|
proxy_config=None,
|
||||||
rate_limit_config=None,
|
proxy_rotation_strategy=None,
|
||||||
memory_threshold_percent=70.0,
|
# Page Interaction Parameters
|
||||||
check_interval=1.0,
|
scan_full_page=False,
|
||||||
max_session_permit=20,
|
scroll_delay=0.2,
|
||||||
display_mode=None,
|
wait_until="domcontentloaded",
|
||||||
|
page_timeout=60000,
|
||||||
|
delay_before_return_html=0.1,
|
||||||
|
# URL Matching Parameters
|
||||||
|
url_matcher=None, # For URL-specific configurations
|
||||||
|
match_mode=MatchMode.OR,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
stream=False, # Enable streaming for arun_many()
|
stream=False, # Enable streaming for arun_many()
|
||||||
# ... other advanced parameters omitted
|
# ... other advanced parameters omitted
|
||||||
@@ -161,69 +195,68 @@ class CrawlerRunConfig:
|
|||||||
|
|
||||||
### Key Fields to Note
|
### Key Fields to Note
|
||||||
|
|
||||||
1. **`word_count_threshold`**:
|
1.⠀**`word_count_threshold`**:
|
||||||
- The minimum word count before a block is considered.
|
- The minimum word count before a block is considered.
|
||||||
- If your site has lots of short paragraphs or items, you can lower it.
|
- If your site has lots of short paragraphs or items, you can lower it.
|
||||||
|
|
||||||
2. **`extraction_strategy`**:
|
2.⠀**`extraction_strategy`**:
|
||||||
- Where you plug in JSON-based extraction (CSS, LLM, etc.).
|
- Where you plug in JSON-based extraction (CSS, LLM, etc.).
|
||||||
- If `None`, no structured extraction is done (only raw/cleaned HTML + markdown).
|
- If `None`, no structured extraction is done (only raw/cleaned HTML + markdown).
|
||||||
|
|
||||||
3. **`markdown_generator`**:
|
3.⠀**`chunking_strategy`**:
|
||||||
|
- Strategy to chunk content before extraction.
|
||||||
|
- Defaults to `RegexChunking()`. Can be customized for different chunking approaches.
|
||||||
|
|
||||||
|
4.⠀**`markdown_generator`**:
|
||||||
- E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done.
|
- E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done.
|
||||||
- If `None`, a default approach is used.
|
- If `None`, a default approach is used.
|
||||||
|
|
||||||
4. **`cache_mode`**:
|
5.⠀**`cache_mode`**:
|
||||||
- Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.).
|
- Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.).
|
||||||
- If `None`, defaults to some level of caching or you can specify `CacheMode.ENABLED`.
|
- Defaults to `CacheMode.BYPASS`.
|
||||||
|
|
||||||
5. **`js_code`**:
|
6.⠀**`js_code`** & **`c4a_script`**:
|
||||||
- A string or list of JS strings to execute.
|
- `js_code`: A string or list of JavaScript strings to execute.
|
||||||
|
- `c4a_script`: C4A script that compiles to JavaScript.
|
||||||
- Great for "Load More" buttons or user interactions.
|
- Great for "Load More" buttons or user interactions.
|
||||||
|
|
||||||
6. **`wait_for`**:
|
7.⠀**`wait_for`**:
|
||||||
- A CSS or JS expression to wait for before extracting content.
|
- A CSS or JS expression to wait for before extracting content.
|
||||||
- Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`.
|
- Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`.
|
||||||
|
|
||||||
7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
|
8.⠀**`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
|
||||||
- If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.
|
- If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.
|
||||||
- The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string).
|
- The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string).
|
||||||
|
|
||||||
8. **Location Parameters**:
|
9.⠀**Location Parameters**:
|
||||||
- **`locale`**: Browser's locale (e.g., `"en-US"`, `"fr-FR"`) for language preferences
|
- **`locale`**: Browser's locale (e.g., `"en-US"`, `"fr-FR"`) for language preferences
|
||||||
- **`timezone_id`**: Browser's timezone (e.g., `"America/New_York"`, `"Europe/Paris"`)
|
- **`timezone_id`**: Browser's timezone (e.g., `"America/New_York"`, `"Europe/Paris"`)
|
||||||
- **`geolocation`**: GPS coordinates via `GeolocationConfig(latitude=48.8566, longitude=2.3522)`
|
- **`geolocation`**: GPS coordinates via `GeolocationConfig(latitude=48.8566, longitude=2.3522)`
|
||||||
- See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control)
|
- See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control)
|
||||||
|
|
||||||
9. **`verbose`**:
|
10.⠀**Proxy Configuration**:
|
||||||
- Logs additional runtime details.
|
- **`proxy_config`**: Proxy server configuration (ProxyConfig object or dict), e.g. `{"server": "...", "username": "...", "password": "..."}`
|
||||||
- Overlaps with the browser's verbosity if also set to `True` in `BrowserConfig`.
|
- **`proxy_rotation_strategy`**: Strategy for rotating proxies during crawls
|
||||||
|
|
||||||
10. **`enable_rate_limiting`**:
|
11.⠀**Page Interaction Parameters**:
|
||||||
- If `True`, enables rate limiting for batch processing.
|
- **`scan_full_page`**: If `True`, scroll through the entire page to load all content
|
||||||
- Requires `rate_limit_config` to be set.
|
- **`wait_until`**: Condition to wait for when navigating (e.g., "domcontentloaded", "networkidle")
|
||||||
|
- **`page_timeout`**: Timeout in milliseconds for page operations (default: 60000)
|
||||||
|
- **`delay_before_return_html`**: Delay in seconds before retrieving final HTML.
|
||||||
|
|
||||||
11. **`memory_threshold_percent`**:
|
12.⠀**`url_matcher`** & **`match_mode`**:
|
||||||
- The memory threshold (as a percentage) to monitor.
|
|
||||||
- If exceeded, the crawler will pause or slow down.
|
|
||||||
|
|
||||||
12. **`check_interval`**:
|
|
||||||
- The interval (in seconds) to check system resources.
|
|
||||||
- Affects how often memory and CPU usage are monitored.
|
|
||||||
|
|
||||||
13. **`max_session_permit`**:
|
|
||||||
- The maximum number of concurrent crawl sessions.
|
|
||||||
- Helps prevent overwhelming the system.
|
|
||||||
|
|
||||||
14. **`url_matcher`** & **`match_mode`**:
|
|
||||||
- Enable URL-specific configurations when used with `arun_many()`.
|
- Enable URL-specific configurations when used with `arun_many()`.
|
||||||
- Set `url_matcher` to patterns (glob, function, or list) to match specific URLs.
|
- Set `url_matcher` to patterns (glob, function, or list) to match specific URLs.
|
||||||
- Use `match_mode` (OR/AND) to control how multiple patterns combine.
|
- Use `match_mode` (OR/AND) to control how multiple patterns combine.
|
||||||
- See [URL-Specific Configurations](../api/arun_many.md#url-specific-configurations) for examples.
|
- See [URL-Specific Configurations](../api/arun_many.md#url-specific-configurations) for examples.
|
||||||
|
|
||||||
15. **`display_mode`**:
|
13.⠀**`verbose`**:
|
||||||
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
- Logs additional runtime details.
|
||||||
- Affects how much information is printed during the crawl.
|
- Overlaps with the browser's verbosity if also set to `True` in `BrowserConfig`.
|
||||||
|
|
||||||
|
14.⠀**`stream`**:
|
||||||
|
- If `True`, enables streaming mode for `arun_many()` to process URLs as they complete.
|
||||||
|
- Allows handling results incrementally instead of waiting for all URLs to finish.
|
||||||
|
|
||||||
|
|
||||||
### Helper Methods
|
### Helper Methods
|
||||||
@@ -263,16 +296,16 @@ The `clone()` method:
|
|||||||
|
|
||||||
### Key fields to note
|
### Key fields to note
|
||||||
|
|
||||||
1. **`provider`**:
|
1.⠀**`provider`**:
|
||||||
- Which LLM provider to use.
|
- Which LLM provider to use.
|
||||||
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*
|
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*
|
||||||
|
|
||||||
2. **`api_token`**:
|
2.⠀**`api_token`**:
|
||||||
- Optional. When not provided explicitly, `api_token` will be read from environment variables based on the provider. For example, if a Gemini model is passed as the provider, then `"GEMINI_API_KEY"` will be read from environment variables
|
- Optional. When not provided explicitly, `api_token` will be read from environment variables based on the provider. For example, if a Gemini model is passed as the provider, then `"GEMINI_API_KEY"` will be read from environment variables
|
||||||
- API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`
|
- API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`
|
||||||
- Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"`
|
- Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"`
|
||||||
|
|
||||||
3. **`base_url`**:
|
3.⠀**`base_url`**:
|
||||||
- If your provider has a custom endpoint
|
- If your provider has a custom endpoint
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
|||||||
Reference in New Issue
Block a user