Files
crawl4ai/crawl4ai/async_configs.py
UncleCode 4aeb7ef9ad refactor(proxy): consolidate proxy configuration handling
Moves ProxyConfig from configs/ directory into proxy_strategy.py to improve code organization and reduce fragmentation. Updates all imports and type hints to reflect the new location.

Key changes:
- Moved ProxyConfig class from configs/proxy_config.py to proxy_strategy.py
- Updated type hints in async_configs.py to support ProxyConfig
- Fixed proxy configuration handling in browser_manager.py
- Updated documentation and examples to use new import path

BREAKING CHANGE: ProxyConfig import path has changed from crawl4ai.configs to crawl4ai.proxy_strategy
2025-03-07 23:14:11 +08:00

1096 lines
49 KiB
Python

import os
from .config import (
DEFAULT_PROVIDER,
MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
PROVIDER_MODELS,
SCREENSHOT_HEIGHT_TRESHOLD,
PAGE_TIMEOUT,
IMAGE_SCORE_THRESHOLD,
SOCIAL_MEDIA_DOMAINS,
)
from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
from .deep_crawling import DeepCrawlStrategy
from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy
from typing import Union, List
import inspect
from typing import Any, Dict, Optional
from enum import Enum
from .proxy_strategy import ProxyConfig
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
"""
Recursively convert an object to a serializable dictionary using {type, params} structure
for complex objects.
"""
if obj is None:
return None
# Handle basic types
if isinstance(obj, (str, int, float, bool)):
return obj
# Handle Enum
if isinstance(obj, Enum):
return {"type": obj.__class__.__name__, "params": obj.value}
# Handle datetime objects
if hasattr(obj, "isoformat"):
return obj.isoformat()
# Handle lists, tuples, and sets, and basically any iterable
if isinstance(obj, (list, tuple, set)) or hasattr(obj, '__iter__') and not isinstance(obj, dict):
return [to_serializable_dict(item) for item in obj]
# Handle frozensets, which are not iterable
if isinstance(obj, frozenset):
return [to_serializable_dict(item) for item in list(obj)]
# Handle dictionaries - preserve them as-is
if isinstance(obj, dict):
return {
"type": "dict", # Mark as plain dictionary
"value": {str(k): to_serializable_dict(v) for k, v in obj.items()},
}
_type = obj.__class__.__name__
# Handle class instances
if hasattr(obj, "__class__"):
# Get constructor signature
sig = inspect.signature(obj.__class__.__init__)
params = sig.parameters
# Get current values
current_values = {}
for name, param in params.items():
if name == "self":
continue
value = getattr(obj, name, param.default)
# Only include if different from default, considering empty values
if not (is_empty_value(value) and is_empty_value(param.default)):
if value != param.default and not ignore_default_value:
current_values[name] = to_serializable_dict(value)
if hasattr(obj, '__slots__'):
for slot in obj.__slots__:
if slot.startswith('_'): # Handle private slots
attr_name = slot[1:] # Remove leading '_'
value = getattr(obj, slot, None)
if value is not None:
current_values[attr_name] = to_serializable_dict(value)
return {
"type": obj.__class__.__name__,
"params": current_values
}
return str(obj)
def from_serializable_dict(data: Any) -> Any:
"""
Recursively convert a serializable dictionary back to an object instance.
"""
if data is None:
return None
# Handle basic types
if isinstance(data, (str, int, float, bool)):
return data
# Handle typed data
if isinstance(data, dict) and "type" in data:
# Handle plain dictionaries
if data["type"] == "dict":
return {k: from_serializable_dict(v) for k, v in data["value"].items()}
# Import from crawl4ai for class instances
import crawl4ai
cls = getattr(crawl4ai, data["type"])
# Handle Enum
if issubclass(cls, Enum):
return cls(data["params"])
# Handle class instances
constructor_args = {
k: from_serializable_dict(v) for k, v in data["params"].items()
}
return cls(**constructor_args)
# Handle lists
if isinstance(data, list):
return [from_serializable_dict(item) for item in data]
# Handle raw dictionaries (legacy support)
if isinstance(data, dict):
return {k: from_serializable_dict(v) for k, v in data.items()}
return data
def is_empty_value(value: Any) -> bool:
"""Check if a value is effectively empty/null."""
if value is None:
return True
if isinstance(value, (list, tuple, set, dict, str)) and len(value) == 0:
return True
return False
class BrowserConfig:
"""
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
This class centralizes all parameters that affect browser and context creation. Instead of passing
scattered keyword arguments, users can instantiate and modify this configuration object. The crawler
code will then reference these settings to initialize the browser in a consistent, documented manner.
Attributes:
browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
Default: "chromium".
headless (bool): Whether to run the browser in headless mode (no visible GUI).
Default: True.
use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
advanced manipulation. Default: False.
cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: "ws://localhost:9222/devtools/browser/".
debugging_port (int): Port for the browser debugging protocol. Default: 9222.
use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
Automatically sets use_managed_browser=True. Default: False.
user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
temporary directory may be used. Default: None.
chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type
is "chromium". Default: "chromium".
channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type
is "chromium". Default: "chromium".
proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
Default: None.
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None.
viewport_width (int): Default viewport width for pages. Default: 1080.
viewport_height (int): Default viewport height for pages. Default: 600.
viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height.
Default: None.
verbose (bool): Enable verbose logging.
Default: True.
accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
Default: False.
downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
a default path will be created. Default: None.
storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage).
Default: None.
ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like
{"name": "...", "value": "...", "url": "..."}.
Default: [].
headers (dict): Extra HTTP headers to apply to all requests in this context.
Default: {}.
user_agent (str): Custom User-Agent string to use. Default: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36".
user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided
user_agent as-is. Default: None.
user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
Default: None.
text_mode (bool): If True, disables images and other rich content for potentially faster load times.
Default: False.
light_mode (bool): Disables certain background features for performance gains. Default: False.
extra_args (list): Additional command-line arguments passed to the browser.
Default: [].
"""
def __init__(
self,
browser_type: str = "chromium",
headless: bool = True,
use_managed_browser: bool = False,
cdp_url: str = None,
use_persistent_context: bool = False,
user_data_dir: str = None,
chrome_channel: str = "chromium",
channel: str = "chromium",
proxy: str = None,
proxy_config: Union[ProxyConfig, dict, None] = None,
viewport_width: int = 1080,
viewport_height: int = 600,
viewport: dict = None,
accept_downloads: bool = False,
downloads_path: str = None,
storage_state: Union[str, dict, None] = None,
ignore_https_errors: bool = True,
java_script_enabled: bool = True,
sleep_on_close: bool = False,
verbose: bool = True,
cookies: list = None,
headers: dict = None,
user_agent: str = (
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
# "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
# "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36"
),
user_agent_mode: str = "",
user_agent_generator_config: dict = {},
text_mode: bool = False,
light_mode: bool = False,
extra_args: list = None,
debugging_port: int = 9222,
host: str = "localhost",
):
self.browser_type = browser_type
self.headless = headless
self.use_managed_browser = use_managed_browser
self.cdp_url = cdp_url
self.use_persistent_context = use_persistent_context
self.user_data_dir = user_data_dir
self.chrome_channel = chrome_channel or self.browser_type or "chromium"
self.channel = channel or self.browser_type or "chromium"
if self.browser_type in ["firefox", "webkit"]:
self.channel = ""
self.chrome_channel = ""
self.proxy = proxy
self.proxy_config = proxy_config
self.viewport_width = viewport_width
self.viewport_height = viewport_height
self.viewport = viewport
if self.viewport is not None:
self.viewport_width = self.viewport.get("width", 1080)
self.viewport_height = self.viewport.get("height", 600)
self.accept_downloads = accept_downloads
self.downloads_path = downloads_path
self.storage_state = storage_state
self.ignore_https_errors = ignore_https_errors
self.java_script_enabled = java_script_enabled
self.cookies = cookies if cookies is not None else []
self.headers = headers if headers is not None else {}
self.user_agent = user_agent
self.user_agent_mode = user_agent_mode
self.user_agent_generator_config = user_agent_generator_config
self.text_mode = text_mode
self.light_mode = light_mode
self.extra_args = extra_args if extra_args is not None else []
self.sleep_on_close = sleep_on_close
self.verbose = verbose
self.debugging_port = debugging_port
fa_user_agenr_generator = ValidUAGenerator()
if self.user_agent_mode == "random":
self.user_agent = fa_user_agenr_generator.generate(
**(self.user_agent_generator_config or {})
)
else:
pass
self.browser_hint = UAGen.generate_client_hints(self.user_agent)
self.headers.setdefault("sec-ch-ua", self.browser_hint)
# If persistent context is requested, ensure managed browser is enabled
if self.use_persistent_context:
self.use_managed_browser = True
@staticmethod
def from_kwargs(kwargs: dict) -> "BrowserConfig":
return BrowserConfig(
browser_type=kwargs.get("browser_type", "chromium"),
headless=kwargs.get("headless", True),
use_managed_browser=kwargs.get("use_managed_browser", False),
cdp_url=kwargs.get("cdp_url"),
use_persistent_context=kwargs.get("use_persistent_context", False),
user_data_dir=kwargs.get("user_data_dir"),
chrome_channel=kwargs.get("chrome_channel", "chromium"),
channel=kwargs.get("channel", "chromium"),
proxy=kwargs.get("proxy"),
proxy_config=kwargs.get("proxy_config", None),
viewport_width=kwargs.get("viewport_width", 1080),
viewport_height=kwargs.get("viewport_height", 600),
accept_downloads=kwargs.get("accept_downloads", False),
downloads_path=kwargs.get("downloads_path"),
storage_state=kwargs.get("storage_state"),
ignore_https_errors=kwargs.get("ignore_https_errors", True),
java_script_enabled=kwargs.get("java_script_enabled", True),
cookies=kwargs.get("cookies", []),
headers=kwargs.get("headers", {}),
user_agent=kwargs.get(
"user_agent",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
),
user_agent_mode=kwargs.get("user_agent_mode"),
user_agent_generator_config=kwargs.get("user_agent_generator_config"),
text_mode=kwargs.get("text_mode", False),
light_mode=kwargs.get("light_mode", False),
extra_args=kwargs.get("extra_args", []),
)
def to_dict(self):
return {
"browser_type": self.browser_type,
"headless": self.headless,
"use_managed_browser": self.use_managed_browser,
"cdp_url": self.cdp_url,
"use_persistent_context": self.use_persistent_context,
"user_data_dir": self.user_data_dir,
"chrome_channel": self.chrome_channel,
"channel": self.channel,
"proxy": self.proxy,
"proxy_config": self.proxy_config,
"viewport_width": self.viewport_width,
"viewport_height": self.viewport_height,
"accept_downloads": self.accept_downloads,
"downloads_path": self.downloads_path,
"storage_state": self.storage_state,
"ignore_https_errors": self.ignore_https_errors,
"java_script_enabled": self.java_script_enabled,
"cookies": self.cookies,
"headers": self.headers,
"user_agent": self.user_agent,
"user_agent_mode": self.user_agent_mode,
"user_agent_generator_config": self.user_agent_generator_config,
"text_mode": self.text_mode,
"light_mode": self.light_mode,
"extra_args": self.extra_args,
"sleep_on_close": self.sleep_on_close,
"verbose": self.verbose,
"debugging_port": self.debugging_port,
}
def clone(self, **kwargs):
"""Create a copy of this configuration with updated values.
Args:
**kwargs: Key-value pairs of configuration options to update
Returns:
BrowserConfig: A new instance with the specified updates
"""
config_dict = self.to_dict()
config_dict.update(kwargs)
return BrowserConfig.from_kwargs(config_dict)
# Create a funciton returns dict of the object
def dump(self) -> dict:
# Serialize the object to a dictionary
return to_serializable_dict(self)
@staticmethod
def load(data: dict) -> "BrowserConfig":
# Deserialize the object from a dictionary
config = from_serializable_dict(data)
if isinstance(config, BrowserConfig):
return config
return BrowserConfig.from_kwargs(config)
class HTTPCrawlerConfig:
"""HTTP-specific crawler configuration"""
method: str = "GET"
headers: Optional[Dict[str, str]] = None
data: Optional[Dict[str, Any]] = None
json: Optional[Dict[str, Any]] = None
follow_redirects: bool = True
verify_ssl: bool = True
def __init__(
self,
method: str = "GET",
headers: Optional[Dict[str, str]] = None,
data: Optional[Dict[str, Any]] = None,
json: Optional[Dict[str, Any]] = None,
follow_redirects: bool = True,
verify_ssl: bool = True,
):
self.method = method
self.headers = headers
self.data = data
self.json = json
self.follow_redirects = follow_redirects
self.verify_ssl = verify_ssl
@staticmethod
def from_kwargs(kwargs: dict) -> "HTTPCrawlerConfig":
return HTTPCrawlerConfig(
method=kwargs.get("method", "GET"),
headers=kwargs.get("headers"),
data=kwargs.get("data"),
json=kwargs.get("json"),
follow_redirects=kwargs.get("follow_redirects", True),
verify_ssl=kwargs.get("verify_ssl", True),
)
def to_dict(self):
return {
"method": self.method,
"headers": self.headers,
"data": self.data,
"json": self.json,
"follow_redirects": self.follow_redirects,
"verify_ssl": self.verify_ssl,
}
def clone(self, **kwargs):
"""Create a copy of this configuration with updated values.
Args:
**kwargs: Key-value pairs of configuration options to update
Returns:
HTTPCrawlerConfig: A new instance with the specified updates
"""
config_dict = self.to_dict()
config_dict.update(kwargs)
return HTTPCrawlerConfig.from_kwargs(config_dict)
def dump(self) -> dict:
return to_serializable_dict(self)
@staticmethod
def load(data: dict) -> "HTTPCrawlerConfig":
config = from_serializable_dict(data)
if isinstance(config, HTTPCrawlerConfig):
return config
return HTTPCrawlerConfig.from_kwargs(config)
class CrawlerRunConfig():
_UNWANTED_PROPS = {
'disable_cache' : 'Instead, use cache_mode=CacheMode.DISABLED',
'bypass_cache' : 'Instead, use cache_mode=CacheMode.BYPASS',
'no_cache_read' : 'Instead, use cache_mode=CacheMode.WRITE_ONLY',
'no_cache_write' : 'Instead, use cache_mode=CacheMode.READ_ONLY',
}
"""
Configuration class for controlling how the crawler runs each crawl operation.
This includes parameters for content extraction, page manipulation, waiting conditions,
caching, and other runtime behaviors.
This centralizes parameters that were previously scattered as kwargs to `arun()` and related methods.
By using this class, you have a single place to understand and adjust the crawling options.
Attributes:
# Deep Crawl Parameters
deep_crawl_strategy (DeepCrawlStrategy or None): Strategy to use for deep crawling.
# Content Processing Parameters
word_count_threshold (int): Minimum word count threshold before processing content.
Default: MIN_WORD_THRESHOLD (typically 200).
extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages.
Default: None (NoExtractionStrategy is used if None).
chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction.
Default: RegexChunking().
markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
Default: None.
only_text (bool): If True, attempt to extract text-only content where applicable.
Default: False.
css_selector (str or None): CSS selector to extract a specific portion of the page.
Default: None.
excluded_tags (list of str or None): List of HTML tags to exclude from processing.
Default: None.
excluded_selector (str or None): CSS selector to exclude from processing.
Default: None.
keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
Default: False.
keep_attrs (list of str): List of HTML attributes to keep during processing.
Default: [].
remove_forms (bool): If True, remove all `<form>` elements from the HTML.
Default: False.
prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
Default: False.
parser_type (str): Type of parser to use for HTML parsing.
Default: "lxml".
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
Default: WebScrapingStrategy.
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None.
# SSL Parameters
fetch_ssl_certificate: bool = False,
# Caching Parameters
cache_mode (CacheMode or None): Defines how caching is handled.
If None, defaults to CacheMode.ENABLED internally.
Default: CacheMode.BYPASS.
session_id (str or None): Optional session ID to persist the browser context and the created
page instance. If the ID already exists, the crawler does not
create a new page and uses the current page to preserve the state.
bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS.
Default: False.
disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED.
Default: False.
no_cache_read (bool): Legacy parameter, if True acts like CacheMode.WRITE_ONLY.
Default: False.
no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
Default: False.
shared_data (dict or None): Shared data to be passed between hooks.
Default: None.
# Page Navigation and Timing Parameters
wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
Default: "domcontentloaded".
page_timeout (int): Timeout in ms for page operations like navigation.
Default: 60000 (60 seconds).
wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
Default: None.
wait_for_images (bool): If True, wait for images to load before extracting content.
Default: False.
delay_before_return_html (float): Delay in seconds before retrieving final HTML.
Default: 0.1.
mean_delay (float): Mean base delay between requests when calling arun_many.
Default: 0.1.
max_range (float): Max random additional delay range for requests in arun_many.
Default: 0.3.
semaphore_count (int): Number of concurrent operations allowed.
Default: 5.
# Page Interaction Parameters
js_code (str or list of str or None): JavaScript code/snippets to run on the page.
Default: None.
js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads.
Default: False.
ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding.
Default: True.
scan_full_page (bool): If True, scroll through the entire page to load all content.
Default: False.
scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
Default: 0.2.
process_iframes (bool): If True, attempts to process and inline iframe content.
Default: False.
remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
Default: False.
simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures.
Default: False.
override_navigator (bool): If True, overrides navigator properties for more human-like behavior.
Default: False.
magic (bool): If True, attempts automatic handling of overlays/popups.
Default: False.
adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions.
Default: False.
# Media Handling Parameters
screenshot (bool): Whether to take a screenshot after crawling.
Default: False.
screenshot_wait_for (float or None): Additional wait time before taking a screenshot.
Default: None.
screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy.
Default: SCREENSHOT_HEIGHT_TRESHOLD (from config, e.g. 20000).
pdf (bool): Whether to generate a PDF of the page.
Default: False.
image_description_min_word_threshold (int): Minimum words for image description extraction.
Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50).
image_score_threshold (int): Minimum score threshold for processing an image.
Default: IMAGE_SCORE_THRESHOLD (e.g., 3).
exclude_external_images (bool): If True, exclude all external images from processing.
Default: False.
# Link and Domain Handling Parameters
exclude_social_media_domains (list of str): List of domains to exclude for social media links.
Default: SOCIAL_MEDIA_DOMAINS (from config).
exclude_external_links (bool): If True, exclude all external links from the results.
Default: False.
exclude_internal_links (bool): If True, exclude internal links from the results.
Default: False.
exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
Default: False.
exclude_domains (list of str): List of specific domains to exclude from results.
Default: [].
exclude_internal_links (bool): If True, exclude internal links from the results.
Default: False.
# Debugging and Logging Parameters
verbose (bool): Enable verbose logging.
Default: True.
log_console (bool): If True, log console messages from the page.
Default: False.
# HTTP Crwler Strategy Parameters
method (str): HTTP method to use for the request, when using AsyncHTTPCrwalerStrategy.
Default: "GET".
data (dict): Data to send in the request body, when using AsyncHTTPCrwalerStrategy.
Default: None.
json (dict): JSON data to send in the request body, when using AsyncHTTPCrwalerStrategy.
# Connection Parameters
stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many.
Default: False.
check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
Default: False.
user_agent (str): Custom User-Agent string to use.
Default: None.
user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is.
Default: None.
user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
Default: None.
url: str = None # This is not a compulsory parameter
"""
def __init__(
self,
# Content Processing Parameters
word_count_threshold: int = MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
markdown_generator: MarkdownGenerationStrategy = None,
only_text: bool = False,
css_selector: str = None,
excluded_tags: list = None,
excluded_selector: str = None,
keep_data_attributes: bool = False,
keep_attrs: list = None,
remove_forms: bool = False,
prettiify: bool = False,
parser_type: str = "lxml",
scraping_strategy: ContentScrapingStrategy = None,
proxy_config: Union[ProxyConfig, dict, None] = None,
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
# SSL Parameters
fetch_ssl_certificate: bool = False,
# Caching Parameters
cache_mode: CacheMode = CacheMode.BYPASS,
session_id: str = None,
bypass_cache: bool = False,
disable_cache: bool = False,
no_cache_read: bool = False,
no_cache_write: bool = False,
shared_data: dict = None,
# Page Navigation and Timing Parameters
wait_until: str = "domcontentloaded",
page_timeout: int = PAGE_TIMEOUT,
wait_for: str = None,
wait_for_images: bool = False,
delay_before_return_html: float = 0.1,
mean_delay: float = 0.1,
max_range: float = 0.3,
semaphore_count: int = 5,
# Page Interaction Parameters
js_code: Union[str, List[str]] = None,
js_only: bool = False,
ignore_body_visibility: bool = True,
scan_full_page: bool = False,
scroll_delay: float = 0.2,
process_iframes: bool = False,
remove_overlay_elements: bool = False,
simulate_user: bool = False,
override_navigator: bool = False,
magic: bool = False,
adjust_viewport_to_content: bool = False,
# Media Handling Parameters
screenshot: bool = False,
screenshot_wait_for: float = None,
screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD,
pdf: bool = False,
image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
exclude_external_images: bool = False,
# Link and Domain Handling Parameters
exclude_social_media_domains: list = None,
exclude_external_links: bool = False,
exclude_social_media_links: bool = False,
exclude_domains: list = None,
exclude_internal_links: bool = False,
# Debugging and Logging Parameters
verbose: bool = True,
log_console: bool = False,
# Connection Parameters
method: str = "GET",
stream: bool = False,
url: str = None,
check_robots_txt: bool = False,
user_agent: str = None,
user_agent_mode: str = None,
user_agent_generator_config: dict = {},
# Deep Crawl Parameters
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
):
# TODO: Planning to set properties dynamically based on the __init__ signature
self.url = url
# Content Processing Parameters
self.word_count_threshold = word_count_threshold
self.extraction_strategy = extraction_strategy
self.chunking_strategy = chunking_strategy
self.markdown_generator = markdown_generator
self.only_text = only_text
self.css_selector = css_selector
self.excluded_tags = excluded_tags or []
self.excluded_selector = excluded_selector or ""
self.keep_data_attributes = keep_data_attributes
self.keep_attrs = keep_attrs or []
self.remove_forms = remove_forms
self.prettiify = prettiify
self.parser_type = parser_type
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
self.proxy_config = proxy_config
self.proxy_rotation_strategy = proxy_rotation_strategy
# SSL Parameters
self.fetch_ssl_certificate = fetch_ssl_certificate
# Caching Parameters
self.cache_mode = cache_mode
self.session_id = session_id
self.bypass_cache = bypass_cache
self.disable_cache = disable_cache
self.no_cache_read = no_cache_read
self.no_cache_write = no_cache_write
self.shared_data = shared_data
# Page Navigation and Timing Parameters
self.wait_until = wait_until
self.page_timeout = page_timeout
self.wait_for = wait_for
self.wait_for_images = wait_for_images
self.delay_before_return_html = delay_before_return_html
self.mean_delay = mean_delay
self.max_range = max_range
self.semaphore_count = semaphore_count
# Page Interaction Parameters
self.js_code = js_code
self.js_only = js_only
self.ignore_body_visibility = ignore_body_visibility
self.scan_full_page = scan_full_page
self.scroll_delay = scroll_delay
self.process_iframes = process_iframes
self.remove_overlay_elements = remove_overlay_elements
self.simulate_user = simulate_user
self.override_navigator = override_navigator
self.magic = magic
self.adjust_viewport_to_content = adjust_viewport_to_content
# Media Handling Parameters
self.screenshot = screenshot
self.screenshot_wait_for = screenshot_wait_for
self.screenshot_height_threshold = screenshot_height_threshold
self.pdf = pdf
self.image_description_min_word_threshold = image_description_min_word_threshold
self.image_score_threshold = image_score_threshold
self.exclude_external_images = exclude_external_images
# Link and Domain Handling Parameters
self.exclude_social_media_domains = (
exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS
)
self.exclude_external_links = exclude_external_links
self.exclude_social_media_links = exclude_social_media_links
self.exclude_domains = exclude_domains or []
self.exclude_internal_links = exclude_internal_links
# Debugging and Logging Parameters
self.verbose = verbose
self.log_console = log_console
# Connection Parameters
self.stream = stream
self.method = method
# Robots.txt Handling Parameters
self.check_robots_txt = check_robots_txt
# User Agent Parameters
self.user_agent = user_agent
self.user_agent_mode = user_agent_mode
self.user_agent_generator_config = user_agent_generator_config
# Validate type of extraction strategy and chunking strategy if they are provided
if self.extraction_strategy is not None and not isinstance(
self.extraction_strategy, ExtractionStrategy
):
raise ValueError(
"extraction_strategy must be an instance of ExtractionStrategy"
)
if self.chunking_strategy is not None and not isinstance(
self.chunking_strategy, ChunkingStrategy
):
raise ValueError(
"chunking_strategy must be an instance of ChunkingStrategy"
)
# Set default chunking strategy if None
if self.chunking_strategy is None:
self.chunking_strategy = RegexChunking()
# Deep Crawl Parameters
self.deep_crawl_strategy = deep_crawl_strategy
def __getattr__(self, name):
"""Handle attribute access."""
if name in self._UNWANTED_PROPS:
raise AttributeError(f"Getting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")
raise AttributeError(f"'{self.__class__.__name__}' has no attribute '{name}'")
def __setattr__(self, name, value):
"""Handle attribute setting."""
# TODO: Planning to set properties dynamically based on the __init__ signature
sig = inspect.signature(self.__init__)
all_params = sig.parameters # Dictionary of parameter names and their details
if name in self._UNWANTED_PROPS and value is not all_params[name].default:
raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")
super().__setattr__(name, value)
@staticmethod
def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
return CrawlerRunConfig(
# Content Processing Parameters
word_count_threshold=kwargs.get("word_count_threshold", 200),
extraction_strategy=kwargs.get("extraction_strategy"),
chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
markdown_generator=kwargs.get("markdown_generator"),
only_text=kwargs.get("only_text", False),
css_selector=kwargs.get("css_selector"),
excluded_tags=kwargs.get("excluded_tags", []),
excluded_selector=kwargs.get("excluded_selector", ""),
keep_data_attributes=kwargs.get("keep_data_attributes", False),
keep_attrs=kwargs.get("keep_attrs", []),
remove_forms=kwargs.get("remove_forms", False),
prettiify=kwargs.get("prettiify", False),
parser_type=kwargs.get("parser_type", "lxml"),
scraping_strategy=kwargs.get("scraping_strategy"),
proxy_config=kwargs.get("proxy_config"),
proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
# SSL Parameters
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
# Caching Parameters
cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
session_id=kwargs.get("session_id"),
bypass_cache=kwargs.get("bypass_cache", False),
disable_cache=kwargs.get("disable_cache", False),
no_cache_read=kwargs.get("no_cache_read", False),
no_cache_write=kwargs.get("no_cache_write", False),
shared_data=kwargs.get("shared_data", None),
# Page Navigation and Timing Parameters
wait_until=kwargs.get("wait_until", "domcontentloaded"),
page_timeout=kwargs.get("page_timeout", 60000),
wait_for=kwargs.get("wait_for"),
wait_for_images=kwargs.get("wait_for_images", False),
delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
mean_delay=kwargs.get("mean_delay", 0.1),
max_range=kwargs.get("max_range", 0.3),
semaphore_count=kwargs.get("semaphore_count", 5),
# Page Interaction Parameters
js_code=kwargs.get("js_code"),
js_only=kwargs.get("js_only", False),
ignore_body_visibility=kwargs.get("ignore_body_visibility", True),
scan_full_page=kwargs.get("scan_full_page", False),
scroll_delay=kwargs.get("scroll_delay", 0.2),
process_iframes=kwargs.get("process_iframes", False),
remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
simulate_user=kwargs.get("simulate_user", False),
override_navigator=kwargs.get("override_navigator", False),
magic=kwargs.get("magic", False),
adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False),
# Media Handling Parameters
screenshot=kwargs.get("screenshot", False),
screenshot_wait_for=kwargs.get("screenshot_wait_for"),
screenshot_height_threshold=kwargs.get(
"screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD
),
pdf=kwargs.get("pdf", False),
image_description_min_word_threshold=kwargs.get(
"image_description_min_word_threshold",
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
),
image_score_threshold=kwargs.get(
"image_score_threshold", IMAGE_SCORE_THRESHOLD
),
exclude_external_images=kwargs.get("exclude_external_images", False),
# Link and Domain Handling Parameters
exclude_social_media_domains=kwargs.get(
"exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS
),
exclude_external_links=kwargs.get("exclude_external_links", False),
exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
exclude_domains=kwargs.get("exclude_domains", []),
exclude_internal_links=kwargs.get("exclude_internal_links", False),
# Debugging and Logging Parameters
verbose=kwargs.get("verbose", True),
log_console=kwargs.get("log_console", False),
# Connection Parameters
method=kwargs.get("method", "GET"),
stream=kwargs.get("stream", False),
check_robots_txt=kwargs.get("check_robots_txt", False),
user_agent=kwargs.get("user_agent"),
user_agent_mode=kwargs.get("user_agent_mode"),
user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
# Deep Crawl Parameters
deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
url=kwargs.get("url"),
)
# Create a funciton returns dict of the object
def dump(self) -> dict:
# Serialize the object to a dictionary
return to_serializable_dict(self)
@staticmethod
def load(data: dict) -> "CrawlerRunConfig":
# Deserialize the object from a dictionary
config = from_serializable_dict(data)
if isinstance(config, CrawlerRunConfig):
return config
return CrawlerRunConfig.from_kwargs(config)
def to_dict(self):
return {
"word_count_threshold": self.word_count_threshold,
"extraction_strategy": self.extraction_strategy,
"chunking_strategy": self.chunking_strategy,
"markdown_generator": self.markdown_generator,
"only_text": self.only_text,
"css_selector": self.css_selector,
"excluded_tags": self.excluded_tags,
"excluded_selector": self.excluded_selector,
"keep_data_attributes": self.keep_data_attributes,
"keep_attrs": self.keep_attrs,
"remove_forms": self.remove_forms,
"prettiify": self.prettiify,
"parser_type": self.parser_type,
"scraping_strategy": self.scraping_strategy,
"proxy_config": self.proxy_config,
"proxy_rotation_strategy": self.proxy_rotation_strategy,
"fetch_ssl_certificate": self.fetch_ssl_certificate,
"cache_mode": self.cache_mode,
"session_id": self.session_id,
"bypass_cache": self.bypass_cache,
"disable_cache": self.disable_cache,
"no_cache_read": self.no_cache_read,
"no_cache_write": self.no_cache_write,
"shared_data": self.shared_data,
"wait_until": self.wait_until,
"page_timeout": self.page_timeout,
"wait_for": self.wait_for,
"wait_for_images": self.wait_for_images,
"delay_before_return_html": self.delay_before_return_html,
"mean_delay": self.mean_delay,
"max_range": self.max_range,
"semaphore_count": self.semaphore_count,
"js_code": self.js_code,
"js_only": self.js_only,
"ignore_body_visibility": self.ignore_body_visibility,
"scan_full_page": self.scan_full_page,
"scroll_delay": self.scroll_delay,
"process_iframes": self.process_iframes,
"remove_overlay_elements": self.remove_overlay_elements,
"simulate_user": self.simulate_user,
"override_navigator": self.override_navigator,
"magic": self.magic,
"adjust_viewport_to_content": self.adjust_viewport_to_content,
"screenshot": self.screenshot,
"screenshot_wait_for": self.screenshot_wait_for,
"screenshot_height_threshold": self.screenshot_height_threshold,
"pdf": self.pdf,
"image_description_min_word_threshold": self.image_description_min_word_threshold,
"image_score_threshold": self.image_score_threshold,
"exclude_external_images": self.exclude_external_images,
"exclude_social_media_domains": self.exclude_social_media_domains,
"exclude_external_links": self.exclude_external_links,
"exclude_social_media_links": self.exclude_social_media_links,
"exclude_domains": self.exclude_domains,
"exclude_internal_links": self.exclude_internal_links,
"verbose": self.verbose,
"log_console": self.log_console,
"method": self.method,
"stream": self.stream,
"check_robots_txt": self.check_robots_txt,
"user_agent": self.user_agent,
"user_agent_mode": self.user_agent_mode,
"user_agent_generator_config": self.user_agent_generator_config,
"deep_crawl_strategy": self.deep_crawl_strategy,
"url": self.url,
}
def clone(self, **kwargs):
"""Create a copy of this configuration with updated values.
Args:
**kwargs: Key-value pairs of configuration options to update
Returns:
CrawlerRunConfig: A new instance with the specified updates
Example:
```python
# Create a new config with streaming enabled
stream_config = config.clone(stream=True)
# Create a new config with multiple updates
new_config = config.clone(
stream=True,
cache_mode=CacheMode.BYPASS,
verbose=True
)
```
"""
config_dict = self.to_dict()
config_dict.update(kwargs)
return CrawlerRunConfig.from_kwargs(config_dict)
class LLMConfig:
def __init__(
self,
provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None,
base_url: Optional[str] = None,
):
"""Configuaration class for LLM provider and API token."""
self.provider = provider
if api_token and not api_token.startswith("env:"):
self.api_token = api_token
elif api_token and api_token.startswith("env:"):
self.api_token = os.getenv(api_token[4:])
else:
self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv(
"OPENAI_API_KEY"
)
self.base_url = base_url
@staticmethod
def from_kwargs(kwargs: dict) -> "LLMConfig":
return LLMConfig(
provider=kwargs.get("provider", DEFAULT_PROVIDER),
api_token=kwargs.get("api_token"),
base_url=kwargs.get("base_url"),
)
def to_dict(self):
return {
"provider": self.provider,
"api_token": self.api_token,
"base_url": self.base_url
}
def clone(self, **kwargs):
"""Create a copy of this configuration with updated values.
Args:
**kwargs: Key-value pairs of configuration options to update
Returns:
llm_config: A new instance with the specified updates
"""
config_dict = self.to_dict()
config_dict.update(kwargs)
return LLMConfig.from_kwargs(config_dict)