diff --git a/README.md b/README.md index a9fcdd19..088adb2b 100644 --- a/README.md +++ b/README.md @@ -407,7 +407,7 @@ if __name__ == "__main__": ```python import os import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig from crawl4ai.extraction_strategy import LLMExtractionStrategy from pydantic import BaseModel, Field @@ -423,7 +423,7 @@ async def main(): extraction_strategy=LLMExtractionStrategy( # Here you can use any provider that Litellm library supports, for instance: ollama/qwen2 # provider="ollama/qwen2", api_token="no-token", - provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'), + llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')), schema=OpenAIModelFee.schema(), extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 685caeb1..4c89d506 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1,13 +1,16 @@ +import os from .config import ( + DEFAULT_PROVIDER, MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + PROVIDER_MODELS, SCREENSHOT_HEIGHT_TRESHOLD, PAGE_TIMEOUT, IMAGE_SCORE_THRESHOLD, SOCIAL_MEDIA_DOMAINS, ) -from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator +from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import MarkdownGenerationStrategy @@ -19,7 +22,8 @@ from .proxy_strategy import ProxyRotationStrategy import inspect from typing import Any, Dict, Optional -from enum import Enum +from enum import Enum + def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: """ @@ -28,26 +32,23 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: """ if obj is None: return None - + # Handle basic types if isinstance(obj, (str, int, float, bool)): return obj - + # Handle Enum if isinstance(obj, Enum): - return { - "type": obj.__class__.__name__, - "params": obj.value - } - + return {"type": obj.__class__.__name__, "params": obj.value} + # Handle datetime objects - if hasattr(obj, 'isoformat'): + if hasattr(obj, "isoformat"): return obj.isoformat() - + # Handle lists, tuples, and sets, and basically any iterable if isinstance(obj, (list, tuple, set)) or hasattr(obj, '__iter__') and not isinstance(obj, dict): return [to_serializable_dict(item) for item in obj] - + # Handle frozensets, which are not iterable if isinstance(obj, frozenset): return [to_serializable_dict(item) for item in list(obj)] @@ -56,25 +57,25 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: if isinstance(obj, dict): return { 
"type": "dict", # Mark as plain dictionary - "value": {str(k): to_serializable_dict(v) for k, v in obj.items()} + "value": {str(k): to_serializable_dict(v) for k, v in obj.items()}, } _type = obj.__class__.__name__ # Handle class instances - if hasattr(obj, '__class__'): + if hasattr(obj, "__class__"): # Get constructor signature sig = inspect.signature(obj.__class__.__init__) params = sig.parameters - + # Get current values current_values = {} for name, param in params.items(): - if name == 'self': + if name == "self": continue - + value = getattr(obj, name, param.default) - + # Only include if different from default, considering empty values if not (is_empty_value(value) and is_empty_value(param.default)): if value != param.default and not ignore_default_value: @@ -97,47 +98,50 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: return str(obj) + def from_serializable_dict(data: Any) -> Any: """ Recursively convert a serializable dictionary back to an object instance. 
""" if data is None: return None - + # Handle basic types if isinstance(data, (str, int, float, bool)): return data - + # Handle typed data if isinstance(data, dict) and "type" in data: # Handle plain dictionaries if data["type"] == "dict": return {k: from_serializable_dict(v) for k, v in data["value"].items()} - + # Import from crawl4ai for class instances import crawl4ai + cls = getattr(crawl4ai, data["type"]) - + # Handle Enum if issubclass(cls, Enum): return cls(data["params"]) - + # Handle class instances constructor_args = { k: from_serializable_dict(v) for k, v in data["params"].items() } return cls(**constructor_args) - + # Handle lists if isinstance(data, list): return [from_serializable_dict(item) for item in data] - + # Handle raw dictionaries (legacy support) if isinstance(data, dict): return {k: from_serializable_dict(v) for k, v in data.items()} - + return data - + + def is_empty_value(value: Any) -> bool: """Check if a value is effectively empty/null.""" if value is None: @@ -146,7 +150,8 @@ def is_empty_value(value: Any) -> bool: return True return False -class BrowserConfig(): + +class BrowserConfig: """ Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy. @@ -224,7 +229,7 @@ class BrowserConfig(): viewport: dict = None, accept_downloads: bool = False, downloads_path: str = None, - storage_state : Union[str, dict, None]=None, + storage_state: Union[str, dict, None] = None, ignore_https_errors: bool = True, java_script_enabled: bool = True, sleep_on_close: bool = False, @@ -288,7 +293,7 @@ class BrowserConfig(): ) else: pass - + self.browser_hint = UAGen.generate_client_hints(self.user_agent) self.headers.setdefault("sec-ch-ua", self.browser_hint) @@ -364,10 +369,10 @@ class BrowserConfig(): def clone(self, **kwargs): """Create a copy of this configuration with updated values. 
- + Args: **kwargs: Key-value pairs of configuration options to update - + Returns: BrowserConfig: A new instance with the specified updates """ @@ -381,24 +386,33 @@ class BrowserConfig(): return to_serializable_dict(self) @staticmethod - def load( data: dict) -> "BrowserConfig": + def load(data: dict) -> "BrowserConfig": # Deserialize the object from a dictionary - config = from_serializable_dict(data) + config = from_serializable_dict(data) if isinstance(config, BrowserConfig): return config return BrowserConfig.from_kwargs(config) -class HTTPCrawlerConfig(): +class HTTPCrawlerConfig: """HTTP-specific crawler configuration""" + method: str = "GET" headers: Optional[Dict[str, str]] = None data: Optional[Dict[str, Any]] = None - json: Optional[Dict[str, Any]] = None + json: Optional[Dict[str, Any]] = None follow_redirects: bool = True verify_ssl: bool = True - def __init__(self, method: str = "GET", headers: Optional[Dict[str, str]] = None, data: Optional[Dict[str, Any]] = None, json: Optional[Dict[str, Any]] = None, follow_redirects: bool = True, verify_ssl: bool = True): + def __init__( + self, + method: str = "GET", + headers: Optional[Dict[str, str]] = None, + data: Optional[Dict[str, Any]] = None, + json: Optional[Dict[str, Any]] = None, + follow_redirects: bool = True, + verify_ssl: bool = True, + ): self.method = method self.headers = headers self.data = data @@ -426,23 +440,23 @@ class HTTPCrawlerConfig(): "follow_redirects": self.follow_redirects, "verify_ssl": self.verify_ssl, } - + def clone(self, **kwargs): """Create a copy of this configuration with updated values. 
- + Args: **kwargs: Key-value pairs of configuration options to update - + Returns: HTTPCrawlerConfig: A new instance with the specified updates """ config_dict = self.to_dict() config_dict.update(kwargs) return HTTPCrawlerConfig.from_kwargs(config_dict) - + def dump(self) -> dict: return to_serializable_dict(self) - + @staticmethod def load(data: dict) -> "HTTPCrawlerConfig": config = from_serializable_dict(data) @@ -469,7 +483,7 @@ class CrawlerRunConfig(): Attributes: # Deep Crawl Parameters deep_crawl_strategy (DeepCrawlStrategy or None): Strategy to use for deep crawling. - + # Content Processing Parameters word_count_threshold (int): Minimum word count threshold before processing content. Default: MIN_WORD_THRESHOLD (typically 200). @@ -606,20 +620,20 @@ class CrawlerRunConfig(): data (dict): Data to send in the request body, when using AsyncHTTPCrwalerStrategy. Default: None. json (dict): JSON data to send in the request body, when using AsyncHTTPCrwalerStrategy. - + # Connection Parameters stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many. Default: False. - + check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False - Default: False. - user_agent (str): Custom User-Agent string to use. + Default: False. + user_agent (str): Custom User-Agent string to use. Default: None. - user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is. + user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is. Default: None. user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. Default: None. 
- + url: str = None # This is not a compulsory parameter """ @@ -700,7 +714,6 @@ class CrawlerRunConfig(): user_agent_generator_config: dict = {}, # Deep Crawl Parameters deep_crawl_strategy: Optional[DeepCrawlStrategy] = None, - ): # TODO: Planning to set properties dynamically based on the __init__ signature self.url = url @@ -810,7 +823,6 @@ class CrawlerRunConfig(): if self.chunking_strategy is None: self.chunking_strategy = RegexChunking() - # Deep Crawl Parameters self.deep_crawl_strategy = deep_crawl_strategy @@ -918,7 +930,6 @@ class CrawlerRunConfig(): user_agent_generator_config=kwargs.get("user_agent_generator_config", {}), # Deep Crawl Parameters deep_crawl_strategy=kwargs.get("deep_crawl_strategy"), - url=kwargs.get("url"), ) @@ -930,7 +941,7 @@ class CrawlerRunConfig(): @staticmethod def load(data: dict) -> "CrawlerRunConfig": # Deserialize the object from a dictionary - config = from_serializable_dict(data) + config = from_serializable_dict(data) if isinstance(config, CrawlerRunConfig): return config return CrawlerRunConfig.from_kwargs(config) @@ -1006,18 +1017,18 @@ class CrawlerRunConfig(): def clone(self, **kwargs): """Create a copy of this configuration with updated values. 
- + Args: **kwargs: Key-value pairs of configuration options to update - + Returns: CrawlerRunConfig: A new instance with the specified updates - + Example: ```python # Create a new config with streaming enabled stream_config = config.clone(stream=True) - + # Create a new config with multiple updates new_config = config.clone( stream=True, @@ -1031,3 +1042,50 @@ class CrawlerRunConfig(): return CrawlerRunConfig.from_kwargs(config_dict) +class LlmConfig: + def __init__( + self, + provider: str = DEFAULT_PROVIDER, + api_token: Optional[str] = None, + base_url: Optional[str] = None, + ): + """Configuaration class for LLM provider and API token.""" + self.provider = provider + if api_token and not api_token.startswith("env:"): + self.api_token = api_token + elif api_token and api_token.startswith("env:"): + self.api_token = os.getenv(api_token[4:]) + else: + self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv( + "OPENAI_API_KEY" + ) + self.base_url = base_url + + + @staticmethod + def from_kwargs(kwargs: dict) -> "LlmConfig": + return LlmConfig( + provider=kwargs.get("provider", DEFAULT_PROVIDER), + api_token=kwargs.get("api_token"), + base_url=kwargs.get("base_url"), + ) + + def to_dict(self): + return { + "provider": self.provider, + "api_token": self.api_token, + "base_url": self.base_url + } + + def clone(self, **kwargs): + """Create a copy of this configuration with updated values. 
+ + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + LLMConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return LlmConfig.from_kwargs(config_dict) diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 51fe4434..790ba6d0 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -21,6 +21,12 @@ PROVIDER_MODELS = { "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"), "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"), "anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"), + "gemini/gemini-pro": os.getenv("GEMINI_API_KEY"), + 'gemini/gemini-1.5-pro': os.getenv("GEMINI_API_KEY"), + 'gemini/gemini-2.0-flash': os.getenv("GEMINI_API_KEY"), + 'gemini/gemini-2.0-flash-exp': os.getenv("GEMINI_API_KEY"), + 'gemini/gemini-2.0-flash-lite-preview-02-05': os.getenv("GEMINI_API_KEY"), + "deepseek/deepseek-chat": os.getenv("DEEPSEEK_API_KEY"), } # Chunk token threshold diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 55c60e8e..06c09eba 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -1,3 +1,4 @@ +import inspect import re import time from bs4 import BeautifulSoup, Tag @@ -5,7 +6,16 @@ from typing import List, Tuple, Dict, Optional from rank_bm25 import BM25Okapi from collections import deque from bs4 import NavigableString, Comment -from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data, merge_chunks + +from .utils import ( + clean_tokens, + perform_completion_with_backoff, + escape_json_string, + sanitize_html, + get_home_folder, + extract_xml_data, + merge_chunks, +) from abc import ABC, abstractmethod import math from snowballstemmer import stemmer @@ -20,10 +30,16 @@ from concurrent.futures import ThreadPoolExecutor from .async_logger import AsyncLogger, 
LogLevel from colorama import Fore, Style + class RelevantContentFilter(ABC): """Abstract base class for content filtering strategies""" - def __init__(self, user_query: str = None, verbose: bool = False, logger: Optional[AsyncLogger] = None): + def __init__( + self, + user_query: str = None, + verbose: bool = False, + logger: Optional[AsyncLogger] = None, + ): """ Initializes the RelevantContentFilter class with optional user query. @@ -362,6 +378,7 @@ class RelevantContentFilter(ABC): except Exception: return str(tag) # Fallback to original if anything fails + class BM25ContentFilter(RelevantContentFilter): """ Content filtering using BM25 algorithm with priority tag handling. @@ -504,6 +521,7 @@ class BM25ContentFilter(RelevantContentFilter): return [self.clean_element(tag) for _, _, tag in selected_candidates] + class PruningContentFilter(RelevantContentFilter): """ Content filtering using pruning algorithm with dynamic threshold. @@ -750,13 +768,21 @@ class PruningContentFilter(RelevantContentFilter): class_id_score -= 0.5 return class_id_score + class LLMContentFilter(RelevantContentFilter): """Content filtering using LLMs to generate relevant markdown.""" + _UNWANTED_PROPS = { + 'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")', + 'api_token' : 'Instead, use llmConfig=LlMConfig(api_token="...")', + 'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")', + 'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")', + } def __init__( self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, + llmConfig: "LlmConfig" = None, instruction: str = None, chunk_token_threshold: int = int(1e9), overlap_rate: float = OVERLAP_RATE, @@ -768,15 +794,13 @@ class LLMContentFilter(RelevantContentFilter): # chunk_mode: str = "char", verbose: bool = False, logger: Optional[AsyncLogger] = None, - ignore_cache: bool = False, + ignore_cache: bool = True, ): super().__init__(None) self.provider = provider - self.api_token = ( - api_token - 
or PROVIDER_MODELS.get(provider, "no-token") - or os.getenv("OPENAI_API_KEY") - ) + self.api_token = api_token + self.base_url = base_url or api_base + self.llmConfig = llmConfig self.instruction = instruction self.chunk_token_threshold = chunk_token_threshold self.overlap_rate = overlap_rate @@ -785,12 +809,10 @@ class LLMContentFilter(RelevantContentFilter): # self.char_token_rate = char_token_rate or word_token_rate / 5 # self.token_rate = word_token_rate if chunk_mode == "word" else self.char_token_rate self.token_rate = word_token_rate or WORD_TOKEN_RATE - self.base_url = base_url - self.api_base = api_base or base_url self.extra_args = extra_args or {} self.ignore_cache = ignore_cache self.verbose = verbose - + # Setup logger with custom styling for LLM operations if logger: self.logger = logger @@ -801,19 +823,31 @@ class LLMContentFilter(RelevantContentFilter): **AsyncLogger.DEFAULT_ICONS, "LLM": "★", # Star for LLM operations "CHUNK": "◈", # Diamond for chunks - "CACHE": "⚡", # Lightning for cache operations + "CACHE": "⚡", # Lightning for cache operations }, colors={ **AsyncLogger.DEFAULT_COLORS, - LogLevel.INFO: Fore.MAGENTA + Style.DIM, # Dimmed purple for LLM ops - } + LogLevel.INFO: Fore.MAGENTA + + Style.DIM, # Dimmed purple for LLM ops + }, ) else: self.logger = None - + self.usages = [] self.total_usage = TokenUsage() + + def __setattr__(self, name, value): + """Handle attribute setting.""" + # TODO: Planning to set properties dynamically based on the __init__ signature + sig = inspect.signature(self.__init__) + all_params = sig.parameters # Dictionary of parameter names and their details + if name in self._UNWANTED_PROPS and value is not all_params[name].default: + raise AttributeError(f"Setting '{name}' is deprecated. 
{self._UNWANTED_PROPS[name]}") + + super().__setattr__(name, value) + def _get_cache_key(self, html: str, instruction: str) -> str: """Generate a unique cache key based on HTML and instruction""" content = f"{html}{instruction}" @@ -823,14 +857,12 @@ class LLMContentFilter(RelevantContentFilter): """Split text into chunks with overlap using char or word mode.""" ov = int(self.chunk_token_threshold * self.overlap_rate) sections = merge_chunks( - docs = [text], - target_size= self.chunk_token_threshold, + docs=[text], + target_size=self.chunk_token_threshold, overlap=ov, - word_token_ratio=self.word_token_rate + word_token_ratio=self.word_token_rate, ) return sections - - def filter_content(self, html: str, ignore_cache: bool = True) -> List[str]: if not html or not isinstance(html, str): @@ -838,10 +870,10 @@ class LLMContentFilter(RelevantContentFilter): if self.logger: self.logger.info( - "Starting LLM markdown content filtering process", + "Starting LLM markdown content filtering process", tag="LLM", - params={"provider": self.provider}, - colors={"provider": Fore.CYAN} + params={"provider": self.llmConfig.provider}, + colors={"provider": Fore.CYAN}, ) # Cache handling @@ -857,47 +889,47 @@ class LLMContentFilter(RelevantContentFilter): if self.logger: self.logger.info("Found cached markdown result", tag="CACHE") try: - with cache_file.open('r') as f: + with cache_file.open("r") as f: cached_data = json.load(f) - usage = TokenUsage(**cached_data['usage']) + usage = TokenUsage(**cached_data["usage"]) self.usages.append(usage) self.total_usage.completion_tokens += usage.completion_tokens self.total_usage.prompt_tokens += usage.prompt_tokens self.total_usage.total_tokens += usage.total_tokens - return cached_data['blocks'] + return cached_data["blocks"] except Exception as e: if self.logger: - self.logger.error(f"LLM markdown: Cache read error: {str(e)}", tag="CACHE") + self.logger.error( + f"LLM markdown: Cache read error: {str(e)}", tag="CACHE" + ) # Split into 
chunks html_chunks = self._merge_chunks(html) if self.logger: self.logger.info( - "LLM markdown: Split content into {chunk_count} chunks", + "LLM markdown: Split content into {chunk_count} chunks", tag="CHUNK", params={"chunk_count": len(html_chunks)}, - colors={"chunk_count": Fore.YELLOW} + colors={"chunk_count": Fore.YELLOW}, ) - + start_time = time.time() - + # Process chunks in parallel with ThreadPoolExecutor(max_workers=4) as executor: futures = [] for i, chunk in enumerate(html_chunks): if self.logger: self.logger.debug( - "LLM markdown: Processing chunk {chunk_num}/{total_chunks}", + "LLM markdown: Processing chunk {chunk_num}/{total_chunks}", tag="CHUNK", - params={ - "chunk_num": i + 1, - "total_chunks": len(html_chunks) - } + params={"chunk_num": i + 1, "total_chunks": len(html_chunks)}, ) prompt_variables = { "HTML": escape_json_string(sanitize_html(chunk)), - "REQUEST": self.instruction or "Convert this HTML into clean, relevant markdown, removing any noise or irrelevant content." 
+ "REQUEST": self.instruction + or "Convert this HTML into clean, relevant markdown, removing any noise or irrelevant content.", } prompt = PROMPT_FILTER_CONTENT @@ -905,95 +937,96 @@ class LLMContentFilter(RelevantContentFilter): prompt = prompt.replace("{" + var + "}", value) def _proceed_with_chunk( - provider: str, - prompt: str, - api_token: str, - base_url: Optional[str] = None, - extra_args: Dict = {} - ) -> List[str]: + provider: str, + prompt: str, + api_token: str, + base_url: Optional[str] = None, + extra_args: Dict = {}, + ) -> List[str]: if self.logger: self.logger.info( - "LLM Markdown: Processing chunk {chunk_num}", + "LLM Markdown: Processing chunk {chunk_num}", tag="CHUNK", - params={"chunk_num": i + 1} + params={"chunk_num": i + 1}, ) return perform_completion_with_backoff( provider, prompt, api_token, base_url=base_url, - extra_args=extra_args + extra_args=extra_args, ) future = executor.submit( _proceed_with_chunk, - self.provider, + self.llmConfig.provider, prompt, - self.api_token, - self.api_base, - self.extra_args + self.llmConfig.api_token, + self.llmConfig.base_url, + self.extra_args, ) futures.append((i, future)) - # Collect results in order ordered_results = [] for i, future in sorted(futures): try: response = future.result() - + # Track usage usage = TokenUsage( completion_tokens=response.usage.completion_tokens, prompt_tokens=response.usage.prompt_tokens, total_tokens=response.usage.total_tokens, - completion_tokens_details=response.usage.completion_tokens_details.__dict__ - if response.usage.completion_tokens_details else {}, - prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ - if response.usage.prompt_tokens_details else {}, + completion_tokens_details=( + response.usage.completion_tokens_details.__dict__ + if response.usage.completion_tokens_details + else {} + ), + prompt_tokens_details=( + response.usage.prompt_tokens_details.__dict__ + if response.usage.prompt_tokens_details + else {} + ), ) 
self.usages.append(usage) self.total_usage.completion_tokens += usage.completion_tokens self.total_usage.prompt_tokens += usage.prompt_tokens self.total_usage.total_tokens += usage.total_tokens - blocks = extract_xml_data(["content"], response.choices[0].message.content)["content"] + blocks = extract_xml_data( + ["content"], response.choices[0].message.content + )["content"] if blocks: ordered_results.append(blocks) if self.logger: self.logger.success( - "LLM markdown: Successfully processed chunk {chunk_num}", + "LLM markdown: Successfully processed chunk {chunk_num}", tag="CHUNK", - params={"chunk_num": i + 1} + params={"chunk_num": i + 1}, ) except Exception as e: if self.logger: self.logger.error( - "LLM markdown: Error processing chunk {chunk_num}: {error}", + "LLM markdown: Error processing chunk {chunk_num}: {error}", tag="CHUNK", - params={ - "chunk_num": i + 1, - "error": str(e) - } + params={"chunk_num": i + 1, "error": str(e)}, ) end_time = time.time() if self.logger: self.logger.success( - "LLM markdown: Completed processing in {time:.2f}s", + "LLM markdown: Completed processing in {time:.2f}s", tag="LLM", params={"time": end_time - start_time}, - colors={"time": Fore.YELLOW} + colors={"time": Fore.YELLOW}, ) result = ordered_results if ordered_results else [] # Cache the final result - cache_data = { - 'blocks': result, - 'usage': self.total_usage.__dict__ - } - with cache_file.open('w') as f: + cache_data = {"blocks": result, "usage": self.total_usage.__dict__} + with cache_file.open("w") as f: json.dump(cache_data, f) if self.logger: self.logger.info("Cached results for future use", tag="CACHE") @@ -1017,4 +1050,4 @@ class LLMContentFilter(RelevantContentFilter): print( f"{i:<10} {usage.completion_tokens:>12,} " f"{usage.prompt_tokens:>12,} {usage.total_tokens:>12,}" - ) \ No newline at end of file + ) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index f7abab17..ebd826a2 100644 --- a/crawl4ai/extraction_strategy.py 
+++ b/crawl4ai/extraction_strategy.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +import inspect from typing import Any, List, Dict, Optional from concurrent.futures import ThreadPoolExecutor, as_completed import json @@ -496,20 +497,26 @@ class LLMExtractionStrategy(ExtractionStrategy): usages: List of individual token usages. total_usage: Accumulated token usage. """ - + _UNWANTED_PROPS = { + 'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")', + 'api_token' : 'Instead, use llmConfig=LlMConfig(api_token="...")', + 'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")', + 'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")', + } def __init__( self, + llmConfig: 'LLMConfig' = None, + instruction: str = None, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, - instruction: str = None, + base_url: str = None, + api_base: str = None, schema: Dict = None, extraction_type="block", chunk_token_threshold=CHUNK_TOKEN_THRESHOLD, overlap_rate=OVERLAP_RATE, word_token_rate=WORD_TOKEN_RATE, apply_chunking=True, - api_base: str =None, - base_url: str =None, input_format: str = "markdown", verbose=False, **kwargs, @@ -518,6 +525,7 @@ class LLMExtractionStrategy(ExtractionStrategy): Initialize the strategy with clustering parameters. Args: + llmConfig: The LLM configuration object. provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". api_token: The API token for the provider. instruction: The instruction to use for the LLM model. 
@@ -536,41 +544,39 @@ class LLMExtractionStrategy(ExtractionStrategy): """ super().__init__( input_format=input_format, **kwargs) + self.llmConfig = llmConfig self.provider = provider - if api_token and not api_token.startswith("env:"): - self.api_token = api_token - elif api_token and api_token.startswith("env:"): - self.api_token = os.getenv(api_token[4:]) - else: - self.api_token = ( - PROVIDER_MODELS.get(provider, "no-token") - or os.getenv("OPENAI_API_KEY") - ) + self.api_token = api_token + self.base_url = base_url + self.api_base = api_base self.instruction = instruction self.extract_type = extraction_type self.schema = schema if schema: self.extract_type = "schema" - self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD self.overlap_rate = overlap_rate self.word_token_rate = word_token_rate self.apply_chunking = apply_chunking - self.base_url = base_url - self.api_base = api_base or base_url self.extra_args = kwargs.get("extra_args", {}) if not self.apply_chunking: self.chunk_token_threshold = 1e9 - self.verbose = verbose self.usages = [] # Store individual usages self.total_usage = TokenUsage() # Accumulated usage - if not self.api_token: - raise ValueError( - "API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable." - ) + + def __setattr__(self, name, value): + """Handle attribute setting.""" + # TODO: Planning to set properties dynamically based on the __init__ signature + sig = inspect.signature(self.__init__) + all_params = sig.parameters # Dictionary of parameter names and their details + if name in self._UNWANTED_PROPS and value is not all_params[name].default: + raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}") + + super().__setattr__(name, value) + def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]: """ Extract meaningful blocks or chunks from the given HTML using an LLM. 
@@ -603,7 +609,7 @@ class LLMExtractionStrategy(ExtractionStrategy): prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION if self.extract_type == "schema" and self.schema: - variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) + variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) # if type of self.schema is dict else self.schema prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION for variable in variable_values: @@ -612,10 +618,10 @@ class LLMExtractionStrategy(ExtractionStrategy): ) response = perform_completion_with_backoff( - self.provider, + self.llmConfig.provider, prompt_with_variables, - self.api_token, - base_url=self.api_base or self.base_url, + self.llmConfig.api_token, + base_url=self.llmConfig.base_url, extra_args=self.extra_args, ) # , json_response=self.extract_type == "schema") # Track usage @@ -695,7 +701,7 @@ class LLMExtractionStrategy(ExtractionStrategy): overlap=int(self.chunk_token_threshold * self.overlap_rate), ) extracted_content = [] - if self.provider.startswith("groq/"): + if self.llmConfig.provider.startswith("groq/"): # Sequential processing with a delay for ix, section in enumerate(merged_sections): extract_func = partial(self.extract, url) @@ -1036,14 +1042,20 @@ class JsonElementExtractionStrategy(ExtractionStrategy): """Get attribute value from element""" pass + _GENERATE_SCHEMA_UNWANTED_PROPS = { + 'provider': 'Instead, use llmConfig=LlmConfig(provider="...")', + 'api_token': 'Instead, use llmConfig=LlMConfig(api_token="...")', + } + @staticmethod def generate_schema( html: str, schema_type: str = "CSS", # or XPATH query: str = None, target_json_example: str = None, - provider: str = "gpt-4o", - api_token: str = os.getenv("OPENAI_API_KEY"), + llmConfig: 'LLMConfig' = None, + provider: str = None, + api_token: str = None, **kwargs ) -> dict: """ @@ -1052,8 +1064,9 @@ class JsonElementExtractionStrategy(ExtractionStrategy): Args: html (str): The HTML content to analyze query (str, optional): 
Natural language description of what data to extract - provider (str): LLM provider to use - api_token (str): API token for LLM provider + provider (str): Legacy Parameter. LLM provider to use + api_token (str): Legacy Parameter. API token for LLM provider + llmConfig (LlmConfig): LLM configuration object prompt (str, optional): Custom prompt template to use **kwargs: Additional args passed to perform_completion_with_backoff @@ -1062,6 +1075,9 @@ class JsonElementExtractionStrategy(ExtractionStrategy): """ from .prompts import JSON_SCHEMA_BUILDER from .utils import perform_completion_with_backoff + for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items(): + if locals()[name] is not None: + raise AttributeError(f"Setting '{name}' is deprecated. {message}") # Use default or custom prompt prompt_template = JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH @@ -1114,10 +1130,10 @@ In this scenario, use your best judgment to generate the schema. Try to maximize try: # Call LLM with backoff handling response = perform_completion_with_backoff( - provider=provider, + provider=llmConfig.provider, prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]), json_response = True, - api_token=api_token, + api_token=llmConfig.api_token, **kwargs ) diff --git a/docs/examples/extraction_strategies_examples.py b/docs/examples/extraction_strategies_examples.py index 658f7521..3e42be6c 100644 --- a/docs/examples/extraction_strategies_examples.py +++ b/docs/examples/extraction_strategies_examples.py @@ -11,6 +11,7 @@ import asyncio import os from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.async_configs import LlmConfig from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, JsonCssExtractionStrategy, @@ -60,22 +61,19 @@ async def main(): # 1. 
LLM Extraction with different input formats markdown_strategy = LLMExtractionStrategy( - provider="openai/gpt-4o-mini", - api_token=os.getenv("OPENAI_API_KEY"), + llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), instruction="Extract product information including name, price, and description", ) html_strategy = LLMExtractionStrategy( input_format="html", - provider="openai/gpt-4o-mini", - api_token=os.getenv("OPENAI_API_KEY"), + llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), instruction="Extract product information from HTML including structured data", ) fit_markdown_strategy = LLMExtractionStrategy( input_format="fit_markdown", - provider="openai/gpt-4o-mini", - api_token=os.getenv("OPENAI_API_KEY"), + llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")), instruction="Extract product information from cleaned markdown", ) diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py index e9e90dd2..72742bd5 100644 --- a/docs/examples/llm_extraction_openai_pricing.py +++ b/docs/examples/llm_extraction_openai_pricing.py @@ -1,3 +1,4 @@ +from crawl4ai.async_configs import LlmConfig from crawl4ai.extraction_strategy import * from crawl4ai.crawler_strategy import * import asyncio @@ -25,8 +26,7 @@ async def main(): word_count_threshold=1, extraction_strategy=LLMExtractionStrategy( # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), - provider="groq/llama-3.1-70b-versatile", - api_token=os.getenv("GROQ_API_KEY"), + llmConfig=LlmConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")), schema=OpenAIModelFee.model_json_schema(), extraction_type="schema", instruction="From the crawled content, extract all mentioned model names along with their " diff --git a/docs/examples/llm_markdown_generator.py b/docs/examples/llm_markdown_generator.py index 
8c673734..f3e18df4 100644 --- a/docs/examples/llm_markdown_generator.py +++ b/docs/examples/llm_markdown_generator.py @@ -1,6 +1,7 @@ import os import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.async_configs import LlmConfig from crawl4ai.content_filter_strategy import LLMContentFilter async def test_llm_filter(): @@ -22,8 +23,7 @@ async def test_llm_filter(): # Initialize LLM filter with focused instruction filter = LLMContentFilter( - provider="openai/gpt-4o", - api_token=os.getenv('OPENAI_API_KEY'), + llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')), instruction=""" Focus on extracting the core educational content about Python classes. Include: @@ -43,8 +43,7 @@ async def test_llm_filter(): ) filter = LLMContentFilter( - provider="openai/gpt-4o", - api_token=os.getenv('OPENAI_API_KEY'), + llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), chunk_token_threshold=2 ** 12 * 2, # 2048 * 2 ignore_cache = True, instruction=""" diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py index b58443bd..6792f7d9 100644 --- a/docs/examples/quickstart_async.config.py +++ b/docs/examples/quickstart_async.config.py @@ -1,5 +1,7 @@ import os, sys +from crawl4ai.async_configs import LlmConfig + sys.path.append( os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) ) @@ -209,8 +211,7 @@ async def extract_structured_data_using_llm( word_count_threshold=1, page_timeout=80000, extraction_strategy=LLMExtractionStrategy( - provider=provider, - api_token=api_token, + llmConfig=LlmConfig(provider=provider,api_token=api_token), schema=OpenAIModelFee.model_json_schema(), extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 1585ebea..a3f11272 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -1,5 +1,7 @@ import os, sys +from crawl4ai.async_configs import LlmConfig + # append parent directory to system path sys.path.append( os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -145,8 +147,7 @@ async def extract_structured_data_using_llm( url="https://openai.com/api/pricing/", word_count_threshold=1, extraction_strategy=LLMExtractionStrategy( - provider=provider, - api_token=api_token, + llmConfig=LlmConfig(provider=provider,api_token=api_token), schema=OpenAIModelFee.model_json_schema(), extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. @@ -569,8 +570,7 @@ async def generate_knowledge_graph(): relationships: List[Relationship] extraction_strategy = LLMExtractionStrategy( - provider="openai/gpt-4o-mini", # Or any other provider, including Ollama and open source models - api_token=os.getenv("OPENAI_API_KEY"), # In case of Ollama just pass "no-token" + llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token" schema=KnowledgeGraph.model_json_schema(), extraction_type="schema", instruction="""Extract entities and relationships from the given text.""", diff --git a/docs/examples/quickstart_sync.py b/docs/examples/quickstart_sync.py index 0248af29..3be1baf0 100644 --- a/docs/examples/quickstart_sync.py +++ b/docs/examples/quickstart_sync.py @@ -1,5 +1,6 @@ import os import time +from crawl4ai.async_configs import LlmConfig from crawl4ai.web_crawler import WebCrawler from crawl4ai.chunking_strategy import * from crawl4ai.extraction_strategy import * @@ -178,7 +179,7 @@ def add_llm_extraction_strategy(crawler): result = crawler.run( 
url="https://www.nbcnews.com/business", extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY") + llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")) ), ) cprint( @@ -197,8 +198,7 @@ def add_llm_extraction_strategy(crawler): result = crawler.run( url="https://www.nbcnews.com/business", extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o", - api_token=os.getenv("OPENAI_API_KEY"), + llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")), instruction="I am interested in only financial news", ), ) @@ -210,8 +210,7 @@ def add_llm_extraction_strategy(crawler): result = crawler.run( url="https://www.nbcnews.com/business", extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o", - api_token=os.getenv("OPENAI_API_KEY"), + llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")), instruction="Extract only content related to technology", ), ) diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index 9db3d767..ed3828c8 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -245,11 +245,32 @@ run_config = CrawlerRunConfig( ) ``` -## 3. Putting It All Together +# 3. **LlmConfig** - Setting up LLM providers +LlmConfig is useful to pass LLM provider config to strategies and functions that rely on LLMs to do extraction, filtering, schema generation etc. Currently it can be used in the following - + +1. LLMExtractionStrategy +2. LLMContentFilter +3. JsonCssExtractionStrategy.generate_schema +4. 
JsonXPathExtractionStrategy.generate_schema + +## 3.1 Parameters +| **Parameter** | **Type / Default** | **What It Does** | +|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------| +| **`provider`** | `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`
*(default: `"openai/gpt-4o-mini"`)* | Which LLM provider to use. +| **`api_token`** |1. Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then, `"GEMINI_API_KEY"` will be read from environment variables
2. API token of LLM provider
eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`
3. Environment variable - use with prefix "env:"
eg: `api_token = "env:GROQ_API_KEY"` | API token to use for the given provider +| **`base_url`** |Optional. Custom API endpoint | If your provider has a custom endpoint + +## 3.2 Example Usage +```python +llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) +``` + +## 4. Putting It All Together - **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent. - **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS. - **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`). +- **Use** `LlmConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema` ```python # Create a modified copy with the clone() method diff --git a/docs/md_v2/api/strategies.md b/docs/md_v2/api/strategies.md index 06b757d4..fbf7a6ee 100644 --- a/docs/md_v2/api/strategies.md +++ b/docs/md_v2/api/strategies.md @@ -131,6 +131,7 @@ OverlappingWindowChunking( ```python from pydantic import BaseModel from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai.async_configs import LlmConfig # Define schema class Article(BaseModel): @@ -140,7 +141,7 @@ class Article(BaseModel): # Create strategy strategy = LLMExtractionStrategy( - provider="ollama/llama2", + llmConfig = LlmConfig(provider="ollama/llama2"), schema=Article.schema(), instruction="Extract article details" ) @@ -197,6 +198,7 @@ result = await crawler.arun( ```python from crawl4ai.chunking_strategy import OverlappingWindowChunking +from crawl4ai.async_configs import LlmConfig # Create chunking strategy chunker = OverlappingWindowChunking( @@ -206,7 +208,7 @@ chunker = OverlappingWindowChunking( # Use with extraction 
strategy strategy = LLMExtractionStrategy( - provider="ollama/llama2", + llmConfig = LlmConfig(provider="ollama/llama2"), chunking_strategy=chunker ) diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index f80bb04a..33ef81ca 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -1,9 +1,10 @@ -# Browser & Crawler Configuration (Quick Overview) +# Browser, Crawler & LLM Configuration (Quick Overview) Crawl4AI’s flexibility stems from two key classes: 1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent). -2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.). +2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.). +3. **`LlmConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.) In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md). @@ -234,13 +235,37 @@ The `clone()` method: --- -## 3. Putting It All Together -In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` depending on each call’s needs: + + + +## 3. LlmConfig Essentials + +### Key fields to note + +1. **`provider`**: +- Which LLM provider to use. 
+- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`
*(default: `"openai/gpt-4o-mini"`)* + +2. **`api_token`**: + - Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then,`"GEMINI_API_KEY"` will be read from environment variables + - API token of LLM provider
eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"` + - Environment variable - use with prefix "env:"
eg: `api_token = "env:GROQ_API_KEY"` + +3. **`base_url`**: + - If your provider has a custom endpoint + +```python +llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) +``` + +## 4. Putting It All Together + +In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LlmConfig` depending on each call’s needs: ```python import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig from crawl4ai.extraction_strategy import JsonCssExtractionStrategy async def main(): @@ -262,8 +287,39 @@ async def main(): } extraction = JsonCssExtractionStrategy(schema) - # 3) Crawler run config: skip cache, use extraction + # 3) Example LLM content filtering + + gemini_config = LlmConfig( + provider="gemini/gemini-1.5-pro", + api_token = "env:GEMINI_API_TOKEN" + ) + + # Initialize LLM filter with specific instruction + filter = LLMContentFilter( + llmConfig=gemini_config, # or your preferred provider + instruction=""" + Focus on extracting the core educational content. + Include: + - Key concepts and explanations + - Important code examples + - Essential technical details + Exclude: + - Navigation elements + - Sidebars + - Footer content + Format the output as clean markdown with proper code blocks and headers. + """, + chunk_token_threshold=500, # Adjust based on your needs + verbose=True + ) + + md_generator = DefaultMarkdownGenerator( + content_filter=filter, + options={"ignore_links": True} + ) + # 4) Crawler run config: skip cache, use extraction run_conf = CrawlerRunConfig( + markdown_generator=md_generator, extraction_strategy=extraction, cache_mode=CacheMode.BYPASS, ) @@ -283,11 +339,11 @@ if __name__ == "__main__": --- -## 4. Next Steps +## 5. 
Next Steps For a **detailed list** of available parameters (including advanced ones), see: -- [BrowserConfig and CrawlerRunConfig Reference](../api/parameters.md) +- [BrowserConfig, CrawlerRunConfig & LlmConfig Reference](../api/parameters.md) You can explore topics like: @@ -298,11 +354,12 @@ You can explore topics like: --- -## 5. Conclusion +## 6. Conclusion -**BrowserConfig** and **CrawlerRunConfig** give you straightforward ways to define: +**BrowserConfig**, **CrawlerRunConfig** and **LlmConfig** give you straightforward ways to define: - **Which** browser to launch, how it should run, and any proxy or user agent needs. - **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc. +- **Which** LLM provider to use, api token, temperature and base url for custom endpoints Use them together for **clear, maintainable** code, and when you need more specialized behavior, check out the advanced parameters in the [reference docs](../api/parameters.md). Happy crawling! \ No newline at end of file diff --git a/docs/md_v2/core/content-selection.md b/docs/md_v2/core/content-selection.md index 0e59d465..9f145852 100644 --- a/docs/md_v2/core/content-selection.md +++ b/docs/md_v2/core/content-selection.md @@ -211,7 +211,7 @@ if __name__ == "__main__": import asyncio import json from pydantic import BaseModel, Field -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig from crawl4ai.extraction_strategy import LLMExtractionStrategy class ArticleData(BaseModel): @@ -220,8 +220,7 @@ class ArticleData(BaseModel): async def main(): llm_strategy = LLMExtractionStrategy( - provider="openai/gpt-4", - api_token="sk-YOUR_API_KEY", + llmConfig = LlmConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY"), schema=ArticleData.schema(), extraction_type="schema", instruction="Extract 'headline' and a short 'summary' from the content." 
diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md index ab8f9b05..0e030abb 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -175,14 +175,13 @@ prune_filter = PruningContentFilter( For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LlmConfig from crawl4ai.content_filter_strategy import LLMContentFilter async def main(): # Initialize LLM filter with specific instruction filter = LLMContentFilter( - provider="openai/gpt-4o", # or your preferred provider - api_token="your-api-token", # or use environment variable + llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable instruction=""" Focus on extracting the core educational content. Include: diff --git a/docs/md_v2/core/quickstart.md b/docs/md_v2/core/quickstart.md index 7b5a2583..9372f863 100644 --- a/docs/md_v2/core/quickstart.md +++ b/docs/md_v2/core/quickstart.md @@ -128,6 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B ```python from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.async_configs import LlmConfig # Generate a schema (one-time cost) html = "

Gaming Laptop

$999.99
" @@ -135,15 +136,13 @@ html = "

Gaming Laptop

$999.99 Dict: """ Recursively convert an object to a serializable dictionary using {type, params} structure @@ -222,7 +224,7 @@ if __name__ == "__main__": config3 = CrawlerRunConfig( markdown_generator=DefaultMarkdownGenerator( content_filter=LLMContentFilter( - provider="openai/gpt-4", + llmConfig = LlmConfig(provider="openai/gpt-4"), instruction="Extract key technical concepts", chunk_token_threshold=2000, overlap_rate=0.1 diff --git a/tests/test_web_crawler.py b/tests/test_web_crawler.py index d6eddfdc..07a380fe 100644 --- a/tests/test_web_crawler.py +++ b/tests/test_web_crawler.py @@ -1,4 +1,5 @@ import unittest, os +from crawl4ai.async_configs import LlmConfig from crawl4ai.web_crawler import WebCrawler from crawl4ai.chunking_strategy import ( RegexChunking, @@ -42,7 +43,7 @@ class TestWebCrawler(unittest.TestCase): word_count_threshold=5, chunking_strategy=FixedLengthWordChunking(chunk_size=100), extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY") + llmConfig=LlmConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY")) ), bypass_cache=True, )