Feat/llm config (#724)

* feature: Add LlmConfig to easily configure and pass LLM configs to different strategies

* pulled in next branch and resolved conflicts

* feat: Add gemini and deepseek providers. Set ignore_cache in the LLM content filter to true by default to avoid confusion

* Refactor: Update LlmConfig in LLMExtractionStrategy class and deprecate old params

* updated tests, docs and readme
This commit is contained in:
Aravind
2025-02-21 13:11:37 +05:30
committed by GitHub
parent 3cb28875c3
commit 2af958e12c
25 changed files with 420 additions and 240 deletions

View File

@@ -407,7 +407,7 @@ if __name__ == "__main__":
```python ```python
import os import os
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@@ -423,7 +423,7 @@ async def main():
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
# Here you can use any provider that Litellm library supports, for instance: ollama/qwen2 # Here you can use any provider that Litellm library supports, for instance: ollama/qwen2
# provider="ollama/qwen2", api_token="no-token", # provider="ollama/qwen2", api_token="no-token",
provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'), llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
schema=OpenAIModelFee.schema(), schema=OpenAIModelFee.schema(),
extraction_type="schema", extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.

View File

@@ -1,13 +1,16 @@
import os
from .config import ( from .config import (
DEFAULT_PROVIDER,
MIN_WORD_THRESHOLD, MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
PROVIDER_MODELS,
SCREENSHOT_HEIGHT_TRESHOLD, SCREENSHOT_HEIGHT_TRESHOLD,
PAGE_TIMEOUT, PAGE_TIMEOUT,
IMAGE_SCORE_THRESHOLD, IMAGE_SCORE_THRESHOLD,
SOCIAL_MEDIA_DOMAINS, SOCIAL_MEDIA_DOMAINS,
) )
from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
from .extraction_strategy import ExtractionStrategy from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy from .markdown_generation_strategy import MarkdownGenerationStrategy
@@ -21,6 +24,7 @@ import inspect
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
from enum import Enum from enum import Enum
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
""" """
Recursively convert an object to a serializable dictionary using {type, params} structure Recursively convert an object to a serializable dictionary using {type, params} structure
@@ -35,13 +39,10 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
# Handle Enum # Handle Enum
if isinstance(obj, Enum): if isinstance(obj, Enum):
return { return {"type": obj.__class__.__name__, "params": obj.value}
"type": obj.__class__.__name__,
"params": obj.value
}
# Handle datetime objects # Handle datetime objects
if hasattr(obj, 'isoformat'): if hasattr(obj, "isoformat"):
return obj.isoformat() return obj.isoformat()
# Handle lists, tuples, and sets, and basically any iterable # Handle lists, tuples, and sets, and basically any iterable
@@ -56,13 +57,13 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
if isinstance(obj, dict): if isinstance(obj, dict):
return { return {
"type": "dict", # Mark as plain dictionary "type": "dict", # Mark as plain dictionary
"value": {str(k): to_serializable_dict(v) for k, v in obj.items()} "value": {str(k): to_serializable_dict(v) for k, v in obj.items()},
} }
_type = obj.__class__.__name__ _type = obj.__class__.__name__
# Handle class instances # Handle class instances
if hasattr(obj, '__class__'): if hasattr(obj, "__class__"):
# Get constructor signature # Get constructor signature
sig = inspect.signature(obj.__class__.__init__) sig = inspect.signature(obj.__class__.__init__)
params = sig.parameters params = sig.parameters
@@ -70,7 +71,7 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
# Get current values # Get current values
current_values = {} current_values = {}
for name, param in params.items(): for name, param in params.items():
if name == 'self': if name == "self":
continue continue
value = getattr(obj, name, param.default) value = getattr(obj, name, param.default)
@@ -97,6 +98,7 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
return str(obj) return str(obj)
def from_serializable_dict(data: Any) -> Any: def from_serializable_dict(data: Any) -> Any:
""" """
Recursively convert a serializable dictionary back to an object instance. Recursively convert a serializable dictionary back to an object instance.
@@ -116,6 +118,7 @@ def from_serializable_dict(data: Any) -> Any:
# Import from crawl4ai for class instances # Import from crawl4ai for class instances
import crawl4ai import crawl4ai
cls = getattr(crawl4ai, data["type"]) cls = getattr(crawl4ai, data["type"])
# Handle Enum # Handle Enum
@@ -138,6 +141,7 @@ def from_serializable_dict(data: Any) -> Any:
return data return data
def is_empty_value(value: Any) -> bool: def is_empty_value(value: Any) -> bool:
"""Check if a value is effectively empty/null.""" """Check if a value is effectively empty/null."""
if value is None: if value is None:
@@ -146,7 +150,8 @@ def is_empty_value(value: Any) -> bool:
return True return True
return False return False
class BrowserConfig():
class BrowserConfig:
""" """
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy. Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
@@ -224,7 +229,7 @@ class BrowserConfig():
viewport: dict = None, viewport: dict = None,
accept_downloads: bool = False, accept_downloads: bool = False,
downloads_path: str = None, downloads_path: str = None,
storage_state : Union[str, dict, None]=None, storage_state: Union[str, dict, None] = None,
ignore_https_errors: bool = True, ignore_https_errors: bool = True,
java_script_enabled: bool = True, java_script_enabled: bool = True,
sleep_on_close: bool = False, sleep_on_close: bool = False,
@@ -381,7 +386,7 @@ class BrowserConfig():
return to_serializable_dict(self) return to_serializable_dict(self)
@staticmethod @staticmethod
def load( data: dict) -> "BrowserConfig": def load(data: dict) -> "BrowserConfig":
# Deserialize the object from a dictionary # Deserialize the object from a dictionary
config = from_serializable_dict(data) config = from_serializable_dict(data)
if isinstance(config, BrowserConfig): if isinstance(config, BrowserConfig):
@@ -389,8 +394,9 @@ class BrowserConfig():
return BrowserConfig.from_kwargs(config) return BrowserConfig.from_kwargs(config)
class HTTPCrawlerConfig(): class HTTPCrawlerConfig:
"""HTTP-specific crawler configuration""" """HTTP-specific crawler configuration"""
method: str = "GET" method: str = "GET"
headers: Optional[Dict[str, str]] = None headers: Optional[Dict[str, str]] = None
data: Optional[Dict[str, Any]] = None data: Optional[Dict[str, Any]] = None
@@ -398,7 +404,15 @@ class HTTPCrawlerConfig():
follow_redirects: bool = True follow_redirects: bool = True
verify_ssl: bool = True verify_ssl: bool = True
def __init__(self, method: str = "GET", headers: Optional[Dict[str, str]] = None, data: Optional[Dict[str, Any]] = None, json: Optional[Dict[str, Any]] = None, follow_redirects: bool = True, verify_ssl: bool = True): def __init__(
self,
method: str = "GET",
headers: Optional[Dict[str, str]] = None,
data: Optional[Dict[str, Any]] = None,
json: Optional[Dict[str, Any]] = None,
follow_redirects: bool = True,
verify_ssl: bool = True,
):
self.method = method self.method = method
self.headers = headers self.headers = headers
self.data = data self.data = data
@@ -700,7 +714,6 @@ class CrawlerRunConfig():
user_agent_generator_config: dict = {}, user_agent_generator_config: dict = {},
# Deep Crawl Parameters # Deep Crawl Parameters
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None, deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
): ):
# TODO: Planning to set properties dynamically based on the __init__ signature # TODO: Planning to set properties dynamically based on the __init__ signature
self.url = url self.url = url
@@ -810,7 +823,6 @@ class CrawlerRunConfig():
if self.chunking_strategy is None: if self.chunking_strategy is None:
self.chunking_strategy = RegexChunking() self.chunking_strategy = RegexChunking()
# Deep Crawl Parameters # Deep Crawl Parameters
self.deep_crawl_strategy = deep_crawl_strategy self.deep_crawl_strategy = deep_crawl_strategy
@@ -918,7 +930,6 @@ class CrawlerRunConfig():
user_agent_generator_config=kwargs.get("user_agent_generator_config", {}), user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
# Deep Crawl Parameters # Deep Crawl Parameters
deep_crawl_strategy=kwargs.get("deep_crawl_strategy"), deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
url=kwargs.get("url"), url=kwargs.get("url"),
) )
@@ -1031,3 +1042,50 @@ class CrawlerRunConfig():
return CrawlerRunConfig.from_kwargs(config_dict) return CrawlerRunConfig.from_kwargs(config_dict)
class LlmConfig:
    """Holds the provider name, API token and base URL used to reach an LLM."""

    def __init__(
        self,
        provider: str = DEFAULT_PROVIDER,
        api_token: Optional[str] = None,
        base_url: Optional[str] = None,
    ):
        """Configuration class for LLM provider and API token.

        Args:
            provider: Provider identifier; also used as the lookup key in
                PROVIDER_MODELS when no token is given.
            api_token: Literal token, or ``"env:NAME"`` to read the token from
                the environment variable ``NAME``. When empty or None, the
                token is taken from PROVIDER_MODELS for this provider
                (``"no-token"`` for unknown providers), falling back to the
                ``OPENAI_API_KEY`` environment variable if that entry is empty.
            base_url: Optional base URL, stored as given.
        """
        self.provider = provider

        if not api_token:
            # No token supplied: use the per-provider entry, then OPENAI_API_KEY.
            resolved_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv(
                "OPENAI_API_KEY"
            )
        elif api_token.startswith("env:"):
            # "env:NAME" means: read the token from environment variable NAME.
            resolved_token = os.getenv(api_token[4:])
        else:
            resolved_token = api_token

        self.api_token = resolved_token
        self.base_url = base_url

    @staticmethod
    def from_kwargs(kwargs: dict) -> "LlmConfig":
        """Build an LlmConfig from a dict; missing keys use the constructor defaults."""
        provider = kwargs.get("provider", DEFAULT_PROVIDER)
        api_token = kwargs.get("api_token")
        base_url = kwargs.get("base_url")
        return LlmConfig(provider=provider, api_token=api_token, base_url=base_url)

    def to_dict(self):
        """Return the current provider, api_token and base_url as a plain dict."""
        return {
            field: getattr(self, field)
            for field in ("provider", "api_token", "base_url")
        }

    def clone(self, **kwargs):
        """Create a copy of this configuration with updated values.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            LlmConfig: A new instance with the specified updates
        """
        # Overrides win over the current values; the result is rebuilt through
        # from_kwargs so the constructor's token handling runs again.
        merged = {**self.to_dict(), **kwargs}
        return LlmConfig.from_kwargs(merged)

View File

@@ -21,6 +21,12 @@ PROVIDER_MODELS = {
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"), "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"), "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"), "anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"),
"gemini/gemini-pro": os.getenv("GEMINI_API_KEY"),
'gemini/gemini-1.5-pro': os.getenv("GEMINI_API_KEY"),
'gemini/gemini-2.0-flash': os.getenv("GEMINI_API_KEY"),
'gemini/gemini-2.0-flash-exp': os.getenv("GEMINI_API_KEY"),
'gemini/gemini-2.0-flash-lite-preview-02-05': os.getenv("GEMINI_API_KEY"),
"deepseek/deepseek-chat": os.getenv("DEEPSEEK_API_KEY"),
} }
# Chunk token threshold # Chunk token threshold

View File

@@ -1,3 +1,4 @@
import inspect
import re import re
import time import time
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
@@ -5,7 +6,16 @@ from typing import List, Tuple, Dict, Optional
from rank_bm25 import BM25Okapi from rank_bm25 import BM25Okapi
from collections import deque from collections import deque
from bs4 import NavigableString, Comment from bs4 import NavigableString, Comment
from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data, merge_chunks
from .utils import (
clean_tokens,
perform_completion_with_backoff,
escape_json_string,
sanitize_html,
get_home_folder,
extract_xml_data,
merge_chunks,
)
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
import math import math
from snowballstemmer import stemmer from snowballstemmer import stemmer
@@ -20,10 +30,16 @@ from concurrent.futures import ThreadPoolExecutor
from .async_logger import AsyncLogger, LogLevel from .async_logger import AsyncLogger, LogLevel
from colorama import Fore, Style from colorama import Fore, Style
class RelevantContentFilter(ABC): class RelevantContentFilter(ABC):
"""Abstract base class for content filtering strategies""" """Abstract base class for content filtering strategies"""
def __init__(self, user_query: str = None, verbose: bool = False, logger: Optional[AsyncLogger] = None): def __init__(
self,
user_query: str = None,
verbose: bool = False,
logger: Optional[AsyncLogger] = None,
):
""" """
Initializes the RelevantContentFilter class with optional user query. Initializes the RelevantContentFilter class with optional user query.
@@ -362,6 +378,7 @@ class RelevantContentFilter(ABC):
except Exception: except Exception:
return str(tag) # Fallback to original if anything fails return str(tag) # Fallback to original if anything fails
class BM25ContentFilter(RelevantContentFilter): class BM25ContentFilter(RelevantContentFilter):
""" """
Content filtering using BM25 algorithm with priority tag handling. Content filtering using BM25 algorithm with priority tag handling.
@@ -504,6 +521,7 @@ class BM25ContentFilter(RelevantContentFilter):
return [self.clean_element(tag) for _, _, tag in selected_candidates] return [self.clean_element(tag) for _, _, tag in selected_candidates]
class PruningContentFilter(RelevantContentFilter): class PruningContentFilter(RelevantContentFilter):
""" """
Content filtering using pruning algorithm with dynamic threshold. Content filtering using pruning algorithm with dynamic threshold.
@@ -750,13 +768,21 @@ class PruningContentFilter(RelevantContentFilter):
class_id_score -= 0.5 class_id_score -= 0.5
return class_id_score return class_id_score
class LLMContentFilter(RelevantContentFilter): class LLMContentFilter(RelevantContentFilter):
"""Content filtering using LLMs to generate relevant markdown.""" """Content filtering using LLMs to generate relevant markdown."""
# Deprecated constructor arguments, mapped to the migration hint that
# __setattr__ appends to its AttributeError message when one of them is set.
_UNWANTED_PROPS = {
    'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")',
    'api_token' : 'Instead, use llmConfig=LlmConfig(api_token="...")',
    'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
    'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
}
def __init__( def __init__(
self, self,
provider: str = DEFAULT_PROVIDER, provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None, api_token: Optional[str] = None,
llmConfig: "LlmConfig" = None,
instruction: str = None, instruction: str = None,
chunk_token_threshold: int = int(1e9), chunk_token_threshold: int = int(1e9),
overlap_rate: float = OVERLAP_RATE, overlap_rate: float = OVERLAP_RATE,
@@ -768,15 +794,13 @@ class LLMContentFilter(RelevantContentFilter):
# chunk_mode: str = "char", # chunk_mode: str = "char",
verbose: bool = False, verbose: bool = False,
logger: Optional[AsyncLogger] = None, logger: Optional[AsyncLogger] = None,
ignore_cache: bool = False, ignore_cache: bool = True,
): ):
super().__init__(None) super().__init__(None)
self.provider = provider self.provider = provider
self.api_token = ( self.api_token = api_token
api_token self.base_url = base_url or api_base
or PROVIDER_MODELS.get(provider, "no-token") self.llmConfig = llmConfig
or os.getenv("OPENAI_API_KEY")
)
self.instruction = instruction self.instruction = instruction
self.chunk_token_threshold = chunk_token_threshold self.chunk_token_threshold = chunk_token_threshold
self.overlap_rate = overlap_rate self.overlap_rate = overlap_rate
@@ -785,8 +809,6 @@ class LLMContentFilter(RelevantContentFilter):
# self.char_token_rate = char_token_rate or word_token_rate / 5 # self.char_token_rate = char_token_rate or word_token_rate / 5
# self.token_rate = word_token_rate if chunk_mode == "word" else self.char_token_rate # self.token_rate = word_token_rate if chunk_mode == "word" else self.char_token_rate
self.token_rate = word_token_rate or WORD_TOKEN_RATE self.token_rate = word_token_rate or WORD_TOKEN_RATE
self.base_url = base_url
self.api_base = api_base or base_url
self.extra_args = extra_args or {} self.extra_args = extra_args or {}
self.ignore_cache = ignore_cache self.ignore_cache = ignore_cache
self.verbose = verbose self.verbose = verbose
@@ -801,12 +823,13 @@ class LLMContentFilter(RelevantContentFilter):
**AsyncLogger.DEFAULT_ICONS, **AsyncLogger.DEFAULT_ICONS,
"LLM": "", # Star for LLM operations "LLM": "", # Star for LLM operations
"CHUNK": "", # Diamond for chunks "CHUNK": "", # Diamond for chunks
"CACHE": "", # Lightning for cache operations "CACHE": "", # Lightning for cache operations
}, },
colors={ colors={
**AsyncLogger.DEFAULT_COLORS, **AsyncLogger.DEFAULT_COLORS,
LogLevel.INFO: Fore.MAGENTA + Style.DIM, # Dimmed purple for LLM ops LogLevel.INFO: Fore.MAGENTA
} + Style.DIM, # Dimmed purple for LLM ops
},
) )
else: else:
self.logger = None self.logger = None
@@ -814,6 +837,17 @@ class LLMContentFilter(RelevantContentFilter):
self.usages = [] self.usages = []
self.total_usage = TokenUsage() self.total_usage = TokenUsage()
def __setattr__(self, name, value):
    """Set an attribute, rejecting deprecated LLM connection parameters.

    Names listed in ``_UNWANTED_PROPS`` may only be assigned the default
    value declared for the same-named ``__init__`` parameter (compared by
    identity). Any other value raises, pointing the caller at the
    replacement named in ``_UNWANTED_PROPS``.

    Args:
        name: Attribute name being assigned.
        value: Value being assigned.

    Raises:
        AttributeError: If ``name`` is deprecated and ``value`` is not the
            constructor default for that parameter.
    """
    # TODO: Planning to set properties dynamically based on the __init__ signature
    if name in self._UNWANTED_PROPS:
        # Signature introspection is comparatively expensive, so it is only
        # done for the few deprecated names rather than on every assignment.
        default = inspect.signature(self.__init__).parameters[name].default
        if value is not default:
            raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")

    super().__setattr__(name, value)
def _get_cache_key(self, html: str, instruction: str) -> str: def _get_cache_key(self, html: str, instruction: str) -> str:
"""Generate a unique cache key based on HTML and instruction""" """Generate a unique cache key based on HTML and instruction"""
content = f"{html}{instruction}" content = f"{html}{instruction}"
@@ -823,15 +857,13 @@ class LLMContentFilter(RelevantContentFilter):
"""Split text into chunks with overlap using char or word mode.""" """Split text into chunks with overlap using char or word mode."""
ov = int(self.chunk_token_threshold * self.overlap_rate) ov = int(self.chunk_token_threshold * self.overlap_rate)
sections = merge_chunks( sections = merge_chunks(
docs = [text], docs=[text],
target_size= self.chunk_token_threshold, target_size=self.chunk_token_threshold,
overlap=ov, overlap=ov,
word_token_ratio=self.word_token_rate word_token_ratio=self.word_token_rate,
) )
return sections return sections
def filter_content(self, html: str, ignore_cache: bool = True) -> List[str]: def filter_content(self, html: str, ignore_cache: bool = True) -> List[str]:
if not html or not isinstance(html, str): if not html or not isinstance(html, str):
return [] return []
@@ -840,8 +872,8 @@ class LLMContentFilter(RelevantContentFilter):
self.logger.info( self.logger.info(
"Starting LLM markdown content filtering process", "Starting LLM markdown content filtering process",
tag="LLM", tag="LLM",
params={"provider": self.provider}, params={"provider": self.llmConfig.provider},
colors={"provider": Fore.CYAN} colors={"provider": Fore.CYAN},
) )
# Cache handling # Cache handling
@@ -857,17 +889,19 @@ class LLMContentFilter(RelevantContentFilter):
if self.logger: if self.logger:
self.logger.info("Found cached markdown result", tag="CACHE") self.logger.info("Found cached markdown result", tag="CACHE")
try: try:
with cache_file.open('r') as f: with cache_file.open("r") as f:
cached_data = json.load(f) cached_data = json.load(f)
usage = TokenUsage(**cached_data['usage']) usage = TokenUsage(**cached_data["usage"])
self.usages.append(usage) self.usages.append(usage)
self.total_usage.completion_tokens += usage.completion_tokens self.total_usage.completion_tokens += usage.completion_tokens
self.total_usage.prompt_tokens += usage.prompt_tokens self.total_usage.prompt_tokens += usage.prompt_tokens
self.total_usage.total_tokens += usage.total_tokens self.total_usage.total_tokens += usage.total_tokens
return cached_data['blocks'] return cached_data["blocks"]
except Exception as e: except Exception as e:
if self.logger: if self.logger:
self.logger.error(f"LLM markdown: Cache read error: {str(e)}", tag="CACHE") self.logger.error(
f"LLM markdown: Cache read error: {str(e)}", tag="CACHE"
)
# Split into chunks # Split into chunks
html_chunks = self._merge_chunks(html) html_chunks = self._merge_chunks(html)
@@ -876,7 +910,7 @@ class LLMContentFilter(RelevantContentFilter):
"LLM markdown: Split content into {chunk_count} chunks", "LLM markdown: Split content into {chunk_count} chunks",
tag="CHUNK", tag="CHUNK",
params={"chunk_count": len(html_chunks)}, params={"chunk_count": len(html_chunks)},
colors={"chunk_count": Fore.YELLOW} colors={"chunk_count": Fore.YELLOW},
) )
start_time = time.time() start_time = time.time()
@@ -889,15 +923,13 @@ class LLMContentFilter(RelevantContentFilter):
self.logger.debug( self.logger.debug(
"LLM markdown: Processing chunk {chunk_num}/{total_chunks}", "LLM markdown: Processing chunk {chunk_num}/{total_chunks}",
tag="CHUNK", tag="CHUNK",
params={ params={"chunk_num": i + 1, "total_chunks": len(html_chunks)},
"chunk_num": i + 1,
"total_chunks": len(html_chunks)
}
) )
prompt_variables = { prompt_variables = {
"HTML": escape_json_string(sanitize_html(chunk)), "HTML": escape_json_string(sanitize_html(chunk)),
"REQUEST": self.instruction or "Convert this HTML into clean, relevant markdown, removing any noise or irrelevant content." "REQUEST": self.instruction
or "Convert this HTML into clean, relevant markdown, removing any noise or irrelevant content.",
} }
prompt = PROMPT_FILTER_CONTENT prompt = PROMPT_FILTER_CONTENT
@@ -905,37 +937,36 @@ class LLMContentFilter(RelevantContentFilter):
prompt = prompt.replace("{" + var + "}", value) prompt = prompt.replace("{" + var + "}", value)
def _proceed_with_chunk( def _proceed_with_chunk(
provider: str, provider: str,
prompt: str, prompt: str,
api_token: str, api_token: str,
base_url: Optional[str] = None, base_url: Optional[str] = None,
extra_args: Dict = {} extra_args: Dict = {},
) -> List[str]: ) -> List[str]:
if self.logger: if self.logger:
self.logger.info( self.logger.info(
"LLM Markdown: Processing chunk {chunk_num}", "LLM Markdown: Processing chunk {chunk_num}",
tag="CHUNK", tag="CHUNK",
params={"chunk_num": i + 1} params={"chunk_num": i + 1},
) )
return perform_completion_with_backoff( return perform_completion_with_backoff(
provider, provider,
prompt, prompt,
api_token, api_token,
base_url=base_url, base_url=base_url,
extra_args=extra_args extra_args=extra_args,
) )
future = executor.submit( future = executor.submit(
_proceed_with_chunk, _proceed_with_chunk,
self.provider, self.llmConfig.provider,
prompt, prompt,
self.api_token, self.llmConfig.api_token,
self.api_base, self.llmConfig.base_url,
self.extra_args self.extra_args,
) )
futures.append((i, future)) futures.append((i, future))
# Collect results in order # Collect results in order
ordered_results = [] ordered_results = []
for i, future in sorted(futures): for i, future in sorted(futures):
@@ -947,34 +978,39 @@ class LLMContentFilter(RelevantContentFilter):
completion_tokens=response.usage.completion_tokens, completion_tokens=response.usage.completion_tokens,
prompt_tokens=response.usage.prompt_tokens, prompt_tokens=response.usage.prompt_tokens,
total_tokens=response.usage.total_tokens, total_tokens=response.usage.total_tokens,
completion_tokens_details=response.usage.completion_tokens_details.__dict__ completion_tokens_details=(
if response.usage.completion_tokens_details else {}, response.usage.completion_tokens_details.__dict__
prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.completion_tokens_details
if response.usage.prompt_tokens_details else {}, else {}
),
prompt_tokens_details=(
response.usage.prompt_tokens_details.__dict__
if response.usage.prompt_tokens_details
else {}
),
) )
self.usages.append(usage) self.usages.append(usage)
self.total_usage.completion_tokens += usage.completion_tokens self.total_usage.completion_tokens += usage.completion_tokens
self.total_usage.prompt_tokens += usage.prompt_tokens self.total_usage.prompt_tokens += usage.prompt_tokens
self.total_usage.total_tokens += usage.total_tokens self.total_usage.total_tokens += usage.total_tokens
blocks = extract_xml_data(["content"], response.choices[0].message.content)["content"] blocks = extract_xml_data(
["content"], response.choices[0].message.content
)["content"]
if blocks: if blocks:
ordered_results.append(blocks) ordered_results.append(blocks)
if self.logger: if self.logger:
self.logger.success( self.logger.success(
"LLM markdown: Successfully processed chunk {chunk_num}", "LLM markdown: Successfully processed chunk {chunk_num}",
tag="CHUNK", tag="CHUNK",
params={"chunk_num": i + 1} params={"chunk_num": i + 1},
) )
except Exception as e: except Exception as e:
if self.logger: if self.logger:
self.logger.error( self.logger.error(
"LLM markdown: Error processing chunk {chunk_num}: {error}", "LLM markdown: Error processing chunk {chunk_num}: {error}",
tag="CHUNK", tag="CHUNK",
params={ params={"chunk_num": i + 1, "error": str(e)},
"chunk_num": i + 1,
"error": str(e)
}
) )
end_time = time.time() end_time = time.time()
@@ -983,17 +1019,14 @@ class LLMContentFilter(RelevantContentFilter):
"LLM markdown: Completed processing in {time:.2f}s", "LLM markdown: Completed processing in {time:.2f}s",
tag="LLM", tag="LLM",
params={"time": end_time - start_time}, params={"time": end_time - start_time},
colors={"time": Fore.YELLOW} colors={"time": Fore.YELLOW},
) )
result = ordered_results if ordered_results else [] result = ordered_results if ordered_results else []
# Cache the final result # Cache the final result
cache_data = { cache_data = {"blocks": result, "usage": self.total_usage.__dict__}
'blocks': result, with cache_file.open("w") as f:
'usage': self.total_usage.__dict__
}
with cache_file.open('w') as f:
json.dump(cache_data, f) json.dump(cache_data, f)
if self.logger: if self.logger:
self.logger.info("Cached results for future use", tag="CACHE") self.logger.info("Cached results for future use", tag="CACHE")

View File

@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
import inspect
from typing import Any, List, Dict, Optional from typing import Any, List, Dict, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
import json import json
@@ -496,20 +497,26 @@ class LLMExtractionStrategy(ExtractionStrategy):
usages: List of individual token usages. usages: List of individual token usages.
total_usage: Accumulated token usage. total_usage: Accumulated token usage.
""" """
# Deprecated constructor arguments, mapped to the migration hint that
# __setattr__ appends to its AttributeError message when one of them is set.
_UNWANTED_PROPS = {
    'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")',
    'api_token' : 'Instead, use llmConfig=LlmConfig(api_token="...")',
    'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
    'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
}
def __init__( def __init__(
self, self,
llmConfig: 'LLMConfig' = None,
instruction: str = None,
provider: str = DEFAULT_PROVIDER, provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None, api_token: Optional[str] = None,
instruction: str = None, base_url: str = None,
api_base: str = None,
schema: Dict = None, schema: Dict = None,
extraction_type="block", extraction_type="block",
chunk_token_threshold=CHUNK_TOKEN_THRESHOLD, chunk_token_threshold=CHUNK_TOKEN_THRESHOLD,
overlap_rate=OVERLAP_RATE, overlap_rate=OVERLAP_RATE,
word_token_rate=WORD_TOKEN_RATE, word_token_rate=WORD_TOKEN_RATE,
apply_chunking=True, apply_chunking=True,
api_base: str =None,
base_url: str =None,
input_format: str = "markdown", input_format: str = "markdown",
verbose=False, verbose=False,
**kwargs, **kwargs,
@@ -518,6 +525,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
Initialize the strategy with clustering parameters. Initialize the strategy with clustering parameters.
Args: Args:
llmConfig: The LLM configuration object.
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3". provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
api_token: The API token for the provider. api_token: The API token for the provider.
instruction: The instruction to use for the LLM model. instruction: The instruction to use for the LLM model.
@@ -536,40 +544,38 @@ class LLMExtractionStrategy(ExtractionStrategy):
""" """
super().__init__( input_format=input_format, **kwargs) super().__init__( input_format=input_format, **kwargs)
self.llmConfig = llmConfig
self.provider = provider self.provider = provider
if api_token and not api_token.startswith("env:"): self.api_token = api_token
self.api_token = api_token self.base_url = base_url
elif api_token and api_token.startswith("env:"): self.api_base = api_base
self.api_token = os.getenv(api_token[4:])
else:
self.api_token = (
PROVIDER_MODELS.get(provider, "no-token")
or os.getenv("OPENAI_API_KEY")
)
self.instruction = instruction self.instruction = instruction
self.extract_type = extraction_type self.extract_type = extraction_type
self.schema = schema self.schema = schema
if schema: if schema:
self.extract_type = "schema" self.extract_type = "schema"
self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD
self.overlap_rate = overlap_rate self.overlap_rate = overlap_rate
self.word_token_rate = word_token_rate self.word_token_rate = word_token_rate
self.apply_chunking = apply_chunking self.apply_chunking = apply_chunking
self.base_url = base_url
self.api_base = api_base or base_url
self.extra_args = kwargs.get("extra_args", {}) self.extra_args = kwargs.get("extra_args", {})
if not self.apply_chunking: if not self.apply_chunking:
self.chunk_token_threshold = 1e9 self.chunk_token_threshold = 1e9
self.verbose = verbose self.verbose = verbose
self.usages = [] # Store individual usages self.usages = [] # Store individual usages
self.total_usage = TokenUsage() # Accumulated usage self.total_usage = TokenUsage() # Accumulated usage
if not self.api_token:
raise ValueError( def __setattr__(self, name, value):
"API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable." """Handle attribute setting."""
) # TODO: Planning to set properties dynamically based on the __init__ signature
sig = inspect.signature(self.__init__)
all_params = sig.parameters # Dictionary of parameter names and their details
if name in self._UNWANTED_PROPS and value is not all_params[name].default:
raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")
super().__setattr__(name, value)
def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]: def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
""" """
@@ -603,7 +609,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
if self.extract_type == "schema" and self.schema: if self.extract_type == "schema" and self.schema:
variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) # if type of self.schema is dict else self.schema
prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
for variable in variable_values: for variable in variable_values:
@@ -612,10 +618,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
) )
response = perform_completion_with_backoff( response = perform_completion_with_backoff(
self.provider, self.llmConfig.provider,
prompt_with_variables, prompt_with_variables,
self.api_token, self.llmConfig.api_token,
base_url=self.api_base or self.base_url, base_url=self.llmConfig.base_url,
extra_args=self.extra_args, extra_args=self.extra_args,
) # , json_response=self.extract_type == "schema") ) # , json_response=self.extract_type == "schema")
# Track usage # Track usage
@@ -695,7 +701,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
overlap=int(self.chunk_token_threshold * self.overlap_rate), overlap=int(self.chunk_token_threshold * self.overlap_rate),
) )
extracted_content = [] extracted_content = []
if self.provider.startswith("groq/"): if self.llmConfig.provider.startswith("groq/"):
# Sequential processing with a delay # Sequential processing with a delay
for ix, section in enumerate(merged_sections): for ix, section in enumerate(merged_sections):
extract_func = partial(self.extract, url) extract_func = partial(self.extract, url)
@@ -1036,14 +1042,20 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
"""Get attribute value from element""" """Get attribute value from element"""
pass pass
# Legacy keyword arguments of generate_schema, mapped to the migration hint
# that is appended to the AttributeError raised when a caller still passes
# one of them. Both hints must name the real class, LlmConfig.
_GENERATE_SCHEMA_UNWANTED_PROPS = {
    'provider': 'Instead, use llmConfig=LlmConfig(provider="...")',
    'api_token': 'Instead, use llmConfig=LlmConfig(api_token="...")',
}
@staticmethod @staticmethod
def generate_schema( def generate_schema(
html: str, html: str,
schema_type: str = "CSS", # or XPATH schema_type: str = "CSS", # or XPATH
query: str = None, query: str = None,
target_json_example: str = None, target_json_example: str = None,
provider: str = "gpt-4o", llmConfig: 'LLMConfig' = None,
api_token: str = os.getenv("OPENAI_API_KEY"), provider: str = None,
api_token: str = None,
**kwargs **kwargs
) -> dict: ) -> dict:
""" """
@@ -1052,8 +1064,9 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
Args: Args:
html (str): The HTML content to analyze html (str): The HTML content to analyze
query (str, optional): Natural language description of what data to extract query (str, optional): Natural language description of what data to extract
provider (str): LLM provider to use provider (str): Legacy Parameter. LLM provider to use
api_token (str): API token for LLM provider api_token (str): Legacy Parameter. API token for LLM provider
llmConfig (LlmConfig): LLM configuration object
prompt (str, optional): Custom prompt template to use prompt (str, optional): Custom prompt template to use
**kwargs: Additional args passed to perform_completion_with_backoff **kwargs: Additional args passed to perform_completion_with_backoff
@@ -1062,6 +1075,9 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
""" """
from .prompts import JSON_SCHEMA_BUILDER from .prompts import JSON_SCHEMA_BUILDER
from .utils import perform_completion_with_backoff from .utils import perform_completion_with_backoff
for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
if locals()[name] is not None:
raise AttributeError(f"Setting '{name}' is deprecated. {message}")
# Use default or custom prompt # Use default or custom prompt
prompt_template = JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH prompt_template = JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH
@@ -1114,10 +1130,10 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
try: try:
# Call LLM with backoff handling # Call LLM with backoff handling
response = perform_completion_with_backoff( response = perform_completion_with_backoff(
provider=provider, provider=llmConfig.provider,
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]), prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
json_response = True, json_response = True,
api_token=api_token, api_token=llmConfig.api_token,
**kwargs **kwargs
) )

View File

@@ -11,6 +11,7 @@ import asyncio
import os import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_configs import LlmConfig
from crawl4ai.extraction_strategy import ( from crawl4ai.extraction_strategy import (
LLMExtractionStrategy, LLMExtractionStrategy,
JsonCssExtractionStrategy, JsonCssExtractionStrategy,
@@ -60,22 +61,19 @@ async def main():
# 1. LLM Extraction with different input formats # 1. LLM Extraction with different input formats
markdown_strategy = LLMExtractionStrategy( markdown_strategy = LLMExtractionStrategy(
provider="openai/gpt-4o-mini", llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
api_token=os.getenv("OPENAI_API_KEY"),
instruction="Extract product information including name, price, and description", instruction="Extract product information including name, price, and description",
) )
html_strategy = LLMExtractionStrategy( html_strategy = LLMExtractionStrategy(
input_format="html", input_format="html",
provider="openai/gpt-4o-mini", llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
api_token=os.getenv("OPENAI_API_KEY"),
instruction="Extract product information from HTML including structured data", instruction="Extract product information from HTML including structured data",
) )
fit_markdown_strategy = LLMExtractionStrategy( fit_markdown_strategy = LLMExtractionStrategy(
input_format="fit_markdown", input_format="fit_markdown",
provider="openai/gpt-4o-mini", llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
api_token=os.getenv("OPENAI_API_KEY"),
instruction="Extract product information from cleaned markdown", instruction="Extract product information from cleaned markdown",
) )

View File

@@ -1,3 +1,4 @@
from crawl4ai.async_configs import LlmConfig
from crawl4ai.extraction_strategy import * from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import * from crawl4ai.crawler_strategy import *
import asyncio import asyncio
@@ -25,8 +26,7 @@ async def main():
word_count_threshold=1, word_count_threshold=1,
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
provider="groq/llama-3.1-70b-versatile", llmConfig=LlmConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
api_token=os.getenv("GROQ_API_KEY"),
schema=OpenAIModelFee.model_json_schema(), schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="From the crawled content, extract all mentioned model names along with their " instruction="From the crawled content, extract all mentioned model names along with their "

View File

@@ -1,6 +1,7 @@
import os import os
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_configs import LlmConfig
from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter(): async def test_llm_filter():
@@ -22,8 +23,7 @@ async def test_llm_filter():
# Initialize LLM filter with focused instruction # Initialize LLM filter with focused instruction
filter = LLMContentFilter( filter = LLMContentFilter(
provider="openai/gpt-4o", llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
api_token=os.getenv('OPENAI_API_KEY'),
instruction=""" instruction="""
Focus on extracting the core educational content about Python classes. Focus on extracting the core educational content about Python classes.
Include: Include:
@@ -43,8 +43,7 @@ async def test_llm_filter():
) )
filter = LLMContentFilter( filter = LLMContentFilter(
provider="openai/gpt-4o", llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
api_token=os.getenv('OPENAI_API_KEY'),
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2 chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
ignore_cache = True, ignore_cache = True,
instruction=""" instruction="""

View File

@@ -1,5 +1,7 @@
import os, sys import os, sys
from crawl4ai.async_configs import LlmConfig
sys.path.append( sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
) )
@@ -209,8 +211,7 @@ async def extract_structured_data_using_llm(
word_count_threshold=1, word_count_threshold=1,
page_timeout=80000, page_timeout=80000,
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider=provider, llmConfig=LlmConfig(provider=provider,api_token=api_token),
api_token=api_token,
schema=OpenAIModelFee.model_json_schema(), schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.

View File

@@ -1,5 +1,7 @@
import os, sys import os, sys
from crawl4ai.async_configs import LlmConfig
# append parent directory to system path # append parent directory to system path
sys.path.append( sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -145,8 +147,7 @@ async def extract_structured_data_using_llm(
url="https://openai.com/api/pricing/", url="https://openai.com/api/pricing/",
word_count_threshold=1, word_count_threshold=1,
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider=provider, llmConfig=LlmConfig(provider=provider,api_token=api_token),
api_token=api_token,
schema=OpenAIModelFee.model_json_schema(), schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
@@ -569,8 +570,7 @@ async def generate_knowledge_graph():
relationships: List[Relationship] relationships: List[Relationship]
extraction_strategy = LLMExtractionStrategy( extraction_strategy = LLMExtractionStrategy(
provider="openai/gpt-4o-mini", # Or any other provider, including Ollama and open source models llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
api_token=os.getenv("OPENAI_API_KEY"), # In case of Ollama just pass "no-token"
schema=KnowledgeGraph.model_json_schema(), schema=KnowledgeGraph.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="""Extract entities and relationships from the given text.""", instruction="""Extract entities and relationships from the given text.""",

View File

@@ -1,5 +1,6 @@
import os import os
import time import time
from crawl4ai.async_configs import LlmConfig
from crawl4ai.web_crawler import WebCrawler from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import * from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import * from crawl4ai.extraction_strategy import *
@@ -178,7 +179,7 @@ def add_llm_extraction_strategy(crawler):
result = crawler.run( result = crawler.run(
url="https://www.nbcnews.com/business", url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY") llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
), ),
) )
cprint( cprint(
@@ -197,8 +198,7 @@ def add_llm_extraction_strategy(crawler):
result = crawler.run( result = crawler.run(
url="https://www.nbcnews.com/business", url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o", llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
api_token=os.getenv("OPENAI_API_KEY"),
instruction="I am interested in only financial news", instruction="I am interested in only financial news",
), ),
) )
@@ -210,8 +210,7 @@ def add_llm_extraction_strategy(crawler):
result = crawler.run( result = crawler.run(
url="https://www.nbcnews.com/business", url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o", llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
api_token=os.getenv("OPENAI_API_KEY"),
instruction="Extract only content related to technology", instruction="Extract only content related to technology",
), ),
) )

View File

@@ -245,11 +245,32 @@ run_config = CrawlerRunConfig(
) )
``` ```
## 3. Putting It All Together # 3. **LlmConfig** - Setting up LLM providers
LlmConfig is useful to pass LLM provider config to strategies and functions that rely on LLMs to do extraction, filtering, schema generation etc. Currently it can be used in the following -
1. LLMExtractionStrategy
2. LLMContentFilter
3. JsonCssExtractionStrategy.generate_schema
4. JsonXPathExtractionStrategy.generate_schema
## 3.1 Parameters
| **Parameter** | **Type / Default** | **What It Does** |
|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
| **`provider`** | `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)* | Which LLM provider to use.
| **`api_token`** |1.Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then,`"GEMINI_API_KEY"` will be read from environment variables <br/> 2. API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"` <br/> 3. Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"` | API token to use for the given provider
| **`base_url`** |Optional. Custom API endpoint | If your provider has a custom endpoint
## 3.2 Example Usage
```python
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
```
## 4. Putting It All Together
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent. - **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
- **Use** `CrawlerRunConfig` for each crawl's **context**: how to filter content, handle caching, wait for dynamic elements, or run JS. - **Use** `CrawlerRunConfig` for each crawl's **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`). - **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
- **Use** `LlmConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema`
```python ```python
# Create a modified copy with the clone() method # Create a modified copy with the clone() method

View File

@@ -131,6 +131,7 @@ OverlappingWindowChunking(
```python ```python
from pydantic import BaseModel from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai.async_configs import LlmConfig
# Define schema # Define schema
class Article(BaseModel): class Article(BaseModel):
@@ -140,7 +141,7 @@ class Article(BaseModel):
# Create strategy # Create strategy
strategy = LLMExtractionStrategy( strategy = LLMExtractionStrategy(
provider="ollama/llama2", llmConfig = LlmConfig(provider="ollama/llama2"),
schema=Article.schema(), schema=Article.schema(),
instruction="Extract article details" instruction="Extract article details"
) )
@@ -197,6 +198,7 @@ result = await crawler.arun(
```python ```python
from crawl4ai.chunking_strategy import OverlappingWindowChunking from crawl4ai.chunking_strategy import OverlappingWindowChunking
from crawl4ai.async_configs import LlmConfig
# Create chunking strategy # Create chunking strategy
chunker = OverlappingWindowChunking( chunker = OverlappingWindowChunking(
@@ -206,7 +208,7 @@ chunker = OverlappingWindowChunking(
# Use with extraction strategy # Use with extraction strategy
strategy = LLMExtractionStrategy( strategy = LLMExtractionStrategy(
provider="ollama/llama2", llmConfig = LlmConfig(provider="ollama/llama2"),
chunking_strategy=chunker chunking_strategy=chunker
) )

View File

@@ -1,9 +1,10 @@
# Browser & Crawler Configuration (Quick Overview) # Browser, Crawler & LLM Configuration (Quick Overview)
Crawl4AI's flexibility stems from three key classes: Crawl4AI's flexibility stems from three key classes:
1. **`BrowserConfig`** Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent). 1. **`BrowserConfig`** Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
2. **`CrawlerRunConfig`** Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.). 2. **`CrawlerRunConfig`** Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
3. **`LlmConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.)
In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md). In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md).
@@ -234,13 +235,37 @@ The `clone()` method:
--- ---
## 3. Putting It All Together
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` depending on each call's needs:
## 3. LlmConfig Essentials
### Key fields to note
1. **`provider`**:
- Which LLM provider to use.
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*
2. **`api_token`**:
- Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then,`"GEMINI_API_KEY"` will be read from environment variables
- API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`
- Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"`
3. **`base_url`**:
- If your provider has a custom endpoint
```python
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
```
## 4. Putting It All Together
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LlmConfig` depending on each call's needs:
```python ```python
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
async def main(): async def main():
@@ -262,8 +287,39 @@ async def main():
} }
extraction = JsonCssExtractionStrategy(schema) extraction = JsonCssExtractionStrategy(schema)
# 3) Crawler run config: skip cache, use extraction # 3) Example LLM content filtering
gemini_config = LlmConfig(
provider="gemini/gemini-1.5-pro",
api_token = "env:GEMINI_API_TOKEN"
)
# Initialize LLM filter with specific instruction
filter = LLMContentFilter(
llmConfig=gemini_config, # or your preferred provider
instruction="""
Focus on extracting the core educational content.
Include:
- Key concepts and explanations
- Important code examples
- Essential technical details
Exclude:
- Navigation elements
- Sidebars
- Footer content
Format the output as clean markdown with proper code blocks and headers.
""",
chunk_token_threshold=500, # Adjust based on your needs
verbose=True
)
md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
)
# 4) Crawler run config: skip cache, use extraction
run_conf = CrawlerRunConfig( run_conf = CrawlerRunConfig(
markdown_generator=md_generator,
extraction_strategy=extraction, extraction_strategy=extraction,
cache_mode=CacheMode.BYPASS, cache_mode=CacheMode.BYPASS,
) )
@@ -283,11 +339,11 @@ if __name__ == "__main__":
--- ---
## 4. Next Steps ## 5. Next Steps
For a **detailed list** of available parameters (including advanced ones), see: For a **detailed list** of available parameters (including advanced ones), see:
- [BrowserConfig and CrawlerRunConfig Reference](../api/parameters.md) - [BrowserConfig, CrawlerRunConfig & LlmConfig Reference](../api/parameters.md)
You can explore topics like: You can explore topics like:
@@ -298,11 +354,12 @@ You can explore topics like:
--- ---
## 5. Conclusion ## 6. Conclusion
**BrowserConfig** and **CrawlerRunConfig** give you straightforward ways to define: **BrowserConfig**, **CrawlerRunConfig** and **LlmConfig** give you straightforward ways to define:
- **Which** browser to launch, how it should run, and any proxy or user agent needs. - **Which** browser to launch, how it should run, and any proxy or user agent needs.
- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc. - **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc.
- **Which** LLM provider to use, api token, temperature and base url for custom endpoints
Use them together for **clear, maintainable** code, and when you need more specialized behavior, check out the advanced parameters in the [reference docs](../api/parameters.md). Happy crawling! Use them together for **clear, maintainable** code, and when you need more specialized behavior, check out the advanced parameters in the [reference docs](../api/parameters.md). Happy crawling!

View File

@@ -211,7 +211,7 @@ if __name__ == "__main__":
import asyncio import asyncio
import json import json
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
class ArticleData(BaseModel): class ArticleData(BaseModel):
@@ -220,8 +220,7 @@ class ArticleData(BaseModel):
async def main(): async def main():
llm_strategy = LLMExtractionStrategy( llm_strategy = LLMExtractionStrategy(
provider="openai/gpt-4", llmConfig = LlmConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY"),
api_token="sk-YOUR_API_KEY",
schema=ArticleData.schema(), schema=ArticleData.schema(),
extraction_type="schema", extraction_type="schema",
instruction="Extract 'headline' and a short 'summary' from the content." instruction="Extract 'headline' and a short 'summary' from the content."

View File

@@ -175,14 +175,13 @@ prune_filter = PruningContentFilter(
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
```python ```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LlmConfig
from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai.content_filter_strategy import LLMContentFilter
async def main(): async def main():
# Initialize LLM filter with specific instruction # Initialize LLM filter with specific instruction
filter = LLMContentFilter( filter = LLMContentFilter(
provider="openai/gpt-4o", # or your preferred provider llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable
api_token="your-api-token", # or use environment variable
instruction=""" instruction="""
Focus on extracting the core educational content. Focus on extracting the core educational content.
Include: Include:

View File

@@ -128,6 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import LlmConfig
# Generate a schema (one-time cost) # Generate a schema (one-time cost)
html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>" html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
@@ -135,15 +136,13 @@ html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</
# Using OpenAI (requires API token) # Using OpenAI (requires API token)
schema = JsonCssExtractionStrategy.generate_schema( schema = JsonCssExtractionStrategy.generate_schema(
html, html,
provider="openai/gpt-4o", # Default provider llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token") # Required for OpenAI
api_token="your-openai-token" # Required for OpenAI
) )
# Or using Ollama (open source, no token needed) # Or using Ollama (open source, no token needed)
schema = JsonCssExtractionStrategy.generate_schema( schema = JsonCssExtractionStrategy.generate_schema(
html, html,
provider="ollama/llama3.3", # Open source alternative llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
api_token=None # Not needed for Ollama
) )
# Use the schema for fast, repeated extractions # Use the schema for fast, repeated extractions
@@ -212,7 +211,7 @@ import os
import json import json
import asyncio import asyncio
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
class OpenAIModelFee(BaseModel): class OpenAIModelFee(BaseModel):
@@ -242,8 +241,7 @@ async def extract_structured_data_using_llm(
word_count_threshold=1, word_count_threshold=1,
page_timeout=80000, page_timeout=80000,
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider=provider, llmConfig = LlmConfig(provider=provider,api_token=api_token),
api_token=api_token,
schema=OpenAIModelFee.model_json_schema(), schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
@@ -259,12 +257,6 @@ async def extract_structured_data_using_llm(
print(result.extracted_content) print(result.extracted_content)
if __name__ == "__main__": if __name__ == "__main__":
# Use ollama with llama3.3
# asyncio.run(
# extract_structured_data_using_llm(
# provider="ollama/llama3.3", api_token="no-token"
# )
# )
asyncio.run( asyncio.run(
extract_structured_data_using_llm( extract_structured_data_using_llm(

View File

@@ -71,8 +71,7 @@ Below is an overview of important LLM extraction parameters. All are typically s
```python ```python
extraction_strategy = LLMExtractionStrategy( extraction_strategy = LLMExtractionStrategy(
provider="openai/gpt-4", llmConfig = LlmConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
api_token="YOUR_OPENAI_KEY",
schema=MyModel.model_json_schema(), schema=MyModel.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="Extract a list of items from the text with 'name' and 'price' fields.", instruction="Extract a list of items from the text with 'name' and 'price' fields.",
@@ -97,7 +96,7 @@ import asyncio
import json import json
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing import List from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
class Product(BaseModel): class Product(BaseModel):
@@ -107,9 +106,8 @@ class Product(BaseModel):
async def main(): async def main():
# 1. Define the LLM extraction strategy # 1. Define the LLM extraction strategy
llm_strategy = LLMExtractionStrategy( llm_strategy = LLMExtractionStrategy(
provider="openai/gpt-4o-mini", # e.g. "ollama/llama2" llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
api_token=os.getenv('OPENAI_API_KEY'), schema=Product.schema_json(), # Or use model_json_schema()
schema=Product.schema_json(), # Or use model_json_schema()
extraction_type="schema", extraction_type="schema",
instruction="Extract all product objects with 'name' and 'price' from the content.", instruction="Extract all product objects with 'name' and 'price' from the content.",
chunk_token_threshold=1000, chunk_token_threshold=1000,

View File

@@ -415,6 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai.async_configs import LlmConfig
# Sample HTML with product information # Sample HTML with product information
html = """ html = """
@@ -433,17 +434,15 @@ html = """
# Option 1: Using OpenAI (requires API token) # Option 1: Using OpenAI (requires API token)
css_schema = JsonCssExtractionStrategy.generate_schema( css_schema = JsonCssExtractionStrategy.generate_schema(
html, html,
schema_type="css", # This is the default schema_type="css",
provider="openai/gpt-4o", # Default provider llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token")
api_token="your-openai-token" # Required for OpenAI
) )
# Option 2: Using Ollama (open source, no token needed) # Option 2: Using Ollama (open source, no token needed)
xpath_schema = JsonXPathExtractionStrategy.generate_schema( xpath_schema = JsonXPathExtractionStrategy.generate_schema(
html, html,
schema_type="xpath", schema_type="xpath",
provider="ollama/llama3.3", # Open source alternative llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
api_token=None # Not needed for Ollama
) )
# Use the generated schema for fast, repeated extractions # Use the generated schema for fast, repeated extractions

View File

@@ -18,7 +18,7 @@ nav:
- "Command Line Interface": "core/cli.md" - "Command Line Interface": "core/cli.md"
- "Simple Crawling": "core/simple-crawling.md" - "Simple Crawling": "core/simple-crawling.md"
- "Crawler Result": "core/crawler-result.md" - "Crawler Result": "core/crawler-result.md"
- "Browser & Crawler Config": "core/browser-crawler-config.md" - "Browser, Crawler & LLM Config": "core/browser-crawler-config.md"
- "Markdown Generation": "core/markdown-generation.md" - "Markdown Generation": "core/markdown-generation.md"
- "Fit Markdown": "core/fit-markdown.md" - "Fit Markdown": "core/fit-markdown.md"
- "Page Interaction": "core/page-interaction.md" - "Page Interaction": "core/page-interaction.md"
@@ -46,7 +46,7 @@ nav:
- "AsyncWebCrawler": "api/async-webcrawler.md" - "AsyncWebCrawler": "api/async-webcrawler.md"
- "arun()": "api/arun.md" - "arun()": "api/arun.md"
- "arun_many()": "api/arun_many.md" - "arun_many()": "api/arun_many.md"
- "Browser & Crawler Config": "api/parameters.md" - "Browser, Crawler & LLM Config": "api/parameters.md"
- "CrawlResult": "api/crawl-result.md" - "CrawlResult": "api/crawl-result.md"
- "Strategies": "api/strategies.md" - "Strategies": "api/strategies.md"

View File

@@ -1,6 +1,7 @@
import os import os
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_configs import LlmConfig
from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter(): async def test_llm_filter():
@@ -22,8 +23,7 @@ async def test_llm_filter():
# Initialize LLM filter with focused instruction # Initialize LLM filter with focused instruction
filter = LLMContentFilter( filter = LLMContentFilter(
provider="openai/gpt-4o", llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
api_token=os.getenv('OPENAI_API_KEY'),
instruction=""" instruction="""
Focus on extracting the core educational content about Python classes. Focus on extracting the core educational content about Python classes.
Include: Include:
@@ -43,8 +43,7 @@ async def test_llm_filter():
) )
filter = LLMContentFilter( filter = LLMContentFilter(
provider="openai/gpt-4o", llmConfig = LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
api_token=os.getenv('OPENAI_API_KEY'),
chunk_token_threshold=2 ** 12 * 2, # 4096 * 2 chunk_token_threshold=2 ** 12 * 2, # 4096 * 2
instruction=""" instruction="""
Extract the main educational content while preserving its original wording and substance completely. Your task is to: Extract the main educational content while preserving its original wording and substance completely. Your task is to:

View File

@@ -7,6 +7,7 @@ import json
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir) sys.path.append(parent_dir)
from crawl4ai.async_configs import LlmConfig
from crawl4ai.async_webcrawler import AsyncWebCrawler from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.chunking_strategy import RegexChunking from crawl4ai.chunking_strategy import RegexChunking
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
@@ -48,8 +49,7 @@ async def test_llm_extraction_strategy():
async with AsyncWebCrawler(verbose=True) as crawler: async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://www.nbcnews.com/business" url = "https://www.nbcnews.com/business"
extraction_strategy = LLMExtractionStrategy( extraction_strategy = LLMExtractionStrategy(
provider="openai/gpt-4o-mini", llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
api_token=os.getenv("OPENAI_API_KEY"),
instruction="Extract only content related to technology", instruction="Extract only content related to technology",
) )
result = await crawler.arun( result = await crawler.arun(

View File

@@ -7,6 +7,7 @@ from crawl4ai import (
BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator, BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
) )
from crawl4ai.async_configs import LlmConfig
from crawl4ai.docker_client import Crawl4aiDockerClient from crawl4ai.docker_client import Crawl4aiDockerClient
class Crawl4AiTester: class Crawl4AiTester:
@@ -142,7 +143,7 @@ async def test_with_client():
cache_mode=CacheMode.BYPASS, cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator( markdown_generator=DefaultMarkdownGenerator(
content_filter=LLMContentFilter( content_filter=LLMContentFilter(
provider="openai/gpt-40", llmConfig=LlmConfig(provider="openai/gpt-40"),
instruction="Extract key technical concepts" instruction="Extract key technical concepts"
) )
), ),

View File

@@ -2,6 +2,8 @@ import inspect
from typing import Any, Dict from typing import Any, Dict
from enum import Enum from enum import Enum
from crawl4ai.async_configs import LlmConfig
def to_serializable_dict(obj: Any) -> Dict: def to_serializable_dict(obj: Any) -> Dict:
""" """
Recursively convert an object to a serializable dictionary using {type, params} structure Recursively convert an object to a serializable dictionary using {type, params} structure
@@ -222,7 +224,7 @@ if __name__ == "__main__":
config3 = CrawlerRunConfig( config3 = CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator( markdown_generator=DefaultMarkdownGenerator(
content_filter=LLMContentFilter( content_filter=LLMContentFilter(
provider="openai/gpt-4", llmConfig = LlmConfig(provider="openai/gpt-4"),
instruction="Extract key technical concepts", instruction="Extract key technical concepts",
chunk_token_threshold=2000, chunk_token_threshold=2000,
overlap_rate=0.1 overlap_rate=0.1

View File

@@ -1,4 +1,5 @@
import unittest, os import unittest, os
from crawl4ai.async_configs import LlmConfig
from crawl4ai.web_crawler import WebCrawler from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import ( from crawl4ai.chunking_strategy import (
RegexChunking, RegexChunking,
@@ -42,7 +43,7 @@ class TestWebCrawler(unittest.TestCase):
word_count_threshold=5, word_count_threshold=5,
chunking_strategy=FixedLengthWordChunking(chunk_size=100), chunking_strategy=FixedLengthWordChunking(chunk_size=100),
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY") llmConfig=LlmConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY"))
), ),
bypass_cache=True, bypass_cache=True,
) )