refactor(llm): rename LlmConfig to LLMConfig for consistency

Rename LlmConfig to LLMConfig across the codebase to follow consistent naming conventions.
Update all imports and usages to use the new name.
Update documentation and examples to reflect the change.

BREAKING CHANGE: LlmConfig has been renamed to LLMConfig. Users need to update their imports and usage.
This commit is contained in:
UncleCode
2025-03-05 14:17:04 +08:00
parent e896c08f9c
commit baee4949d3
33 changed files with 362 additions and 174 deletions

View File

@@ -2,7 +2,8 @@
import warnings
from .async_webcrawler import AsyncWebCrawler, CacheMode
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig
from .content_scraping_strategy import (
ContentScrapingStrategy,
WebScrapingStrategy,
@@ -68,6 +69,7 @@ __all__ = [
"AsyncLogger",
"AsyncWebCrawler",
"BrowserProfiler",
"LLMConfig",
"DeepCrawlStrategy",
"BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy",

View File

@@ -13,13 +13,15 @@ from .config import (
from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
from .deep_crawling import DeepCrawlStrategy
from typing import Union, List
from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy
from typing import Union, List
import inspect
from typing import Any, Dict, Optional
from enum import Enum
@@ -1042,7 +1044,7 @@ class CrawlerRunConfig():
return CrawlerRunConfig.from_kwargs(config_dict)
class LlmConfig:
class LLMConfig:
def __init__(
self,
provider: str = DEFAULT_PROVIDER,
@@ -1063,8 +1065,8 @@ class LlmConfig:
@staticmethod
def from_kwargs(kwargs: dict) -> "LlmConfig":
return LlmConfig(
def from_kwargs(kwargs: dict) -> "LLMConfig":
return LLMConfig(
provider=kwargs.get("provider", DEFAULT_PROVIDER),
api_token=kwargs.get("api_token"),
base_url=kwargs.get("base_url"),
@@ -1084,8 +1086,8 @@ class LlmConfig:
**kwargs: Key-value pairs of configuration options to update
Returns:
LLMConfig: A new instance with the specified updates
llm_config: A new instance with the specified updates
"""
config_dict = self.to_dict()
config_dict.update(kwargs)
return LlmConfig.from_kwargs(config_dict)
return LLMConfig.from_kwargs(config_dict)

View File

@@ -1,9 +1,7 @@
import click
import os
import time
import datetime
import sys
import shutil
import humanize
from typing import Dict, Any, Optional, List
import json
@@ -13,7 +11,6 @@ from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.prompt import Prompt, Confirm
from rich.style import Style
from crawl4ai import (
CacheMode,
@@ -26,12 +23,12 @@ from crawl4ai import (
JsonXPathExtractionStrategy,
BM25ContentFilter,
PruningContentFilter,
BrowserProfiler
BrowserProfiler,
LLMConfig
)
from litellm import completion
from pathlib import Path
from crawl4ai.async_configs import LlmConfig
# Initialize rich console
console = Console()
@@ -647,7 +644,7 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
raise click.ClickException("LLM provider and API token are required for LLM extraction")
crawler_cfg.extraction_strategy = LLMExtractionStrategy(
llmConfig=LlmConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]),
llm_config=LLMConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]),
instruction=extract_conf["instruction"],
schema=schema_data,
**extract_conf.get("params", {})

View File

@@ -16,13 +16,13 @@ from .utils import (
extract_xml_data,
merge_chunks,
)
from .types import LLMConfig
from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE
from abc import ABC, abstractmethod
import math
from snowballstemmer import stemmer
from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE, PROVIDER_MODELS
from .models import TokenUsage
from .prompts import PROMPT_FILTER_CONTENT
import os
import json
import hashlib
from pathlib import Path
@@ -770,37 +770,56 @@ class PruningContentFilter(RelevantContentFilter):
class LLMContentFilter(RelevantContentFilter):
"""Content filtering using LLMs to generate relevant markdown."""
"""Content filtering using LLMs to generate relevant markdown.
How it works:
1. Extracts page metadata with fallbacks.
2. Extracts text chunks from the body element.
3. Applies LLMs to generate markdown for each chunk.
4. Filters out chunks below the threshold.
5. Sorts chunks by score in descending order.
6. Returns the top N chunks.
Attributes:
llm_config (LLMConfig): LLM configuration object.
instruction (str): Instruction for LLM markdown generation
chunk_token_threshold (int): Chunk token threshold for splitting (default: 1e9).
overlap_rate (float): Overlap rate for chunking (default: 0.5).
word_token_rate (float): Word token rate for chunking (default: 0.2).
verbose (bool): Enable verbose logging (default: False).
logger (AsyncLogger): Custom logger for LLM operations (optional).
"""
_UNWANTED_PROPS = {
'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")',
'api_token' : 'Instead, use llmConfig=LlMConfig(api_token="...")',
'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
'provider' : 'Instead, use llm_config=LLMConfig(provider="...")',
'api_token' : 'Instead, use llm_config=LlMConfig(api_token="...")',
'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")',
'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")',
}
def __init__(
self,
provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None,
llmConfig: "LlmConfig" = None,
llm_config: "LLMConfig" = None,
instruction: str = None,
chunk_token_threshold: int = int(1e9),
overlap_rate: float = OVERLAP_RATE,
word_token_rate: float = WORD_TOKEN_RATE,
base_url: Optional[str] = None,
api_base: Optional[str] = None,
extra_args: Dict = None,
# char_token_rate: float = WORD_TOKEN_RATE * 5,
# chunk_mode: str = "char",
verbose: bool = False,
logger: Optional[AsyncLogger] = None,
ignore_cache: bool = True,
# Deprecated properties
provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None,
base_url: Optional[str] = None,
api_base: Optional[str] = None,
extra_args: Dict = None,
):
super().__init__(None)
self.provider = provider
self.api_token = api_token
self.base_url = base_url or api_base
self.llmConfig = llmConfig
self.llm_config = llm_config
self.instruction = instruction
self.chunk_token_threshold = chunk_token_threshold
self.overlap_rate = overlap_rate
@@ -872,7 +891,7 @@ class LLMContentFilter(RelevantContentFilter):
self.logger.info(
"Starting LLM markdown content filtering process",
tag="LLM",
params={"provider": self.llmConfig.provider},
params={"provider": self.llm_config.provider},
colors={"provider": Fore.CYAN},
)
@@ -959,10 +978,10 @@ class LLMContentFilter(RelevantContentFilter):
future = executor.submit(
_proceed_with_chunk,
self.llmConfig.provider,
self.llm_config.provider,
prompt,
self.llmConfig.api_token,
self.llmConfig.base_url,
self.llm_config.api_token,
self.llm_config.base_url,
self.extra_args,
)
futures.append((i, future))

View File

@@ -4,12 +4,10 @@ from typing import Any, List, Dict, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import time
import os
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH
from .config import (
DEFAULT_PROVIDER, PROVIDER_MODELS,
CHUNK_TOKEN_THRESHOLD,
DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD,
OVERLAP_RATE,
WORD_TOKEN_RATE,
)
@@ -22,9 +20,7 @@ from .utils import (
extract_xml_data,
split_and_parse_json_objects,
sanitize_input_encode,
chunk_documents,
merge_chunks,
advanced_split,
)
from .models import * # noqa: F403
@@ -38,8 +34,9 @@ from .model_loader import (
calculate_batch_size
)
from .types import LLMConfig
from functools import partial
import math
import numpy as np
import re
from bs4 import BeautifulSoup
@@ -481,8 +478,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
A strategy that uses an LLM to extract meaningful content from the HTML.
Attributes:
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
api_token: The API token for the provider.
llm_config: The LLM configuration object.
instruction: The instruction to use for the LLM model.
schema: Pydantic model schema for structured data.
extraction_type: "block" or "schema".
@@ -490,27 +486,20 @@ class LLMExtractionStrategy(ExtractionStrategy):
overlap_rate: Overlap between chunks.
word_token_rate: Word to token conversion rate.
apply_chunking: Whether to apply chunking.
base_url: The base URL for the API request.
api_base: The base URL for the API request.
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
verbose: Whether to print verbose output.
usages: List of individual token usages.
total_usage: Accumulated token usage.
"""
_UNWANTED_PROPS = {
'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")',
'api_token' : 'Instead, use llmConfig=LlMConfig(api_token="...")',
'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
'provider' : 'Instead, use llm_config=LLMConfig(provider="...")',
'api_token' : 'Instead, use llm_config=LlMConfig(api_token="...")',
'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")',
'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")',
}
def __init__(
self,
llmConfig: 'LLMConfig' = None,
llm_config: 'LLMConfig' = None,
instruction: str = None,
provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None,
base_url: str = None,
api_base: str = None,
schema: Dict = None,
extraction_type="block",
chunk_token_threshold=CHUNK_TOKEN_THRESHOLD,
@@ -519,15 +508,18 @@ class LLMExtractionStrategy(ExtractionStrategy):
apply_chunking=True,
input_format: str = "markdown",
verbose=False,
# Deprecated arguments
provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None,
base_url: str = None,
api_base: str = None,
**kwargs,
):
"""
Initialize the strategy with clustering parameters.
Args:
llmConfig: The LLM configuration object.
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
api_token: The API token for the provider.
llm_config: The LLM configuration object.
instruction: The instruction to use for the LLM model.
schema: Pydantic model schema for structured data.
extraction_type: "block" or "schema".
@@ -535,20 +527,19 @@ class LLMExtractionStrategy(ExtractionStrategy):
overlap_rate: Overlap between chunks.
word_token_rate: Word to token conversion rate.
apply_chunking: Whether to apply chunking.
base_url: The base URL for the API request.
api_base: The base URL for the API request.
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
verbose: Whether to print verbose output.
usages: List of individual token usages.
total_usage: Accumulated token usage.
# Deprecated arguments, will be removed very soon
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
api_token: The API token for the provider.
base_url: The base URL for the API request.
api_base: The base URL for the API request.
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
"""
super().__init__( input_format=input_format, **kwargs)
self.llmConfig = llmConfig
self.provider = provider
self.api_token = api_token
self.base_url = base_url
self.api_base = api_base
self.llm_config = llm_config
self.instruction = instruction
self.extract_type = extraction_type
self.schema = schema
@@ -565,6 +556,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
self.usages = [] # Store individual usages
self.total_usage = TokenUsage() # Accumulated usage
self.provider = provider
self.api_token = api_token
self.base_url = base_url
self.api_base = api_base
def __setattr__(self, name, value):
"""Handle attribute setting."""
@@ -618,10 +614,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
)
response = perform_completion_with_backoff(
self.llmConfig.provider,
self.llm_config.provider,
prompt_with_variables,
self.llmConfig.api_token,
base_url=self.llmConfig.base_url,
self.llm_config.api_token,
base_url=self.llm_config.base_url,
extra_args=self.extra_args,
) # , json_response=self.extract_type == "schema")
# Track usage
@@ -701,7 +697,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
overlap=int(self.chunk_token_threshold * self.overlap_rate),
)
extracted_content = []
if self.llmConfig.provider.startswith("groq/"):
if self.llm_config.provider.startswith("groq/"):
# Sequential processing with a delay
for ix, section in enumerate(merged_sections):
extract_func = partial(self.extract, url)
@@ -1043,8 +1039,8 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
pass
_GENERATE_SCHEMA_UNWANTED_PROPS = {
'provider': 'Instead, use llmConfig=LlmConfig(provider="...")',
'api_token': 'Instead, use llmConfig=LlMConfig(api_token="...")',
'provider': 'Instead, use llm_config=LLMConfig(provider="...")',
'api_token': 'Instead, use llm_config=LlMConfig(api_token="...")',
}
@staticmethod
@@ -1053,7 +1049,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
schema_type: str = "CSS", # or XPATH
query: str = None,
target_json_example: str = None,
llmConfig: 'LLMConfig' = None,
llm_config: 'LLMConfig' = None,
provider: str = None,
api_token: str = None,
**kwargs
@@ -1066,7 +1062,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
query (str, optional): Natural language description of what data to extract
provider (str): Legacy Parameter. LLM provider to use
api_token (str): Legacy Parameter. API token for LLM provider
llmConfig (LlmConfig): LLM configuration object
llm_config (LLMConfig): LLM configuration object
prompt (str, optional): Custom prompt template to use
**kwargs: Additional args passed to perform_completion_with_backoff
@@ -1130,10 +1126,10 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
try:
# Call LLM with backoff handling
response = perform_completion_with_backoff(
provider=llmConfig.provider,
provider=llm_config.provider,
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
json_response = True,
api_token=llmConfig.api_token,
api_token=llm_config.api_token,
**kwargs
)

View File

@@ -1,9 +1,9 @@
from abc import ABC, abstractmethod
from tabnanny import verbose
from typing import Optional, Dict, Any, Tuple
from .models import MarkdownGenerationResult
from .html2text import CustomHTML2Text
from .content_filter_strategy import RelevantContentFilter
from .types import RelevantContentFilter
# from .content_filter_strategy import RelevantContentFilter
import re
from urllib.parse import urljoin

View File

@@ -1,14 +1,181 @@
from typing import TYPE_CHECKING, Union
AsyncWebCrawler = Union['AsyncWebCrawlerType'] # Note the string literal
CrawlerRunConfig = Union['CrawlerRunConfigType']
# Logger types
AsyncLoggerBase = Union['AsyncLoggerBaseType']
AsyncLogger = Union['AsyncLoggerType']
# Crawler core types
AsyncWebCrawler = Union['AsyncWebCrawlerType']
CacheMode = Union['CacheModeType']
CrawlResult = Union['CrawlResultType']
CrawlerHub = Union['CrawlerHubType']
BrowserProfiler = Union['BrowserProfilerType']
# Configuration types
BrowserConfig = Union['BrowserConfigType']
CrawlerRunConfig = Union['CrawlerRunConfigType']
HTTPCrawlerConfig = Union['HTTPCrawlerConfigType']
LLMConfig = Union['LLMConfigType']
# Content scraping types
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
WebScrapingStrategy = Union['WebScrapingStrategyType']
LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
# Proxy types
ProxyRotationStrategy = Union['ProxyRotationStrategyType']
RoundRobinProxyStrategy = Union['RoundRobinProxyStrategyType']
# Extraction types
ExtractionStrategy = Union['ExtractionStrategyType']
LLMExtractionStrategy = Union['LLMExtractionStrategyType']
CosineStrategy = Union['CosineStrategyType']
JsonCssExtractionStrategy = Union['JsonCssExtractionStrategyType']
JsonXPathExtractionStrategy = Union['JsonXPathExtractionStrategyType']
# Chunking types
ChunkingStrategy = Union['ChunkingStrategyType']
RegexChunking = Union['RegexChunkingType']
# Markdown generation types
DefaultMarkdownGenerator = Union['DefaultMarkdownGeneratorType']
MarkdownGenerationResult = Union['MarkdownGenerationResultType']
# Content filter types
RelevantContentFilter = Union['RelevantContentFilterType']
PruningContentFilter = Union['PruningContentFilterType']
BM25ContentFilter = Union['BM25ContentFilterType']
LLMContentFilter = Union['LLMContentFilterType']
# Dispatcher types
BaseDispatcher = Union['BaseDispatcherType']
MemoryAdaptiveDispatcher = Union['MemoryAdaptiveDispatcherType']
SemaphoreDispatcher = Union['SemaphoreDispatcherType']
RateLimiter = Union['RateLimiterType']
CrawlerMonitor = Union['CrawlerMonitorType']
DisplayMode = Union['DisplayModeType']
RunManyReturn = Union['RunManyReturnType']
# Docker client
Crawl4aiDockerClient = Union['Crawl4aiDockerClientType']
# Deep crawling types
DeepCrawlStrategy = Union['DeepCrawlStrategyType']
BFSDeepCrawlStrategy = Union['BFSDeepCrawlStrategyType']
FilterChain = Union['FilterChainType']
ContentTypeFilter = Union['ContentTypeFilterType']
DomainFilter = Union['DomainFilterType']
URLFilter = Union['URLFilterType']
FilterStats = Union['FilterStatsType']
SEOFilter = Union['SEOFilterType']
KeywordRelevanceScorer = Union['KeywordRelevanceScorerType']
URLScorer = Union['URLScorerType']
CompositeScorer = Union['CompositeScorerType']
DomainAuthorityScorer = Union['DomainAuthorityScorerType']
FreshnessScorer = Union['FreshnessScorerType']
PathDepthScorer = Union['PathDepthScorerType']
BestFirstCrawlingStrategy = Union['BestFirstCrawlingStrategyType']
DFSDeepCrawlStrategy = Union['DFSDeepCrawlStrategyType']
DeepCrawlDecorator = Union['DeepCrawlDecoratorType']
# Only import types during type checking to avoid circular imports
if TYPE_CHECKING:
from . import (
# Logger imports
from .async_logger import (
AsyncLoggerBase as AsyncLoggerBaseType,
AsyncLogger as AsyncLoggerType,
)
# Crawler core imports
from .async_webcrawler import (
AsyncWebCrawler as AsyncWebCrawlerType,
CacheMode as CacheModeType,
)
from .models import CrawlResult as CrawlResultType
from .hub import CrawlerHub as CrawlerHubType
from .browser_profiler import BrowserProfiler as BrowserProfilerType
# Configuration imports
from .async_configs import (
BrowserConfig as BrowserConfigType,
CrawlerRunConfig as CrawlerRunConfigType,
CrawlResult as CrawlResultType,
HTTPCrawlerConfig as HTTPCrawlerConfigType,
LLMConfig as LLMConfigType,
)
# Content scraping imports
from .content_scraping_strategy import (
ContentScrapingStrategy as ContentScrapingStrategyType,
WebScrapingStrategy as WebScrapingStrategyType,
LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
)
# Proxy imports
from .proxy_strategy import (
ProxyRotationStrategy as ProxyRotationStrategyType,
RoundRobinProxyStrategy as RoundRobinProxyStrategyType,
)
# Extraction imports
from .extraction_strategy import (
ExtractionStrategy as ExtractionStrategyType,
LLMExtractionStrategy as LLMExtractionStrategyType,
CosineStrategy as CosineStrategyType,
JsonCssExtractionStrategy as JsonCssExtractionStrategyType,
JsonXPathExtractionStrategy as JsonXPathExtractionStrategyType,
)
# Chunking imports
from .chunking_strategy import (
ChunkingStrategy as ChunkingStrategyType,
RegexChunking as RegexChunkingType,
)
# Markdown generation imports
from .markdown_generation_strategy import (
DefaultMarkdownGenerator as DefaultMarkdownGeneratorType,
)
from .models import MarkdownGenerationResult as MarkdownGenerationResultType
# Content filter imports
from .content_filter_strategy import (
RelevantContentFilter as RelevantContentFilterType,
PruningContentFilter as PruningContentFilterType,
BM25ContentFilter as BM25ContentFilterType,
LLMContentFilter as LLMContentFilterType,
)
# Dispatcher imports
from .async_dispatcher import (
BaseDispatcher as BaseDispatcherType,
MemoryAdaptiveDispatcher as MemoryAdaptiveDispatcherType,
SemaphoreDispatcher as SemaphoreDispatcherType,
RateLimiter as RateLimiterType,
CrawlerMonitor as CrawlerMonitorType,
DisplayMode as DisplayModeType,
RunManyReturn as RunManyReturnType,
)
# Docker client
from .docker_client import Crawl4aiDockerClient as Crawl4aiDockerClientType
# Deep crawling imports
from .deep_crawling import (
DeepCrawlStrategy as DeepCrawlStrategyType,
BFSDeepCrawlStrategy as BFSDeepCrawlStrategyType,
FilterChain as FilterChainType,
ContentTypeFilter as ContentTypeFilterType,
DomainFilter as DomainFilterType,
URLFilter as URLFilterType,
FilterStats as FilterStatsType,
SEOFilter as SEOFilterType,
KeywordRelevanceScorer as KeywordRelevanceScorerType,
URLScorer as URLScorerType,
CompositeScorer as CompositeScorerType,
DomainAuthorityScorer as DomainAuthorityScorerType,
FreshnessScorer as FreshnessScorerType,
PathDepthScorer as PathDepthScorerType,
BestFirstCrawlingStrategy as BestFirstCrawlingStrategyType,
DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType,
DeepCrawlDecorator as DeepCrawlDecoratorType,
)