Merge branch 'vr0.5.0.post1' into next

This commit is contained in:
UncleCode
2025-03-05 14:17:19 +08:00
33 changed files with 362 additions and 174 deletions

View File

@@ -420,7 +420,7 @@ if __name__ == "__main__":
```python ```python
import os import os
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@@ -436,7 +436,7 @@ async def main():
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
# Here you can use any provider that Litellm library supports, for instance: ollama/qwen2 # Here you can use any provider that Litellm library supports, for instance: ollama/qwen2
# provider="ollama/qwen2", api_token="no-token", # provider="ollama/qwen2", api_token="no-token",
llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')), llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
schema=OpenAIModelFee.schema(), schema=OpenAIModelFee.schema(),
extraction_type="schema", extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.

View File

@@ -2,7 +2,8 @@
import warnings import warnings
from .async_webcrawler import AsyncWebCrawler, CacheMode from .async_webcrawler import AsyncWebCrawler, CacheMode
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig
from .content_scraping_strategy import ( from .content_scraping_strategy import (
ContentScrapingStrategy, ContentScrapingStrategy,
WebScrapingStrategy, WebScrapingStrategy,
@@ -68,6 +69,7 @@ __all__ = [
"AsyncLogger", "AsyncLogger",
"AsyncWebCrawler", "AsyncWebCrawler",
"BrowserProfiler", "BrowserProfiler",
"LLMConfig",
"DeepCrawlStrategy", "DeepCrawlStrategy",
"BFSDeepCrawlStrategy", "BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy", "BestFirstCrawlingStrategy",

View File

@@ -13,13 +13,15 @@ from .config import (
from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
from .extraction_strategy import ExtractionStrategy from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy from .markdown_generation_strategy import MarkdownGenerationStrategy
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
from .deep_crawling import DeepCrawlStrategy from .deep_crawling import DeepCrawlStrategy
from typing import Union, List
from .cache_context import CacheMode from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy from .proxy_strategy import ProxyRotationStrategy
from typing import Union, List
import inspect import inspect
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
from enum import Enum from enum import Enum
@@ -1042,7 +1044,7 @@ class CrawlerRunConfig():
return CrawlerRunConfig.from_kwargs(config_dict) return CrawlerRunConfig.from_kwargs(config_dict)
class LlmConfig: class LLMConfig:
def __init__( def __init__(
self, self,
provider: str = DEFAULT_PROVIDER, provider: str = DEFAULT_PROVIDER,
@@ -1063,8 +1065,8 @@ class LlmConfig:
@staticmethod @staticmethod
def from_kwargs(kwargs: dict) -> "LlmConfig": def from_kwargs(kwargs: dict) -> "LLMConfig":
return LlmConfig( return LLMConfig(
provider=kwargs.get("provider", DEFAULT_PROVIDER), provider=kwargs.get("provider", DEFAULT_PROVIDER),
api_token=kwargs.get("api_token"), api_token=kwargs.get("api_token"),
base_url=kwargs.get("base_url"), base_url=kwargs.get("base_url"),
@@ -1084,8 +1086,8 @@ class LlmConfig:
**kwargs: Key-value pairs of configuration options to update **kwargs: Key-value pairs of configuration options to update
Returns: Returns:
LLMConfig: A new instance with the specified updates LLMConfig: A new instance with the specified updates
""" """
config_dict = self.to_dict() config_dict = self.to_dict()
config_dict.update(kwargs) config_dict.update(kwargs)
return LlmConfig.from_kwargs(config_dict) return LLMConfig.from_kwargs(config_dict)

View File

@@ -1,9 +1,7 @@
import click import click
import os import os
import time import time
import datetime
import sys
import shutil
import humanize import humanize
from typing import Dict, Any, Optional, List from typing import Dict, Any, Optional, List
import json import json
@@ -13,7 +11,6 @@ from rich.console import Console
from rich.table import Table from rich.table import Table
from rich.panel import Panel from rich.panel import Panel
from rich.prompt import Prompt, Confirm from rich.prompt import Prompt, Confirm
from rich.style import Style
from crawl4ai import ( from crawl4ai import (
CacheMode, CacheMode,
@@ -26,12 +23,12 @@ from crawl4ai import (
JsonXPathExtractionStrategy, JsonXPathExtractionStrategy,
BM25ContentFilter, BM25ContentFilter,
PruningContentFilter, PruningContentFilter,
BrowserProfiler BrowserProfiler,
LLMConfig
) )
from litellm import completion from litellm import completion
from pathlib import Path from pathlib import Path
from crawl4ai.async_configs import LlmConfig
# Initialize rich console # Initialize rich console
console = Console() console = Console()
@@ -647,7 +644,7 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
raise click.ClickException("LLM provider and API token are required for LLM extraction") raise click.ClickException("LLM provider and API token are required for LLM extraction")
crawler_cfg.extraction_strategy = LLMExtractionStrategy( crawler_cfg.extraction_strategy = LLMExtractionStrategy(
llmConfig=LlmConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]), llm_config=LLMConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]),
instruction=extract_conf["instruction"], instruction=extract_conf["instruction"],
schema=schema_data, schema=schema_data,
**extract_conf.get("params", {}) **extract_conf.get("params", {})

View File

@@ -16,13 +16,13 @@ from .utils import (
extract_xml_data, extract_xml_data,
merge_chunks, merge_chunks,
) )
from .types import LLMConfig
from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
import math import math
from snowballstemmer import stemmer from snowballstemmer import stemmer
from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE, PROVIDER_MODELS
from .models import TokenUsage from .models import TokenUsage
from .prompts import PROMPT_FILTER_CONTENT from .prompts import PROMPT_FILTER_CONTENT
import os
import json import json
import hashlib import hashlib
from pathlib import Path from pathlib import Path
@@ -770,37 +770,56 @@ class PruningContentFilter(RelevantContentFilter):
class LLMContentFilter(RelevantContentFilter): class LLMContentFilter(RelevantContentFilter):
"""Content filtering using LLMs to generate relevant markdown.""" """Content filtering using LLMs to generate relevant markdown.
How it works:
1. Extracts page metadata with fallbacks.
2. Extracts text chunks from the body element.
3. Applies LLMs to generate markdown for each chunk.
4. Filters out chunks below the threshold.
5. Sorts chunks by score in descending order.
6. Returns the top N chunks.
Attributes:
llm_config (LLMConfig): LLM configuration object.
instruction (str): Instruction for LLM markdown generation
chunk_token_threshold (int): Chunk token threshold for splitting (default: 1e9).
overlap_rate (float): Overlap rate for chunking (default: 0.5).
word_token_rate (float): Word token rate for chunking (default: 0.2).
verbose (bool): Enable verbose logging (default: False).
logger (AsyncLogger): Custom logger for LLM operations (optional).
"""
_UNWANTED_PROPS = { _UNWANTED_PROPS = {
'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")', 'provider' : 'Instead, use llm_config=LLMConfig(provider="...")',
'api_token' : 'Instead, use llmConfig=LlMConfig(api_token="...")', 'api_token' : 'Instead, use llm_config=LLMConfig(api_token="...")',
'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")', 'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")',
'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")', 'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")',
} }
def __init__( def __init__(
self, self,
provider: str = DEFAULT_PROVIDER, llm_config: "LLMConfig" = None,
api_token: Optional[str] = None,
llmConfig: "LlmConfig" = None,
instruction: str = None, instruction: str = None,
chunk_token_threshold: int = int(1e9), chunk_token_threshold: int = int(1e9),
overlap_rate: float = OVERLAP_RATE, overlap_rate: float = OVERLAP_RATE,
word_token_rate: float = WORD_TOKEN_RATE, word_token_rate: float = WORD_TOKEN_RATE,
base_url: Optional[str] = None,
api_base: Optional[str] = None,
extra_args: Dict = None,
# char_token_rate: float = WORD_TOKEN_RATE * 5, # char_token_rate: float = WORD_TOKEN_RATE * 5,
# chunk_mode: str = "char", # chunk_mode: str = "char",
verbose: bool = False, verbose: bool = False,
logger: Optional[AsyncLogger] = None, logger: Optional[AsyncLogger] = None,
ignore_cache: bool = True, ignore_cache: bool = True,
# Deprecated properties
provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None,
base_url: Optional[str] = None,
api_base: Optional[str] = None,
extra_args: Dict = None,
): ):
super().__init__(None) super().__init__(None)
self.provider = provider self.provider = provider
self.api_token = api_token self.api_token = api_token
self.base_url = base_url or api_base self.base_url = base_url or api_base
self.llmConfig = llmConfig self.llm_config = llm_config
self.instruction = instruction self.instruction = instruction
self.chunk_token_threshold = chunk_token_threshold self.chunk_token_threshold = chunk_token_threshold
self.overlap_rate = overlap_rate self.overlap_rate = overlap_rate
@@ -872,7 +891,7 @@ class LLMContentFilter(RelevantContentFilter):
self.logger.info( self.logger.info(
"Starting LLM markdown content filtering process", "Starting LLM markdown content filtering process",
tag="LLM", tag="LLM",
params={"provider": self.llmConfig.provider}, params={"provider": self.llm_config.provider},
colors={"provider": Fore.CYAN}, colors={"provider": Fore.CYAN},
) )
@@ -959,10 +978,10 @@ class LLMContentFilter(RelevantContentFilter):
future = executor.submit( future = executor.submit(
_proceed_with_chunk, _proceed_with_chunk,
self.llmConfig.provider, self.llm_config.provider,
prompt, prompt,
self.llmConfig.api_token, self.llm_config.api_token,
self.llmConfig.base_url, self.llm_config.base_url,
self.extra_args, self.extra_args,
) )
futures.append((i, future)) futures.append((i, future))

View File

@@ -4,12 +4,10 @@ from typing import Any, List, Dict, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
import json import json
import time import time
import os
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH
from .config import ( from .config import (
DEFAULT_PROVIDER, PROVIDER_MODELS, DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD,
CHUNK_TOKEN_THRESHOLD,
OVERLAP_RATE, OVERLAP_RATE,
WORD_TOKEN_RATE, WORD_TOKEN_RATE,
) )
@@ -22,9 +20,7 @@ from .utils import (
extract_xml_data, extract_xml_data,
split_and_parse_json_objects, split_and_parse_json_objects,
sanitize_input_encode, sanitize_input_encode,
chunk_documents,
merge_chunks, merge_chunks,
advanced_split,
) )
from .models import * # noqa: F403 from .models import * # noqa: F403
@@ -38,8 +34,9 @@ from .model_loader import (
calculate_batch_size calculate_batch_size
) )
from .types import LLMConfig
from functools import partial from functools import partial
import math
import numpy as np import numpy as np
import re import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -481,8 +478,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
A strategy that uses an LLM to extract meaningful content from the HTML. A strategy that uses an LLM to extract meaningful content from the HTML.
Attributes: Attributes:
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3". llm_config: The LLM configuration object.
api_token: The API token for the provider.
instruction: The instruction to use for the LLM model. instruction: The instruction to use for the LLM model.
schema: Pydantic model schema for structured data. schema: Pydantic model schema for structured data.
extraction_type: "block" or "schema". extraction_type: "block" or "schema".
@@ -490,27 +486,20 @@ class LLMExtractionStrategy(ExtractionStrategy):
overlap_rate: Overlap between chunks. overlap_rate: Overlap between chunks.
word_token_rate: Word to token conversion rate. word_token_rate: Word to token conversion rate.
apply_chunking: Whether to apply chunking. apply_chunking: Whether to apply chunking.
base_url: The base URL for the API request.
api_base: The base URL for the API request.
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
verbose: Whether to print verbose output. verbose: Whether to print verbose output.
usages: List of individual token usages. usages: List of individual token usages.
total_usage: Accumulated token usage. total_usage: Accumulated token usage.
""" """
_UNWANTED_PROPS = { _UNWANTED_PROPS = {
'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")', 'provider' : 'Instead, use llm_config=LLMConfig(provider="...")',
'api_token' : 'Instead, use llmConfig=LlMConfig(api_token="...")', 'api_token' : 'Instead, use llm_config=LLMConfig(api_token="...")',
'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")', 'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")',
'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")', 'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")',
} }
def __init__( def __init__(
self, self,
llmConfig: 'LLMConfig' = None, llm_config: 'LLMConfig' = None,
instruction: str = None, instruction: str = None,
provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None,
base_url: str = None,
api_base: str = None,
schema: Dict = None, schema: Dict = None,
extraction_type="block", extraction_type="block",
chunk_token_threshold=CHUNK_TOKEN_THRESHOLD, chunk_token_threshold=CHUNK_TOKEN_THRESHOLD,
@@ -519,15 +508,18 @@ class LLMExtractionStrategy(ExtractionStrategy):
apply_chunking=True, apply_chunking=True,
input_format: str = "markdown", input_format: str = "markdown",
verbose=False, verbose=False,
# Deprecated arguments
provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None,
base_url: str = None,
api_base: str = None,
**kwargs, **kwargs,
): ):
""" """
Initialize the strategy with clustering parameters. Initialize the strategy with clustering parameters.
Args: Args:
llmConfig: The LLM configuration object. llm_config: The LLM configuration object.
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
api_token: The API token for the provider.
instruction: The instruction to use for the LLM model. instruction: The instruction to use for the LLM model.
schema: Pydantic model schema for structured data. schema: Pydantic model schema for structured data.
extraction_type: "block" or "schema". extraction_type: "block" or "schema".
@@ -535,20 +527,19 @@ class LLMExtractionStrategy(ExtractionStrategy):
overlap_rate: Overlap between chunks. overlap_rate: Overlap between chunks.
word_token_rate: Word to token conversion rate. word_token_rate: Word to token conversion rate.
apply_chunking: Whether to apply chunking. apply_chunking: Whether to apply chunking.
base_url: The base URL for the API request.
api_base: The base URL for the API request.
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
verbose: Whether to print verbose output. verbose: Whether to print verbose output.
usages: List of individual token usages. usages: List of individual token usages.
total_usage: Accumulated token usage. total_usage: Accumulated token usage.
# Deprecated arguments, will be removed very soon
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
api_token: The API token for the provider.
base_url: The base URL for the API request.
api_base: The base URL for the API request.
extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
""" """
super().__init__( input_format=input_format, **kwargs) super().__init__( input_format=input_format, **kwargs)
self.llmConfig = llmConfig self.llm_config = llm_config
self.provider = provider
self.api_token = api_token
self.base_url = base_url
self.api_base = api_base
self.instruction = instruction self.instruction = instruction
self.extract_type = extraction_type self.extract_type = extraction_type
self.schema = schema self.schema = schema
@@ -565,6 +556,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
self.usages = [] # Store individual usages self.usages = [] # Store individual usages
self.total_usage = TokenUsage() # Accumulated usage self.total_usage = TokenUsage() # Accumulated usage
self.provider = provider
self.api_token = api_token
self.base_url = base_url
self.api_base = api_base
def __setattr__(self, name, value): def __setattr__(self, name, value):
"""Handle attribute setting.""" """Handle attribute setting."""
@@ -618,10 +614,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
) )
response = perform_completion_with_backoff( response = perform_completion_with_backoff(
self.llmConfig.provider, self.llm_config.provider,
prompt_with_variables, prompt_with_variables,
self.llmConfig.api_token, self.llm_config.api_token,
base_url=self.llmConfig.base_url, base_url=self.llm_config.base_url,
extra_args=self.extra_args, extra_args=self.extra_args,
) # , json_response=self.extract_type == "schema") ) # , json_response=self.extract_type == "schema")
# Track usage # Track usage
@@ -701,7 +697,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
overlap=int(self.chunk_token_threshold * self.overlap_rate), overlap=int(self.chunk_token_threshold * self.overlap_rate),
) )
extracted_content = [] extracted_content = []
if self.llmConfig.provider.startswith("groq/"): if self.llm_config.provider.startswith("groq/"):
# Sequential processing with a delay # Sequential processing with a delay
for ix, section in enumerate(merged_sections): for ix, section in enumerate(merged_sections):
extract_func = partial(self.extract, url) extract_func = partial(self.extract, url)
@@ -1043,8 +1039,8 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
pass pass
_GENERATE_SCHEMA_UNWANTED_PROPS = { _GENERATE_SCHEMA_UNWANTED_PROPS = {
'provider': 'Instead, use llmConfig=LlmConfig(provider="...")', 'provider': 'Instead, use llm_config=LLMConfig(provider="...")',
'api_token': 'Instead, use llmConfig=LlMConfig(api_token="...")', 'api_token': 'Instead, use llm_config=LLMConfig(api_token="...")',
} }
@staticmethod @staticmethod
@@ -1053,7 +1049,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
schema_type: str = "CSS", # or XPATH schema_type: str = "CSS", # or XPATH
query: str = None, query: str = None,
target_json_example: str = None, target_json_example: str = None,
llmConfig: 'LLMConfig' = None, llm_config: 'LLMConfig' = None,
provider: str = None, provider: str = None,
api_token: str = None, api_token: str = None,
**kwargs **kwargs
@@ -1066,7 +1062,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
query (str, optional): Natural language description of what data to extract query (str, optional): Natural language description of what data to extract
provider (str): Legacy Parameter. LLM provider to use provider (str): Legacy Parameter. LLM provider to use
api_token (str): Legacy Parameter. API token for LLM provider api_token (str): Legacy Parameter. API token for LLM provider
llmConfig (LlmConfig): LLM configuration object llm_config (LLMConfig): LLM configuration object
prompt (str, optional): Custom prompt template to use prompt (str, optional): Custom prompt template to use
**kwargs: Additional args passed to perform_completion_with_backoff **kwargs: Additional args passed to perform_completion_with_backoff
@@ -1130,10 +1126,10 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
try: try:
# Call LLM with backoff handling # Call LLM with backoff handling
response = perform_completion_with_backoff( response = perform_completion_with_backoff(
provider=llmConfig.provider, provider=llm_config.provider,
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]), prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
json_response = True, json_response = True,
api_token=llmConfig.api_token, api_token=llm_config.api_token,
**kwargs **kwargs
) )

View File

@@ -1,9 +1,9 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from tabnanny import verbose
from typing import Optional, Dict, Any, Tuple from typing import Optional, Dict, Any, Tuple
from .models import MarkdownGenerationResult from .models import MarkdownGenerationResult
from .html2text import CustomHTML2Text from .html2text import CustomHTML2Text
from .content_filter_strategy import RelevantContentFilter from .types import RelevantContentFilter
# from .content_filter_strategy import RelevantContentFilter
import re import re
from urllib.parse import urljoin from urllib.parse import urljoin

View File

@@ -1,14 +1,181 @@
from typing import TYPE_CHECKING, Union from typing import TYPE_CHECKING, Union
AsyncWebCrawler = Union['AsyncWebCrawlerType'] # Note the string literal # Logger types
CrawlerRunConfig = Union['CrawlerRunConfigType'] AsyncLoggerBase = Union['AsyncLoggerBaseType']
AsyncLogger = Union['AsyncLoggerType']
# Crawler core types
AsyncWebCrawler = Union['AsyncWebCrawlerType']
CacheMode = Union['CacheModeType']
CrawlResult = Union['CrawlResultType'] CrawlResult = Union['CrawlResultType']
CrawlerHub = Union['CrawlerHubType']
BrowserProfiler = Union['BrowserProfilerType']
# Configuration types
BrowserConfig = Union['BrowserConfigType']
CrawlerRunConfig = Union['CrawlerRunConfigType']
HTTPCrawlerConfig = Union['HTTPCrawlerConfigType']
LLMConfig = Union['LLMConfigType']
# Content scraping types
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
WebScrapingStrategy = Union['WebScrapingStrategyType']
LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
# Proxy types
ProxyRotationStrategy = Union['ProxyRotationStrategyType']
RoundRobinProxyStrategy = Union['RoundRobinProxyStrategyType']
# Extraction types
ExtractionStrategy = Union['ExtractionStrategyType']
LLMExtractionStrategy = Union['LLMExtractionStrategyType']
CosineStrategy = Union['CosineStrategyType']
JsonCssExtractionStrategy = Union['JsonCssExtractionStrategyType']
JsonXPathExtractionStrategy = Union['JsonXPathExtractionStrategyType']
# Chunking types
ChunkingStrategy = Union['ChunkingStrategyType']
RegexChunking = Union['RegexChunkingType']
# Markdown generation types
DefaultMarkdownGenerator = Union['DefaultMarkdownGeneratorType']
MarkdownGenerationResult = Union['MarkdownGenerationResultType']
# Content filter types
RelevantContentFilter = Union['RelevantContentFilterType']
PruningContentFilter = Union['PruningContentFilterType']
BM25ContentFilter = Union['BM25ContentFilterType']
LLMContentFilter = Union['LLMContentFilterType']
# Dispatcher types
BaseDispatcher = Union['BaseDispatcherType']
MemoryAdaptiveDispatcher = Union['MemoryAdaptiveDispatcherType']
SemaphoreDispatcher = Union['SemaphoreDispatcherType']
RateLimiter = Union['RateLimiterType']
CrawlerMonitor = Union['CrawlerMonitorType']
DisplayMode = Union['DisplayModeType']
RunManyReturn = Union['RunManyReturnType'] RunManyReturn = Union['RunManyReturnType']
# Docker client
Crawl4aiDockerClient = Union['Crawl4aiDockerClientType']
# Deep crawling types
DeepCrawlStrategy = Union['DeepCrawlStrategyType']
BFSDeepCrawlStrategy = Union['BFSDeepCrawlStrategyType']
FilterChain = Union['FilterChainType']
ContentTypeFilter = Union['ContentTypeFilterType']
DomainFilter = Union['DomainFilterType']
URLFilter = Union['URLFilterType']
FilterStats = Union['FilterStatsType']
SEOFilter = Union['SEOFilterType']
KeywordRelevanceScorer = Union['KeywordRelevanceScorerType']
URLScorer = Union['URLScorerType']
CompositeScorer = Union['CompositeScorerType']
DomainAuthorityScorer = Union['DomainAuthorityScorerType']
FreshnessScorer = Union['FreshnessScorerType']
PathDepthScorer = Union['PathDepthScorerType']
BestFirstCrawlingStrategy = Union['BestFirstCrawlingStrategyType']
DFSDeepCrawlStrategy = Union['DFSDeepCrawlStrategyType']
DeepCrawlDecorator = Union['DeepCrawlDecoratorType']
# Only import types during type checking to avoid circular imports
if TYPE_CHECKING: if TYPE_CHECKING:
from . import ( # Logger imports
from .async_logger import (
AsyncLoggerBase as AsyncLoggerBaseType,
AsyncLogger as AsyncLoggerType,
)
# Crawler core imports
from .async_webcrawler import (
AsyncWebCrawler as AsyncWebCrawlerType, AsyncWebCrawler as AsyncWebCrawlerType,
CacheMode as CacheModeType,
)
from .models import CrawlResult as CrawlResultType
from .hub import CrawlerHub as CrawlerHubType
from .browser_profiler import BrowserProfiler as BrowserProfilerType
# Configuration imports
from .async_configs import (
BrowserConfig as BrowserConfigType,
CrawlerRunConfig as CrawlerRunConfigType, CrawlerRunConfig as CrawlerRunConfigType,
CrawlResult as CrawlResultType, HTTPCrawlerConfig as HTTPCrawlerConfigType,
LLMConfig as LLMConfigType,
)
# Content scraping imports
from .content_scraping_strategy import (
ContentScrapingStrategy as ContentScrapingStrategyType,
WebScrapingStrategy as WebScrapingStrategyType,
LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
)
# Proxy imports
from .proxy_strategy import (
ProxyRotationStrategy as ProxyRotationStrategyType,
RoundRobinProxyStrategy as RoundRobinProxyStrategyType,
)
# Extraction imports
from .extraction_strategy import (
ExtractionStrategy as ExtractionStrategyType,
LLMExtractionStrategy as LLMExtractionStrategyType,
CosineStrategy as CosineStrategyType,
JsonCssExtractionStrategy as JsonCssExtractionStrategyType,
JsonXPathExtractionStrategy as JsonXPathExtractionStrategyType,
)
# Chunking imports
from .chunking_strategy import (
ChunkingStrategy as ChunkingStrategyType,
RegexChunking as RegexChunkingType,
)
# Markdown generation imports
from .markdown_generation_strategy import (
DefaultMarkdownGenerator as DefaultMarkdownGeneratorType,
)
from .models import MarkdownGenerationResult as MarkdownGenerationResultType
# Content filter imports
from .content_filter_strategy import (
RelevantContentFilter as RelevantContentFilterType,
PruningContentFilter as PruningContentFilterType,
BM25ContentFilter as BM25ContentFilterType,
LLMContentFilter as LLMContentFilterType,
)
# Dispatcher imports
from .async_dispatcher import (
BaseDispatcher as BaseDispatcherType,
MemoryAdaptiveDispatcher as MemoryAdaptiveDispatcherType,
SemaphoreDispatcher as SemaphoreDispatcherType,
RateLimiter as RateLimiterType,
CrawlerMonitor as CrawlerMonitorType,
DisplayMode as DisplayModeType,
RunManyReturn as RunManyReturnType, RunManyReturn as RunManyReturnType,
) )
# Docker client
from .docker_client import Crawl4aiDockerClient as Crawl4aiDockerClientType
# Deep crawling imports
from .deep_crawling import (
DeepCrawlStrategy as DeepCrawlStrategyType,
BFSDeepCrawlStrategy as BFSDeepCrawlStrategyType,
FilterChain as FilterChainType,
ContentTypeFilter as ContentTypeFilterType,
DomainFilter as DomainFilterType,
URLFilter as URLFilterType,
FilterStats as FilterStatsType,
SEOFilter as SEOFilterType,
KeywordRelevanceScorer as KeywordRelevanceScorerType,
URLScorer as URLScorerType,
CompositeScorer as CompositeScorerType,
DomainAuthorityScorer as DomainAuthorityScorerType,
FreshnessScorer as FreshnessScorerType,
PathDepthScorer as PathDepthScorerType,
BestFirstCrawlingStrategy as BestFirstCrawlingStrategyType,
DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType,
DeepCrawlDecorator as DeepCrawlDecoratorType,
)

View File

@@ -18,7 +18,8 @@ from crawl4ai import (
CacheMode, CacheMode,
BrowserConfig, BrowserConfig,
MemoryAdaptiveDispatcher, MemoryAdaptiveDispatcher,
RateLimiter RateLimiter,
LLMConfig
) )
from crawl4ai.utils import perform_completion_with_backoff from crawl4ai.utils import perform_completion_with_backoff
from crawl4ai.content_filter_strategy import ( from crawl4ai.content_filter_strategy import (
@@ -103,8 +104,10 @@ async def process_llm_extraction(
else: else:
api_key = os.environ.get(config["llm"].get("api_key_env", None), "") api_key = os.environ.get(config["llm"].get("api_key_env", None), "")
llm_strategy = LLMExtractionStrategy( llm_strategy = LLMExtractionStrategy(
provider=config["llm"]["provider"], llm_config=LLMConfig(
api_token=api_key, provider=config["llm"]["provider"],
api_token=api_key
),
instruction=instruction, instruction=instruction,
schema=json.loads(schema) if schema else None, schema=json.loads(schema) if schema else None,
) )
@@ -164,8 +167,10 @@ async def handle_markdown_request(
FilterType.FIT: PruningContentFilter(), FilterType.FIT: PruningContentFilter(),
FilterType.BM25: BM25ContentFilter(user_query=query or ""), FilterType.BM25: BM25ContentFilter(user_query=query or ""),
FilterType.LLM: LLMContentFilter( FilterType.LLM: LLMContentFilter(
provider=config["llm"]["provider"], llm_config=LLMConfig(
api_token=os.environ.get(config["llm"].get("api_key_env", None), ""), provider=config["llm"]["provider"],
api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
),
instruction=query or "Extract main content" instruction=query or "Extract main content"
) )
}[filter_type] }[filter_type]

View File

@@ -3,7 +3,7 @@ app:
title: "Crawl4AI API" title: "Crawl4AI API"
version: "1.0.0" version: "1.0.0"
host: "0.0.0.0" host: "0.0.0.0"
port: 8000 port: 8020
reload: True reload: True
timeout_keep_alive: 300 timeout_keep_alive: 300
@@ -38,8 +38,8 @@ rate_limiting:
# Security Configuration # Security Configuration
security: security:
enabled: true enabled: false
jwt_enabled: true jwt_enabled: false
https_redirect: false https_redirect: false
trusted_hosts: ["*"] trusted_hosts: ["*"]
headers: headers:

View File

@@ -92,7 +92,7 @@ async def get_markdown(
f: FilterType = FilterType.FIT, f: FilterType = FilterType.FIT,
q: Optional[str] = None, q: Optional[str] = None,
c: Optional[str] = "0", c: Optional[str] = "0",
token_data: Optional[Dict] = Depends(token_dependency) # token_data: Optional[Dict] = Depends(token_dependency)
): ):
result = await handle_markdown_request(url, f, q, c, config) result = await handle_markdown_request(url, f, q, c, config)
return PlainTextResponse(result) return PlainTextResponse(result)

View File

@@ -11,7 +11,7 @@ import asyncio
import os import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
from crawl4ai.extraction_strategy import ( from crawl4ai.extraction_strategy import (
LLMExtractionStrategy, LLMExtractionStrategy,
JsonCssExtractionStrategy, JsonCssExtractionStrategy,
@@ -61,19 +61,19 @@ async def main():
# 1. LLM Extraction with different input formats # 1. LLM Extraction with different input formats
markdown_strategy = LLMExtractionStrategy( markdown_strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract product information including name, price, and description", instruction="Extract product information including name, price, and description",
) )
html_strategy = LLMExtractionStrategy( html_strategy = LLMExtractionStrategy(
input_format="html", input_format="html",
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract product information from HTML including structured data", instruction="Extract product information from HTML including structured data",
) )
fit_markdown_strategy = LLMExtractionStrategy( fit_markdown_strategy = LLMExtractionStrategy(
input_format="fit_markdown", input_format="fit_markdown",
llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")), llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract product information from cleaned markdown", instruction="Extract product information from cleaned markdown",
) )

View File

@@ -1,4 +1,4 @@
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
import asyncio import asyncio
import os import os
@@ -23,7 +23,7 @@ async def main():
word_count_threshold=1, word_count_threshold=1,
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
llmConfig=LlmConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")), llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
schema=OpenAIModelFee.model_json_schema(), schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="From the crawled content, extract all mentioned model names along with their " instruction="From the crawled content, extract all mentioned model names along with their "

View File

@@ -1,7 +1,7 @@
import os import os
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter(): async def test_llm_filter():
@@ -23,7 +23,7 @@ async def test_llm_filter():
# Initialize LLM filter with focused instruction # Initialize LLM filter with focused instruction
filter = LLMContentFilter( filter = LLMContentFilter(
llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')), llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
instruction=""" instruction="""
Focus on extracting the core educational content about Python classes. Focus on extracting the core educational content about Python classes.
Include: Include:
@@ -43,7 +43,7 @@ async def test_llm_filter():
) )
filter = LLMContentFilter( filter = LLMContentFilter(
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2 chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
ignore_cache = True, ignore_cache = True,
instruction=""" instruction="""

View File

@@ -1,6 +1,6 @@
import os, sys import os, sys
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
sys.path.append( sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -211,7 +211,7 @@ async def extract_structured_data_using_llm(
word_count_threshold=1, word_count_threshold=1,
page_timeout=80000, page_timeout=80000,
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
llmConfig=LlmConfig(provider=provider,api_token=api_token), llm_config=LLMConfig(provider=provider,api_token=api_token),
schema=OpenAIModelFee.model_json_schema(), schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.

View File

@@ -1,6 +1,6 @@
import os, sys import os, sys
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
# append parent directory to system path # append parent directory to system path
sys.path.append( sys.path.append(
@@ -147,7 +147,7 @@ async def extract_structured_data_using_llm(
url="https://openai.com/api/pricing/", url="https://openai.com/api/pricing/",
word_count_threshold=1, word_count_threshold=1,
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
llmConfig=LlmConfig(provider=provider,api_token=api_token), llm_config=LLMConfig(provider=provider,api_token=api_token),
schema=OpenAIModelFee.model_json_schema(), schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
@@ -570,7 +570,7 @@ async def generate_knowledge_graph():
relationships: List[Relationship] relationships: List[Relationship]
extraction_strategy = LLMExtractionStrategy( extraction_strategy = LLMExtractionStrategy(
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token" llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
schema=KnowledgeGraph.model_json_schema(), schema=KnowledgeGraph.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="""Extract entities and relationships from the given text.""", instruction="""Extract entities and relationships from the given text.""",

View File

@@ -1,6 +1,6 @@
import os import os
import time import time
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
from crawl4ai.web_crawler import WebCrawler from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import * from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import * from crawl4ai.extraction_strategy import *
@@ -179,7 +179,7 @@ def add_llm_extraction_strategy(crawler):
result = crawler.run( result = crawler.run(
url="https://www.nbcnews.com/business", url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")) llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
), ),
) )
cprint( cprint(
@@ -198,7 +198,7 @@ def add_llm_extraction_strategy(crawler):
result = crawler.run( result = crawler.run(
url="https://www.nbcnews.com/business", url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")), llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
instruction="I am interested in only financial news", instruction="I am interested in only financial news",
), ),
) )
@@ -210,7 +210,7 @@ def add_llm_extraction_strategy(crawler):
result = crawler.run( result = crawler.run(
url="https://www.nbcnews.com/business", url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")), llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract only content related to technology", instruction="Extract only content related to technology",
), ),
) )

View File

@@ -17,7 +17,7 @@ from crawl4ai.configs import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator from crawl4ai import DefaultMarkdownGenerator
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
from pprint import pprint from pprint import pprint
@@ -284,9 +284,9 @@ async def llm_content_filter():
PART 5: LLM Content Filter PART 5: LLM Content Filter
This function demonstrates: This function demonstrates:
- Configuring LLM providers via LlmConfig - Configuring LLM providers via LLMConfig
- Using LLM to generate focused markdown - Using LLM to generate focused markdown
- LlmConfig for configuration - LLMConfig for configuration
Note: Requires a valid API key for the chosen LLM provider Note: Requires a valid API key for the chosen LLM provider
""" """
@@ -296,7 +296,7 @@ async def llm_content_filter():
# Create LLM configuration # Create LLM configuration
# Replace with your actual API key or set as environment variable # Replace with your actual API key or set as environment variable
llm_config = LlmConfig( llm_config = LLMConfig(
provider="gemini/gemini-1.5-pro", provider="gemini/gemini-1.5-pro",
api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable
) )
@@ -309,7 +309,7 @@ async def llm_content_filter():
# Create markdown generator with LLM filter # Create markdown generator with LLM filter
markdown_generator = DefaultMarkdownGenerator( markdown_generator = DefaultMarkdownGenerator(
content_filter=LLMContentFilter( content_filter=LLMContentFilter(
llmConfig=llm_config, llm_config=llm_config,
instruction="Extract key concepts and summaries" instruction="Extract key concepts and summaries"
) )
) )
@@ -381,7 +381,7 @@ async def llm_schema_generation():
PART 7: LLM Schema Generation PART 7: LLM Schema Generation
This function demonstrates: This function demonstrates:
- Configuring LLM providers via LlmConfig - Configuring LLM providers via LLMConfig
- Using LLM to generate extraction schemas - Using LLM to generate extraction schemas
- JsonCssExtractionStrategy - JsonCssExtractionStrategy
@@ -406,9 +406,9 @@ async def llm_schema_generation():
<div class="rating">4.7/5</div> <div class="rating">4.7/5</div>
</div> </div>
""" """
print("\n📊 Setting up LlmConfig...") print("\n📊 Setting up LLMConfig...")
# Create LLM configuration # Create LLM configuration
llm_config = LlmConfig( llm_config = LLMConfig(
provider="gemini/gemini-1.5-pro", provider="gemini/gemini-1.5-pro",
api_token="env:GEMINI_API_KEY" api_token="env:GEMINI_API_KEY"
) )
@@ -416,7 +416,7 @@ async def llm_schema_generation():
print(" This would use the LLM to analyze HTML and create an extraction schema") print(" This would use the LLM to analyze HTML and create an extraction schema")
schema = JsonCssExtractionStrategy.generate_schema( schema = JsonCssExtractionStrategy.generate_schema(
html=sample_html, html=sample_html,
llmConfig = llm_config, llm_config = llm_config,
query="Extract product name and price" query="Extract product name and price"
) )
print("\n✅ Generated Schema:") print("\n✅ Generated Schema:")

View File

@@ -245,8 +245,8 @@ run_config = CrawlerRunConfig(
) )
``` ```
# 3. **LlmConfig** - Setting up LLM providers # 3. **LLMConfig** - Setting up LLM providers
LlmConfig is useful for passing LLM provider config to strategies and functions that rely on LLMs for extraction, filtering, schema generation, etc. It can currently be used with the following: LLMConfig is useful for passing LLM provider config to strategies and functions that rely on LLMs for extraction, filtering, schema generation, etc. It can currently be used with the following:
1. LLMExtractionStrategy 1. LLMExtractionStrategy
2. LLMContentFilter 2. LLMContentFilter
@@ -262,7 +262,7 @@ LlmConfig is useful to pass LLM provider config to strategies and functions that
## 3.2 Example Usage ## 3.2 Example Usage
```python ```python
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
``` ```
## 4. Putting It All Together ## 4. Putting It All Together
@@ -270,7 +270,7 @@ llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent. - **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
- **Use** `CrawlerRunConfig` for each crawl's **context**: how to filter content, handle caching, wait for dynamic elements, or run JS. - **Use** `CrawlerRunConfig` for each crawl's **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`). - **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
- **Use** `LlmConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema` - **Use** `LLMConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema`
```python ```python
# Create a modified copy with the clone() method # Create a modified copy with the clone() method

View File

@@ -131,7 +131,7 @@ OverlappingWindowChunking(
```python ```python
from pydantic import BaseModel from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
# Define schema # Define schema
class Article(BaseModel): class Article(BaseModel):
@@ -141,7 +141,7 @@ class Article(BaseModel):
# Create strategy # Create strategy
strategy = LLMExtractionStrategy( strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="ollama/llama2"), llm_config = LLMConfig(provider="ollama/llama2"),
schema=Article.schema(), schema=Article.schema(),
instruction="Extract article details" instruction="Extract article details"
) )
@@ -198,7 +198,7 @@ result = await crawler.arun(
```python ```python
from crawl4ai.chunking_strategy import OverlappingWindowChunking from crawl4ai.chunking_strategy import OverlappingWindowChunking
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
# Create chunking strategy # Create chunking strategy
chunker = OverlappingWindowChunking( chunker = OverlappingWindowChunking(
@@ -208,7 +208,7 @@ chunker = OverlappingWindowChunking(
# Use with extraction strategy # Use with extraction strategy
strategy = LLMExtractionStrategy( strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="ollama/llama2"), llm_config = LLMConfig(provider="ollama/llama2"),
chunking_strategy=chunker chunking_strategy=chunker
) )

View File

@@ -16,7 +16,7 @@ My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5
* **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks. * **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks.
* **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication. * **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication.
* **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands. * **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands.
* **LLM Configuration (`LlmConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models. * **LLM Configuration (`LLMConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
**Minor Updates & Improvements:** **Minor Updates & Improvements:**
@@ -47,7 +47,7 @@ This release includes several breaking changes to improve the library's structur
* **Config**: FastFilterChain has been replaced with FilterChain * **Config**: FastFilterChain has been replaced with FilterChain
* **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] * **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
* **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations * **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations
* **LLM Parameters:** Use the new `LlmConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`. * **LLM Parameters:** Use the new `LLMConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
**In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide. **In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide.

View File

@@ -305,13 +305,13 @@ asyncio.run(main())
```python ```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
import asyncio import asyncio
llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY") llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
markdown_generator = DefaultMarkdownGenerator( markdown_generator = DefaultMarkdownGenerator(
content_filter=LLMContentFilter(llmConfig=llm_config, instruction="Extract key concepts and summaries") content_filter=LLMContentFilter(llm_config=llm_config, instruction="Extract key concepts and summaries")
) )
config = CrawlerRunConfig(markdown_generator=markdown_generator) config = CrawlerRunConfig(markdown_generator=markdown_generator)
@@ -335,13 +335,13 @@ asyncio.run(main())
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY") llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
schema = JsonCssExtractionStrategy.generate_schema( schema = JsonCssExtractionStrategy.generate_schema(
html="<div class='product'><h2>Product Name</h2><span class='price'>$99</span></div>", html="<div class='product'><h2>Product Name</h2><span class='price'>$99</span></div>",
llmConfig = llm_config, llm_config = llm_config,
query="Extract product name and price" query="Extract product name and price"
) )
print(schema) print(schema)
@@ -394,20 +394,20 @@ print(schema)
serialization, especially for sets of allowed/blocked domains. No code changes serialization, especially for sets of allowed/blocked domains. No code changes
required. required.
- **Added: New `LlmConfig` parameter.** This new parameter can be passed for - **Added: New `LLMConfig` parameter.** This new parameter can be passed for
extraction, filtering, and schema generation tasks. It simplifies passing extraction, filtering, and schema generation tasks. It simplifies passing
provider strings, API tokens, and base URLs across all sections where LLM provider strings, API tokens, and base URLs across all sections where LLM
configuration is necessary. It also enables reuse and allows for quick configuration is necessary. It also enables reuse and allows for quick
experimentation between different LLM configurations. experimentation between different LLM configurations.
```python ```python
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
# Example of using LlmConfig with LLMExtractionStrategy # Example of using LLMConfig with LLMExtractionStrategy
llm_config = LlmConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY") llm_config = LLMConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
strategy = LLMExtractionStrategy(llmConfig=llm_config, schema=...) strategy = LLMExtractionStrategy(llm_config=llm_config, schema=...)
# Example usage within a crawler # Example usage within a crawler
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
@@ -418,7 +418,7 @@ print(schema)
``` ```
**Breaking Change:** Removed old parameters like `provider`, `api_token`, **Breaking Change:** Removed old parameters like `provider`, `api_token`,
`base_url`, and `api_base` from `LLMExtractionStrategy` and `base_url`, and `api_base` from `LLMExtractionStrategy` and
`LLMContentFilter`. Users should migrate to using the `LlmConfig` object. `LLMContentFilter`. Users should migrate to using the `LLMConfig` object.
- **Changed: Improved browser context management and added shared data support. - **Changed: Improved browser context management and added shared data support.
(Breaking Change:** `BrowserContext` API updated). Browser contexts are now (Breaking Change:** `BrowserContext` API updated). Browser contexts are now

View File

@@ -4,7 +4,7 @@ Crawl4AI's flexibility stems from two key classes:
1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent). 1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.). 2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
3. **`LlmConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.) 3. **`LLMConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.)
In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md). In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md).
@@ -239,7 +239,7 @@ The `clone()` method:
## 3. LlmConfig Essentials ## 3. LLMConfig Essentials
### Key fields to note ### Key fields to note
@@ -256,16 +256,16 @@ The `clone()` method:
- If your provider has a custom endpoint - If your provider has a custom endpoint
```python ```python
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
``` ```
## 4. Putting It All Together ## 4. Putting It All Together
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LlmConfig` depending on each call's needs: In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call's needs:
```python ```python
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
async def main(): async def main():
@@ -289,14 +289,14 @@ async def main():
# 3) Example LLM content filtering # 3) Example LLM content filtering
gemini_config = LlmConfig( gemini_config = LLMConfig(
provider="gemini/gemini-1.5-pro", provider="gemini/gemini-1.5-pro",
api_token = "env:GEMINI_API_TOKEN" api_token = "env:GEMINI_API_TOKEN"
) )
# Initialize LLM filter with specific instruction # Initialize LLM filter with specific instruction
filter = LLMContentFilter( filter = LLMContentFilter(
llmConfig=gemini_config, # or your preferred provider llm_config=gemini_config, # or your preferred provider
instruction=""" instruction="""
Focus on extracting the core educational content. Focus on extracting the core educational content.
Include: Include:
@@ -343,7 +343,7 @@ if __name__ == "__main__":
For a **detailed list** of available parameters (including advanced ones), see: For a **detailed list** of available parameters (including advanced ones), see:
- [BrowserConfig, CrawlerRunConfig & LlmConfig Reference](../api/parameters.md) - [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md)
You can explore topics like: You can explore topics like:
@@ -356,7 +356,7 @@ You can explore topics like:
## 6. Conclusion ## 6. Conclusion
**BrowserConfig**, **CrawlerRunConfig** and **LlmConfig** give you straightforward ways to define: **BrowserConfig**, **CrawlerRunConfig** and **LLMConfig** give you straightforward ways to define:
- **Which** browser to launch, how it should run, and any proxy or user agent needs. - **Which** browser to launch, how it should run, and any proxy or user agent needs.
- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc. - **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc.

View File

@@ -211,7 +211,7 @@ if __name__ == "__main__":
import asyncio import asyncio
import json import json
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
class ArticleData(BaseModel): class ArticleData(BaseModel):
@@ -220,7 +220,7 @@ class ArticleData(BaseModel):
async def main(): async def main():
llm_strategy = LLMExtractionStrategy( llm_strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY"), llm_config = LLMConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY"),
schema=ArticleData.schema(), schema=ArticleData.schema(),
extraction_type="schema", extraction_type="schema",
instruction="Extract 'headline' and a short 'summary' from the content." instruction="Extract 'headline' and a short 'summary' from the content."

View File

@@ -175,13 +175,13 @@ prune_filter = PruningContentFilter(
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
```python ```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LlmConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai.content_filter_strategy import LLMContentFilter
async def main(): async def main():
# Initialize LLM filter with specific instruction # Initialize LLM filter with specific instruction
filter = LLMContentFilter( filter = LLMContentFilter(
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable
instruction=""" instruction="""
Focus on extracting the core educational content. Focus on extracting the core educational content.
Include: Include:

View File

@@ -128,7 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
# Generate a schema (one-time cost) # Generate a schema (one-time cost)
html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>" html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
@@ -136,13 +136,13 @@ html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</
# Using OpenAI (requires API token) # Using OpenAI (requires API token)
schema = JsonCssExtractionStrategy.generate_schema( schema = JsonCssExtractionStrategy.generate_schema(
html, html,
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token") # Required for OpenAI llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token") # Required for OpenAI
) )
# Or using Ollama (open source, no token needed) # Or using Ollama (open source, no token needed)
schema = JsonCssExtractionStrategy.generate_schema( schema = JsonCssExtractionStrategy.generate_schema(
html, html,
llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
) )
# Use the schema for fast, repeated extractions # Use the schema for fast, repeated extractions
@@ -211,7 +211,7 @@ import os
import json import json
import asyncio import asyncio
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
class OpenAIModelFee(BaseModel): class OpenAIModelFee(BaseModel):
@@ -241,7 +241,7 @@ async def extract_structured_data_using_llm(
word_count_threshold=1, word_count_threshold=1,
page_timeout=80000, page_timeout=80000,
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
llmConfig = LlmConfig(provider=provider,api_token=api_token), llm_config = LLMConfig(provider=provider,api_token=api_token),
schema=OpenAIModelFee.model_json_schema(), schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.

View File

@@ -71,7 +71,7 @@ Below is an overview of important LLM extraction parameters. All are typically s
```python ```python
extraction_strategy = LLMExtractionStrategy( extraction_strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"), llm_config = LLMConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
schema=MyModel.model_json_schema(), schema=MyModel.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="Extract a list of items from the text with 'name' and 'price' fields.", instruction="Extract a list of items from the text with 'name' and 'price' fields.",
@@ -96,7 +96,7 @@ import asyncio
import json import json
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing import List from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
class Product(BaseModel): class Product(BaseModel):
@@ -106,7 +106,7 @@ class Product(BaseModel):
async def main(): async def main():
# 1. Define the LLM extraction strategy # 1. Define the LLM extraction strategy
llm_strategy = LLMExtractionStrategy( llm_strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')), llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
schema=Product.schema_json(), # Or use model_json_schema() schema=Product.schema_json(), # Or use model_json_schema()
extraction_type="schema", extraction_type="schema",
instruction="Extract all product objects with 'name' and 'price' from the content.", instruction="Extract all product objects with 'name' and 'price' from the content.",

View File

@@ -415,7 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
# Sample HTML with product information # Sample HTML with product information
html = """ html = """
@@ -435,14 +435,14 @@ html = """
css_schema = JsonCssExtractionStrategy.generate_schema( css_schema = JsonCssExtractionStrategy.generate_schema(
html, html,
schema_type="css", schema_type="css",
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token") llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token")
) )
# Option 2: Using Ollama (open source, no token needed) # Option 2: Using Ollama (open source, no token needed)
xpath_schema = JsonXPathExtractionStrategy.generate_schema( xpath_schema = JsonXPathExtractionStrategy.generate_schema(
html, html,
schema_type="xpath", schema_type="xpath",
llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
) )
# Use the generated schema for fast, repeated extractions # Use the generated schema for fast, repeated extractions

View File

@@ -1,7 +1,7 @@
import os import os
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter(): async def test_llm_filter():
@@ -23,7 +23,7 @@ async def test_llm_filter():
# Initialize LLM filter with focused instruction # Initialize LLM filter with focused instruction
filter = LLMContentFilter( filter = LLMContentFilter(
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
instruction=""" instruction="""
Focus on extracting the core educational content about Python classes. Focus on extracting the core educational content about Python classes.
Include: Include:
@@ -43,7 +43,7 @@ async def test_llm_filter():
) )
filter = LLMContentFilter( filter = LLMContentFilter(
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), llm_config = LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
chunk_token_threshold=2 ** 12 * 2, # 4096 * 2 chunk_token_threshold=2 ** 12 * 2, # 4096 * 2
instruction=""" instruction="""
Extract the main educational content while preserving its original wording and substance completely. Your task is to: Extract the main educational content while preserving its original wording and substance completely. Your task is to:

View File

@@ -7,7 +7,7 @@ import json
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir) sys.path.append(parent_dir)
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
from crawl4ai.async_webcrawler import AsyncWebCrawler from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.chunking_strategy import RegexChunking from crawl4ai.chunking_strategy import RegexChunking
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai.extraction_strategy import LLMExtractionStrategy
@@ -49,7 +49,7 @@ async def test_llm_extraction_strategy():
async with AsyncWebCrawler(verbose=True) as crawler: async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://www.nbcnews.com/business" url = "https://www.nbcnews.com/business"
extraction_strategy = LLMExtractionStrategy( extraction_strategy = LLMExtractionStrategy(
llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")), llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract only content related to technology", instruction="Extract only content related to technology",
) )
result = await crawler.arun( result = await crawler.arun(

View File

@@ -7,7 +7,7 @@ from crawl4ai import (
BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator, BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
) )
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
from crawl4ai.docker_client import Crawl4aiDockerClient from crawl4ai.docker_client import Crawl4aiDockerClient
class Crawl4AiTester: class Crawl4AiTester:
@@ -143,7 +143,7 @@ async def test_with_client():
cache_mode=CacheMode.BYPASS, cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator( markdown_generator=DefaultMarkdownGenerator(
content_filter=LLMContentFilter( content_filter=LLMContentFilter(
llmConfig=LlmConfig(provider="openai/gpt-40"), llm_config=LLMConfig(provider="openai/gpt-40"),
instruction="Extract key technical concepts" instruction="Extract key technical concepts"
) )
), ),

View File

@@ -2,7 +2,7 @@ import inspect
from typing import Any, Dict from typing import Any, Dict
from enum import Enum from enum import Enum
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
def to_serializable_dict(obj: Any) -> Dict: def to_serializable_dict(obj: Any) -> Dict:
""" """
@@ -224,7 +224,7 @@ if __name__ == "__main__":
config3 = CrawlerRunConfig( config3 = CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator( markdown_generator=DefaultMarkdownGenerator(
content_filter=LLMContentFilter( content_filter=LLMContentFilter(
llmConfig = LlmConfig(provider="openai/gpt-4"), llm_config = LLMConfig(provider="openai/gpt-4"),
instruction="Extract key technical concepts", instruction="Extract key technical concepts",
chunk_token_threshold=2000, chunk_token_threshold=2000,
overlap_rate=0.1 overlap_rate=0.1

View File

@@ -1,5 +1,5 @@
import unittest, os import unittest, os
from crawl4ai.async_configs import LlmConfig from crawl4ai.types import LLMConfig
from crawl4ai.web_crawler import WebCrawler from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import ( from crawl4ai.chunking_strategy import (
RegexChunking, RegexChunking,
@@ -43,7 +43,7 @@ class TestWebCrawler(unittest.TestCase):
word_count_threshold=5, word_count_threshold=5,
chunking_strategy=FixedLengthWordChunking(chunk_size=100), chunking_strategy=FixedLengthWordChunking(chunk_size=100),
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
llmConfig=LlmConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY")) llm_config=LLMConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY"))
), ),
bypass_cache=True, bypass_cache=True,
) )