Feat/llm config (#724)
* feature: Add LlmConfig to easily configure and pass LLM configs to different strategies
* pulled in next branch and resolved conflicts
* feat: Add gemini and deepseek providers. Set ignore_cache to true by default in the LLM content filter to avoid confusion
* Refactor: Update LlmConfig in LLMExtractionStrategy class and deprecate old params
* updated tests, docs and readme
This commit is contained in:
@@ -71,8 +71,7 @@ Below is an overview of important LLM extraction parameters. All are typically s
|
||||
|
||||
```python
|
||||
extraction_strategy = LLMExtractionStrategy(
|
||||
provider="openai/gpt-4",
|
||||
api_token="YOUR_OPENAI_KEY",
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
|
||||
schema=MyModel.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="Extract a list of items from the text with 'name' and 'price' fields.",
|
||||
@@ -97,7 +96,7 @@ import asyncio
|
||||
import json
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
class Product(BaseModel):
|
||||
@@ -107,9 +106,8 @@ class Product(BaseModel):
|
||||
async def main():
|
||||
# 1. Define the LLM extraction strategy
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o-mini", # e.g. "ollama/llama2"
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
schema=Product.schema_json(), # Or use model_json_schema()
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
schema=Product.schema_json(), # Or use model_json_schema()
|
||||
extraction_type="schema",
|
||||
instruction="Extract all product objects with 'name' and 'price' from the content.",
|
||||
chunk_token_threshold=1000,
|
||||
|
||||
@@ -415,6 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
|
||||
# Sample HTML with product information
|
||||
html = """
|
||||
@@ -433,17 +434,15 @@ html = """
|
||||
# Option 1: Using OpenAI (requires API token)
|
||||
css_schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html,
|
||||
schema_type="css", # This is the default
|
||||
provider="openai/gpt-4o", # Default provider
|
||||
api_token="your-openai-token" # Required for OpenAI
|
||||
schema_type="css",
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token")
|
||||
)
|
||||
|
||||
# Option 2: Using Ollama (open source, no token needed)
|
||||
xpath_schema = JsonXPathExtractionStrategy.generate_schema(
|
||||
html,
|
||||
schema_type="xpath",
|
||||
provider="ollama/llama3.3", # Open source alternative
|
||||
api_token=None # Not needed for Ollama
|
||||
llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
||||
)
|
||||
|
||||
# Use the generated schema for fast, repeated extractions
|
||||
|
||||
Reference in New Issue
Block a user