Feat/llm config (#724)

* feature: Add LlmConfig to easily configure and pass LLM configs to different strategies * pulled in next branch and resolved conflicts * feat: Add gemini and deepseek providers. Set ignore_cache in the LLM content filter to true by default to avoid confusion * Refactor: Update LlmConfig in LLMExtractionStrategy class and deprecate old params * updated tests, docs and readme
2025-02-21 13:11:37 +05:30
parent 3cb28875c3
commit 2af958e12c
25 changed files with 420 additions and 240 deletions
--- a/docs/md_v2/extraction/llm-strategies.md
+++ b/docs/md_v2/extraction/llm-strategies.md
@@ -71,8 +71,7 @@ Below is an overview of important LLM extraction parameters. All are typically s

 ```python
 extraction_strategy = LLMExtractionStrategy(
-    provider="openai/gpt-4",
-    api_token="YOUR_OPENAI_KEY",
+    llmConfig = LlmConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
    schema=MyModel.model_json_schema(),
    extraction_type="schema",
    instruction="Extract a list of items from the text with 'name' and 'price' fields.",
@@ -97,7 +96,7 @@ import asyncio
 import json
 from pydantic import BaseModel, Field
 from typing import List
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
 from crawl4ai.extraction_strategy import LLMExtractionStrategy

 class Product(BaseModel):
@@ -107,9 +106,8 @@ class Product(BaseModel):
 async def main():
    # 1. Define the LLM extraction strategy
    llm_strategy = LLMExtractionStrategy(
-        provider="openai/gpt-4o-mini",            # e.g. "ollama/llama2"
-        api_token=os.getenv('OPENAI_API_KEY'),
-        schema=Product.schema_json(),            # Or use model_json_schema()
+        llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
+        schema=Product.schema_json(), # Or use model_json_schema()
        extraction_type="schema",
        instruction="Extract all product objects with 'name' and 'price' from the content.",
        chunk_token_threshold=1000,