Feat/llm config (#724)

* feature: Add LlmConfig to easily configure and pass LLM configs to different strategies * pulled in the next branch and resolved conflicts * feat: Add gemini and deepseek providers. Set ignore_cache in the LLM content filter to true by default to avoid confusion * Refactor: Update LlmConfig in the LLMExtractionStrategy class and deprecate the old params * updated tests, docs and readme
2025-02-21 13:11:37 +05:30
parent 3cb28875c3
commit 2af958e12c
25 changed files with 420 additions and 240 deletions
--- a/docs/examples/extraction_strategies_examples.py
+++ b/docs/examples/extraction_strategies_examples.py
@@ -11,6 +11,7 @@ import asyncio
 import os

 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.async_configs import LlmConfig
 from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    JsonCssExtractionStrategy,
@@ -60,22 +61,19 @@ async def main():

    # 1. LLM Extraction with different input formats
    markdown_strategy = LLMExtractionStrategy(
-        provider="openai/gpt-4o-mini",
-        api_token=os.getenv("OPENAI_API_KEY"),
+        llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information including name, price, and description",
    )

    html_strategy = LLMExtractionStrategy(
        input_format="html",
-        provider="openai/gpt-4o-mini",
-        api_token=os.getenv("OPENAI_API_KEY"),
+        llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information from HTML including structured data",
    )

    fit_markdown_strategy = LLMExtractionStrategy(
        input_format="fit_markdown",
-        provider="openai/gpt-4o-mini",
-        api_token=os.getenv("OPENAI_API_KEY"),
+        llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
        instruction="Extract product information from cleaned markdown",
    )

--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -1,3 +1,4 @@
+from crawl4ai.async_configs import LlmConfig
 from crawl4ai.extraction_strategy import *
 from crawl4ai.crawler_strategy import *
 import asyncio
@@ -25,8 +26,7 @@ async def main():
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
-                provider="groq/llama-3.1-70b-versatile",
-                api_token=os.getenv("GROQ_API_KEY"),
+                llmConfig=LlmConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="From the crawled content, extract all mentioned model names along with their "
--- a/docs/examples/llm_markdown_generator.py
+++ b/docs/examples/llm_markdown_generator.py
@@ -1,6 +1,7 @@
 import os
 import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.async_configs import LlmConfig
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def test_llm_filter():
@@ -22,8 +23,7 @@ async def test_llm_filter():

        # Initialize LLM filter with focused instruction
        filter = LLMContentFilter(
-            provider="openai/gpt-4o",
-            api_token=os.getenv('OPENAI_API_KEY'),
+            llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
            instruction="""
            Focus on extracting the core educational content about Python classes.
            Include:
@@ -43,8 +43,7 @@ async def test_llm_filter():
        )
        
        filter = LLMContentFilter(
-            provider="openai/gpt-4o",
-            api_token=os.getenv('OPENAI_API_KEY'),
+            llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
            chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
            ignore_cache = True,
            instruction="""
--- a/docs/examples/quickstart_async.config.py
+++ b/docs/examples/quickstart_async.config.py
@@ -1,5 +1,7 @@
 import os, sys

+from crawl4ai.async_configs import LlmConfig
+
 sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 )
@@ -209,8 +211,7 @@ async def extract_structured_data_using_llm(
        word_count_threshold=1,
        page_timeout=80000,
        extraction_strategy=LLMExtractionStrategy(
-            provider=provider,
-            api_token=api_token,
+            llmConfig=LlmConfig(provider=provider,api_token=api_token),
            schema=OpenAIModelFee.model_json_schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -1,5 +1,7 @@
 import os, sys

+from crawl4ai.async_configs import LlmConfig
+
 # append parent directory to system path
 sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -145,8 +147,7 @@ async def extract_structured_data_using_llm(
            url="https://openai.com/api/pricing/",
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
-                provider=provider,
-                api_token=api_token,
+                llmConfig=LlmConfig(provider=provider,api_token=api_token),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
@@ -569,8 +570,7 @@ async def generate_knowledge_graph():
        relationships: List[Relationship]

    extraction_strategy = LLMExtractionStrategy(
-        provider="openai/gpt-4o-mini",  # Or any other provider, including Ollama and open source models
-        api_token=os.getenv("OPENAI_API_KEY"),  # In case of Ollama just pass "no-token"
+        llmConfig=LlmConfig(provider="openai/gpt-4o-mini",  api_token=os.getenv("OPENAI_API_KEY")),  # In case of Ollama just pass "no-token"
        schema=KnowledgeGraph.model_json_schema(),
        extraction_type="schema",
        instruction="""Extract entities and relationships from the given text.""",
--- a/docs/examples/quickstart_sync.py
+++ b/docs/examples/quickstart_sync.py
@@ -1,5 +1,6 @@
 import os
 import time
+from crawl4ai.async_configs import LlmConfig
 from crawl4ai.web_crawler import WebCrawler
 from crawl4ai.chunking_strategy import *
 from crawl4ai.extraction_strategy import *
@@ -178,7 +179,7 @@ def add_llm_extraction_strategy(crawler):
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
-            provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
+            llmConfig =  LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
        ),
    )
    cprint(
@@ -197,8 +198,7 @@ def add_llm_extraction_strategy(crawler):
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
-            provider="openai/gpt-4o",
-            api_token=os.getenv("OPENAI_API_KEY"),
+            llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
            instruction="I am interested in only financial news",
        ),
    )
@@ -210,8 +210,7 @@ def add_llm_extraction_strategy(crawler):
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
-            provider="openai/gpt-4o",
-            api_token=os.getenv("OPENAI_API_KEY"),
+            llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
            instruction="Extract only content related to technology",
        ),
    )