Feat/llm config (#724)

* feature: Add LlmConfig to easily configure and pass LLM configs to different strategies
* pulled in next branch and resolved conflicts
* feat: Add gemini and deepseek providers. Make ignore_cache in the LLM content filter default to true to avoid confusion
* Refactor: Update LlmConfig in LLMExtractionStrategy class and deprecate old params
* updated tests, docs and readme
2025-02-21 13:11:37 +05:30
parent 3cb28875c3
commit 2af958e12c
25 changed files with 420 additions and 240 deletions
--- a/tests/20241401/test_llm_filter.py
+++ b/tests/20241401/test_llm_filter.py
@@ -1,6 +1,7 @@
 import os
 import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.async_configs import LlmConfig
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def test_llm_filter():
@@ -22,8 +23,7 @@ async def test_llm_filter():

        # Initialize LLM filter with focused instruction
        filter = LLMContentFilter(
-            provider="openai/gpt-4o",
-            api_token=os.getenv('OPENAI_API_KEY'),
+            llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
            instruction="""
            Focus on extracting the core educational content about Python classes.
            Include:
@@ -43,8 +43,7 @@ async def test_llm_filter():
        )
        
        filter = LLMContentFilter(
-            provider="openai/gpt-4o",
-            api_token=os.getenv('OPENAI_API_KEY'),
+            llmConfig = LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
            chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
            instruction="""
            Extract the main educational content while preserving its original wording and substance completely. Your task is to:
--- a/tests/async/test_chunking_and_extraction_strategies.py
+++ b/tests/async/test_chunking_and_extraction_strategies.py
@@ -7,6 +7,7 @@ import json
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)

+from crawl4ai.async_configs import LlmConfig
 from crawl4ai.async_webcrawler import AsyncWebCrawler
 from crawl4ai.chunking_strategy import RegexChunking
 from crawl4ai.extraction_strategy import LLMExtractionStrategy
@@ -48,8 +49,7 @@ async def test_llm_extraction_strategy():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
        extraction_strategy = LLMExtractionStrategy(
-            provider="openai/gpt-4o-mini",
-            api_token=os.getenv("OPENAI_API_KEY"),
+            llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
            instruction="Extract only content related to technology",
        )
        result = await crawler.arun(
--- a/tests/docker/test_docker.py
+++ b/tests/docker/test_docker.py
@@ -7,6 +7,7 @@ from crawl4ai import (
    BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
    PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
 )
+from crawl4ai.async_configs import LlmConfig
 from crawl4ai.docker_client import Crawl4aiDockerClient

 class Crawl4AiTester:
@@ -142,7 +143,7 @@ async def test_with_client():
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=LLMContentFilter(
-                    provider="openai/gpt-40",
+                    llmConfig=LlmConfig(provider="openai/gpt-40"),
                    instruction="Extract key technical concepts"
                )
            ),
--- a/tests/docker/test_serialization.py
+++ b/tests/docker/test_serialization.py
@@ -2,6 +2,8 @@ import inspect
 from typing import Any, Dict
 from enum import Enum

+from crawl4ai.async_configs import LlmConfig
+
 def to_serializable_dict(obj: Any) -> Dict:
    """
    Recursively convert an object to a serializable dictionary using {type, params} structure
@@ -222,7 +224,7 @@ if __name__ == "__main__":
    config3 = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=LLMContentFilter(
-                provider="openai/gpt-4",
+                llmConfig = LlmConfig(provider="openai/gpt-4"),
                instruction="Extract key technical concepts",
                chunk_token_threshold=2000,
                overlap_rate=0.1
--- a/tests/test_web_crawler.py
+++ b/tests/test_web_crawler.py
@@ -1,4 +1,5 @@
 import unittest, os
+from crawl4ai.async_configs import LlmConfig
 from crawl4ai.web_crawler import WebCrawler
 from crawl4ai.chunking_strategy import (
    RegexChunking,
@@ -42,7 +43,7 @@ class TestWebCrawler(unittest.TestCase):
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=LLMExtractionStrategy(
-                provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY")
+                llmConfig=LlmConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY"))
            ),
            bypass_cache=True,
        )