Feat/llm config (#724)

* feature: Add LlmConfig to easily configure and pass LLM configs to different strategies

* pulled in next branch and resolved conflicts

* feat: Add gemini and deepseek providers. Make ignore_cache in llm content filter to true by default to avoid confusions

* Refactor: Update LlmConfig in LLMExtractionStrategy class and deprecate old params

* updated tests, docs and readme
This commit is contained in:
Aravind
2025-02-21 13:11:37 +05:30
committed by GitHub
parent 3cb28875c3
commit 2af958e12c
25 changed files with 420 additions and 240 deletions

View File

@@ -1,6 +1,7 @@
import os
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_configs import LlmConfig
from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter():
@@ -22,8 +23,7 @@ async def test_llm_filter():
# Initialize LLM filter with focused instruction
filter = LLMContentFilter(
provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'),
llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
instruction="""
Focus on extracting the core educational content about Python classes.
Include:
@@ -43,8 +43,7 @@ async def test_llm_filter():
)
filter = LLMContentFilter(
provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'),
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
ignore_cache = True,
instruction="""