Feat/llm config (#724)
* feature: Add LlmConfig to easily configure and pass LLM configs to different strategies * pulled in next branch and resolved conflicts * feat: Add gemini and deepseek providers. Make ignore_cache in llm content filter to true by default to avoid confusions * Refactor: Update LlmConfig in LLMExtractionStrategy class and deprecate old params * updated tests, docs and readme
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def test_llm_filter():
|
||||
@@ -22,8 +23,7 @@ async def test_llm_filter():
|
||||
|
||||
# Initialize LLM filter with focused instruction
|
||||
filter = LLMContentFilter(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
instruction="""
|
||||
Focus on extracting the core educational content about Python classes.
|
||||
Include:
|
||||
@@ -43,8 +43,7 @@ async def test_llm_filter():
|
||||
)
|
||||
|
||||
filter = LLMContentFilter(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
|
||||
instruction="""
|
||||
Extract the main educational content while preserving its original wording and substance completely. Your task is to:
|
||||
|
||||
@@ -7,6 +7,7 @@ import json
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.append(parent_dir)
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
from crawl4ai.chunking_strategy import RegexChunking
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
@@ -48,8 +49,7 @@ async def test_llm_extraction_strategy():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
url = "https://www.nbcnews.com/business"
|
||||
extraction_strategy = LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o-mini",
|
||||
api_token=os.getenv("OPENAI_API_KEY"),
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract only content related to technology",
|
||||
)
|
||||
result = await crawler.arun(
|
||||
|
||||
@@ -7,6 +7,7 @@ from crawl4ai import (
|
||||
BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
|
||||
PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
|
||||
)
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
|
||||
class Crawl4AiTester:
|
||||
@@ -142,7 +143,7 @@ async def test_with_client():
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(
|
||||
provider="openai/gpt-40",
|
||||
llmConfig=LlmConfig(provider="openai/gpt-40"),
|
||||
instruction="Extract key technical concepts"
|
||||
)
|
||||
),
|
||||
|
||||
@@ -2,6 +2,8 @@ import inspect
|
||||
from typing import Any, Dict
|
||||
from enum import Enum
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
|
||||
def to_serializable_dict(obj: Any) -> Dict:
|
||||
"""
|
||||
Recursively convert an object to a serializable dictionary using {type, params} structure
|
||||
@@ -222,7 +224,7 @@ if __name__ == "__main__":
|
||||
config3 = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(
|
||||
provider="openai/gpt-4",
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4"),
|
||||
instruction="Extract key technical concepts",
|
||||
chunk_token_threshold=2000,
|
||||
overlap_rate=0.1
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import unittest, os
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import (
|
||||
RegexChunking,
|
||||
@@ -42,7 +43,7 @@ class TestWebCrawler(unittest.TestCase):
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=FixedLengthWordChunking(chunk_size=100),
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY")
|
||||
llmConfig=LlmConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
),
|
||||
bypass_cache=True,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user