Feat/llm config (#724)
* feature: Add LlmConfig to easily configure and pass LLM configs to different strategies * pulled in next branch and resolved conflicts * feat: Add gemini and deepseek providers. Make ignore_cache in llm content filter to true by default to avoid confusions * Refactor: Update LlmConfig in LLMExtractionStrategy class and deprecate old params * updated tests, docs and readme
This commit is contained in:
@@ -11,6 +11,7 @@ import asyncio
|
||||
import os
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.extraction_strategy import (
|
||||
LLMExtractionStrategy,
|
||||
JsonCssExtractionStrategy,
|
||||
@@ -60,22 +61,19 @@ async def main():
|
||||
|
||||
# 1. LLM Extraction with different input formats
|
||||
markdown_strategy = LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o-mini",
|
||||
api_token=os.getenv("OPENAI_API_KEY"),
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract product information including name, price, and description",
|
||||
)
|
||||
|
||||
html_strategy = LLMExtractionStrategy(
|
||||
input_format="html",
|
||||
provider="openai/gpt-4o-mini",
|
||||
api_token=os.getenv("OPENAI_API_KEY"),
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract product information from HTML including structured data",
|
||||
)
|
||||
|
||||
fit_markdown_strategy = LLMExtractionStrategy(
|
||||
input_format="fit_markdown",
|
||||
provider="openai/gpt-4o-mini",
|
||||
api_token=os.getenv("OPENAI_API_KEY"),
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract product information from cleaned markdown",
|
||||
)
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.extraction_strategy import *
|
||||
from crawl4ai.crawler_strategy import *
|
||||
import asyncio
|
||||
@@ -25,8 +26,7 @@ async def main():
|
||||
word_count_threshold=1,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
|
||||
provider="groq/llama-3.1-70b-versatile",
|
||||
api_token=os.getenv("GROQ_API_KEY"),
|
||||
llmConfig=LlmConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="From the crawled content, extract all mentioned model names along with their "
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def test_llm_filter():
|
||||
@@ -22,8 +23,7 @@ async def test_llm_filter():
|
||||
|
||||
# Initialize LLM filter with focused instruction
|
||||
filter = LLMContentFilter(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
instruction="""
|
||||
Focus on extracting the core educational content about Python classes.
|
||||
Include:
|
||||
@@ -43,8 +43,7 @@ async def test_llm_filter():
|
||||
)
|
||||
|
||||
filter = LLMContentFilter(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
|
||||
ignore_cache = True,
|
||||
instruction="""
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import os, sys
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
|
||||
sys.path.append(
|
||||
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
)
|
||||
@@ -209,8 +211,7 @@ async def extract_structured_data_using_llm(
|
||||
word_count_threshold=1,
|
||||
page_timeout=80000,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider=provider,
|
||||
api_token=api_token,
|
||||
llmConfig=LlmConfig(provider=provider,api_token=api_token),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import os, sys
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
|
||||
# append parent directory to system path
|
||||
sys.path.append(
|
||||
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
@@ -145,8 +147,7 @@ async def extract_structured_data_using_llm(
|
||||
url="https://openai.com/api/pricing/",
|
||||
word_count_threshold=1,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider=provider,
|
||||
api_token=api_token,
|
||||
llmConfig=LlmConfig(provider=provider,api_token=api_token),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
@@ -569,8 +570,7 @@ async def generate_knowledge_graph():
|
||||
relationships: List[Relationship]
|
||||
|
||||
extraction_strategy = LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o-mini", # Or any other provider, including Ollama and open source models
|
||||
api_token=os.getenv("OPENAI_API_KEY"), # In case of Ollama just pass "no-token"
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
|
||||
schema=KnowledgeGraph.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""Extract entities and relationships from the given text.""",
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
import time
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
@@ -178,7 +179,7 @@ def add_llm_extraction_strategy(crawler):
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
),
|
||||
)
|
||||
cprint(
|
||||
@@ -197,8 +198,7 @@ def add_llm_extraction_strategy(crawler):
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv("OPENAI_API_KEY"),
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="I am interested in only financial news",
|
||||
),
|
||||
)
|
||||
@@ -210,8 +210,7 @@ def add_llm_extraction_strategy(crawler):
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv("OPENAI_API_KEY"),
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract only content related to technology",
|
||||
),
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user