"""Example: extract structured OpenAI pricing data with an LLM strategy.

Crawls the OpenAI API pricing page and uses :class:`LLMExtractionStrategy`
with a Pydantic schema so the LLM returns every model's input/output token
fees as structured JSON.
"""

import asyncio
import os
from typing import Dict, Optional

from pydantic import BaseModel, Field

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CacheMode,
    CrawlerRunConfig,
    LLMConfig,
)
from crawl4ai.extraction_strategy import LLMExtractionStrategy


class OpenAIModelFee(BaseModel):
    """Schema for one pricing row: a model name and its per-token fees."""

    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")


async def extract_structured_data_using_llm(
    provider: str,
    api_token: Optional[str] = None,
    extra_headers: Optional[Dict[str, str]] = None,
):
    """Crawl the OpenAI pricing page and extract model fees via an LLM.

    Args:
        provider: LLM provider identifier, e.g. ``"openai/gpt-4o"`` or
            ``"ollama"``.
        api_token: API key for the provider. May be omitted only for the
            ``"ollama"`` provider; otherwise the example is skipped.
        extra_headers: Optional extra HTTP headers forwarded to the LLM
            request via ``extra_args``.

    Returns:
        None. Prints the extracted JSON content to stdout.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    # Hosted providers require a key; local ollama does not.
    if api_token is None and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    browser_config = BrowserConfig(headless=True)

    # Deterministic, bounded generation settings for the extraction call.
    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
    if extra_headers:
        extra_args["extra_headers"] = extra_headers

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,  # always refetch: pricing changes over time
        word_count_threshold=1,
        page_timeout=80000,
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider=provider, api_token=api_token),
            schema=OpenAIModelFee.model_json_schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
            Do not miss any models in the entire content.""",
            extra_args=extra_args,
        ),
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            config=crawler_config,
        )
        print(result.extracted_content)


if __name__ == "__main__":
    asyncio.run(
        extract_structured_data_using_llm(
            provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
        )
    )