refactor: Update LLM extraction example with the updated structure

2025-06-12 12:23:03 +02:00
parent 5d9213a0e9
commit dc85481180
1 changed files with 42 additions and 30 deletions
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -1,43 +1,55 @@
 from crawl4ai import LLMConfig
 from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
 import asyncio
 import os
 import json
 from pydantic import BaseModel, Field
-
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, BrowserConfig, CacheMode
-url = "https://openai.com/api/pricing/"
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
 from typing import Dict
 import os
 class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
-    output_fee: str = Field(
+    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
-        ..., description="Fee for output token for the OpenAI model."
+
 async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
    print(f"\n--- Extracting Structured Data with {provider} ---")
    if api_token is None and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return
    browser_config = BrowserConfig(headless=True)
    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
    if extra_headers:
        extra_args["extra_headers"] = extra_headers
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=1,
        page_timeout=80000,
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider=provider, api_token=api_token),
            schema=OpenAIModelFee.model_json_schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
            Do not miss any models in the entire content.""",
            extra_args=extra_args,
        ),
    )
-async def main():
+    async with AsyncWebCrawler(config=browser_config) as crawler:
    # Use AsyncWebCrawler
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
-            url=url,
+            url="https://openai.com/api/pricing/", 
-            word_count_threshold=1,
+            config=crawler_config
            extraction_strategy=LLMExtractionStrategy(
                # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
                llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="From the crawled content, extract all mentioned model names along with their "
                "fees for input and output tokens. Make sure not to miss anything in the entire content. "
                "One extracted model JSON format should look like this: "
                '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }',
            ),
        )
-        print("Success:", result.success)
+        print(result.extracted_content)
        model_fees = json.loads(result.extracted_content)
        print(len(model_fees))
        with open(".data/data.json", "w", encoding="utf-8") as f:
            f.write(result.extracted_content)
-asyncio.run(main())
+if __name__ == "__main__":
    asyncio.run(
        extract_structured_data_using_llm(
            provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
        )
    )