Fix #340 example llm_extraction (#358)

@Haopeng138 Thank you so much. They are still part of the library; I forgot to update them after I moved to the asynchronous versions years ago. I really appreciate it. I have to admit that documentation is a weak spot for me, which is why I spent a lot of time on it last week. Now that you mention some of the things in the examples folder, I realize I forgot about that folder as well. I'll try to keep it more up to date. If you find anything else, please let me know; your help and support are welcome. Thank you. I will add your name to the contributors list as well.
2024-12-24 12:56:07 +01:00
parent ed7bc1909c
commit bacbeb3ed4
1 changed files with 27 additions and 28 deletions
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -1,41 +1,40 @@
-import os
-import time
-from crawl4ai.web_crawler import WebCrawler
-from crawl4ai.chunking_strategy import *
 from crawl4ai.extraction_strategy import *
 from crawl4ai.crawler_strategy import *
+import asyncio
+from pydantic import BaseModel, Field

 url = r'https://openai.com/api/pricing/'

-crawler = WebCrawler()
-crawler.warmup()
-
-from pydantic import BaseModel, Field
-
 class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

-result = crawler.run(
-    url=url,
-    word_count_threshold=1,
-    extraction_strategy= LLMExtractionStrategy(
-        # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), 
-        provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'), 
-        schema=OpenAIModelFee.model_json_schema(),
-        extraction_type="schema",
-        instruction="From the crawled content, extract all mentioned model names along with their "\
-            "fees for input and output tokens. Make sure not to miss anything in the entire content. "\
-            'One extracted model JSON format should look like this: '\
-            '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
-    ),
-    bypass_cache=True,
-)
+from crawl4ai import AsyncWebCrawler

-model_fees = json.loads(result.extracted_content)
+async def main():
+    # Use AsyncWebCrawler
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url=url,
+            word_count_threshold=1,
+            extraction_strategy= LLMExtractionStrategy(
+                # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
+                provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'),
+                schema=OpenAIModelFee.model_json_schema(),
+                extraction_type="schema",
+                instruction="From the crawled content, extract all mentioned model names along with their " \
+                            "fees for input and output tokens. Make sure not to miss anything in the entire content. " \
+                            'One extracted model JSON format should look like this: ' \
+                            '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
+            ),

-print(len(model_fees))
+        )
+        print("Success:", result.success)
+        model_fees = json.loads(result.extracted_content)
+        print(len(model_fees))

-with open(".data/data.json", "w", encoding="utf-8") as f:
-    f.write(result.extracted_content)
+        with open(".data/data.json", "w", encoding="utf-8") as f:
+            f.write(result.extracted_content)
+
+asyncio.run(main())