# docs/examples/llm_extraction_openai_pricing.py
#
# Example: crawl the OpenAI pricing page and use an LLM extraction strategy to
# pull per-model token fees into a JSON schema, then save the result to disk.
#
# History (patch bacbeb3e, PR #358, fixes #340, by Haopeng138): this example
# still used the long-removed synchronous ``WebCrawler``; it is updated here to
# the asynchronous ``AsyncWebCrawler`` API.

import asyncio
import json  # explicit: json.loads() below must not rely on the star imports
import os    # explicit: os.getenv() below must not rely on the star imports

from pydantic import BaseModel, Field

from crawl4ai import AsyncWebCrawler
from crawl4ai.crawler_strategy import *
from crawl4ai.extraction_strategy import *

url = r'https://openai.com/api/pricing/'


class OpenAIModelFee(BaseModel):
    # Schema for one pricing row extracted by the LLM.
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")


async def main():
    """Crawl the pricing page, extract per-model fees, and save them as JSON.

    Requires the ``GROQ_API_KEY`` environment variable (or switch the
    commented-out provider line to OpenAI and set ``OPENAI_API_KEY``).
    Writes the raw extracted JSON to ``.data/data.json``.
    """
    # Use AsyncWebCrawler
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
                provider="groq/llama-3.1-70b-versatile",
                api_token=os.getenv('GROQ_API_KEY'),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction=(
                    "From the crawled content, extract all mentioned model names along with their "
                    "fees for input and output tokens. Make sure not to miss anything in the entire content. "
                    'One extracted model JSON format should look like this: '
                    '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
                ),
            ),
        )

    print("Success:", result.success)
    model_fees = json.loads(result.extracted_content)
    print(len(model_fees))

    # Create the output directory first: open(..., "w") raises
    # FileNotFoundError when ``.data/`` does not already exist.
    os.makedirs(".data", exist_ok=True)
    with open(".data/data.json", "w", encoding="utf-8") as f:
        f.write(result.extracted_content)


if __name__ == "__main__":
    asyncio.run(main())