Fix #340 example llm_extraction (#358)

@Haopeng138 Thank you so much. They are still part of the library. I forgot to update them since I moved the asynchronous versions years ago. I really appreciate it. I have to say that I feel weak in the documentation. That's why I spent a lot of time on it last week. Now, when you mention some of the things in the example folder, I realize I forgot about the example folder. I'll try to update it more. If you find anything else, please help and support. Thank you. I will add your name to contributor name as well.
This commit is contained in:
Haopeng138
2024-12-24 12:56:07 +01:00
committed by GitHub
parent ed7bc1909c
commit bacbeb3ed4

View File

@@ -1,23 +1,21 @@
import os
import time
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import * from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import * from crawl4ai.crawler_strategy import *
import asyncio
from pydantic import BaseModel, Field
url = r'https://openai.com/api/pricing/' url = r'https://openai.com/api/pricing/'
crawler = WebCrawler()
crawler.warmup()
from pydantic import BaseModel, Field
class OpenAIModelFee(BaseModel): class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.") model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
result = crawler.run( from crawl4ai import AsyncWebCrawler
async def main():
# Use AsyncWebCrawler
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=url, url=url,
word_count_threshold=1, word_count_threshold=1,
extraction_strategy= LLMExtractionStrategy( extraction_strategy= LLMExtractionStrategy(
@@ -30,12 +28,13 @@ result = crawler.run(
'One extracted model JSON format should look like this: ' \ 'One extracted model JSON format should look like this: ' \
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }' '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
), ),
bypass_cache=True,
) )
print("Success:", result.success)
model_fees = json.loads(result.extracted_content) model_fees = json.loads(result.extracted_content)
print(len(model_fees)) print(len(model_fees))
with open(".data/data.json", "w", encoding="utf-8") as f: with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content) f.write(result.extracted_content)
asyncio.run(main())