From f51b0780423ceadd626f2b747bbf291cd0587dfa Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 24 Jun 2024 22:54:29 +0800 Subject: [PATCH] Update readme example. --- README.md | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index ebea270c..39f7cc9c 100644 --- a/README.md +++ b/README.md @@ -60,19 +60,30 @@ Crawl all OpenAI models and their fees from the official page. import os from crawl4ai import WebCrawler from crawl4ai.extraction_strategy import LLMExtractionStrategy +from pydantic import BaseModel, Field + +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") url = 'https://openai.com/api/pricing/' crawler = WebCrawler() crawler.warmup() result = crawler.run( - url=url, - extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4", - api_token=os.getenv('OPENAI_API_KEY'), - instruction="Extract all model names and their fees for input and output tokens." - ), -) + url=url, + word_count_threshold=1, + extraction_strategy= LLMExtractionStrategy( + provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), + schema=OpenAIModelFee.schema(), + extraction_type="schema", + instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. + Do not miss any models in the entire content. One extracted model JSON format should look like this: + {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""" + ), + bypass_cache=True, + ) print(result.extracted_content) ```