Merge branch '2025-JUN-1' into next-MAY

2025-07-09 09:41:03 +02:00
parent 026e96a2df 9332326457
commit 0ebce590f8
15 changed files with 664 additions and 46 deletions
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -1,43 +1,74 @@
-from crawl4ai import LLMConfig
-from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
 import asyncio
-import os
-import json
 from pydantic import BaseModel, Field
-
-url = "https://openai.com/api/pricing/"
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, BrowserConfig, CacheMode
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+from typing import Dict, Optional
+import os


 class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
-    output_fee: str = Field(
-        ..., description="Fee for output token for the OpenAI model."
+    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
+
+
+async def extract_structured_data_using_llm(
+    provider: str,
+    api_token: Optional[str] = None,
+    extra_headers: Optional[Dict[str, str]] = None,
+) -> None:
+    """Crawl the OpenAI pricing page and print the model fees an LLM extracts from it.
+
+    Args:
+        provider: Provider string handed to ``LLMConfig``, e.g. ``"openai/gpt-4o"``.
+        api_token: API token for the provider. When it is ``None`` the example is
+            skipped unless the provider is Ollama.
+        extra_headers: If non-empty, added to the strategy's ``extra_args`` under
+            the ``"extra_headers"`` key.
+    """
+    print(f"\n--- Extracting Structured Data with {provider} ---")
+
+    # Provider strings use the "<backend>/<model>" form (e.g. "openai/gpt-4o"), so match
+    # Ollama by prefix; an exact comparison would wrongly skip "ollama/<model>".
+    if api_token is None and not provider.startswith("ollama"):
+        print(f"API token is required for {provider}. Skipping this example.")
+        return
+
+    browser_config = BrowserConfig(headless=True)
+
+    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
+    if extra_headers:
+        extra_args["extra_headers"] = extra_headers
+
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        word_count_threshold=1,
+        page_timeout=80000,
+        extraction_strategy=LLMExtractionStrategy(
+            llm_config=LLMConfig(provider=provider, api_token=api_token),
+            schema=OpenAIModelFee.model_json_schema(),
+            extraction_type="schema",
+            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
+            Do not miss any models in the entire content.""",
+            extra_args=extra_args,
+        ),
    )

-async def main():
-    # Use AsyncWebCrawler
-    async with AsyncWebCrawler() as crawler:
+    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
-            url=url,
-            word_count_threshold=1,
-            extraction_strategy=LLMExtractionStrategy(
-                # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
-                llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
-                schema=OpenAIModelFee.model_json_schema(),
-                extraction_type="schema",
-                instruction="From the crawled content, extract all mentioned model names along with their "
-                "fees for input and output tokens. Make sure not to miss anything in the entire content. "
-                "One extracted model JSON format should look like this: "
-                '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }',
-            ),
+            url="https://openai.com/api/pricing/",
+            config=crawler_config,
        )
-        print("Success:", result.success)
-        model_fees = json.loads(result.extracted_content)
-        print(len(model_fees))
-
-        with open(".data/data.json", "w", encoding="utf-8") as f:
-            f.write(result.extracted_content)
+        # Report a failed crawl explicitly rather than printing its extracted_content.
+        if not result.success:
+            print(f"Crawl failed: {result.error_message}")
+            return
+        print(result.extracted_content)


-asyncio.run(main())
+if __name__ == "__main__":
+    asyncio.run(
+        extract_structured_data_using_llm(
+            provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
+        )
+    )