Push async version last changes for merge to main branch

2024-09-24 20:52:08 +08:00
parent d628bc4034
commit 4d48bd31ca
61 changed files with 6219 additions and 891 deletions
--- a/docs/md/examples/llm_extraction.md
+++ b/docs/md/examples/llm_extraction.md
@@ -1,6 +1,6 @@
-# LLM Extraction
+# LLM Extraction with AsyncWebCrawler

-Crawl4AI allows you to use Language Models (LLMs) to extract structured data or relevant content from web pages. Below are two examples demonstrating how to use LLMExtractionStrategy for different purposes.
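+Crawl4AI's `AsyncWebCrawler` allows you to use Large Language Models (LLMs) to extract structured data or relevant content from web pages asynchronously. Below are two basic examples demonstrating how to use `LLMExtractionStrategy` for different purposes with the `AsyncWebCrawler`, followed by an advanced example that combines it with JavaScript execution and a pattern for error handling with retries.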

 ## Example 1: Extract Structured Data

@@ -8,17 +8,10 @@ In this example, we use the `LLMExtractionStrategy` to extract structured data (

 ```python
 import os
-import time
-from crawl4ai.web_crawler import WebCrawler
-from crawl4ai.chunking_strategy import *
-from crawl4ai.extraction_strategy import *
-from crawl4ai.crawler_strategy import *
-
-url = r'https://openai.com/api/pricing/'
-
-crawler = WebCrawler()
-crawler.warmup()
-
+import json
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
 from pydantic import BaseModel, Field

 class OpenAIModelFee(BaseModel):
@@ -26,27 +19,33 @@ class OpenAIModelFee(BaseModel):
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

-result = crawler.run(
-    url=url,
-    word_count_threshold=1,
-    extraction_strategy= LLMExtractionStrategy(
-        provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), 
-        schema=OpenAIModelFee.model_json_schema(),
-        extraction_type="schema",
-        instruction="From the crawled content, extract all mentioned model names along with their "\
-            "fees for input and output tokens. Make sure not to miss anything in the entire content. "\
-            'One extracted model JSON format should look like this: '\
-            '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
-    ),
-    bypass_cache=True,
-)
+async def extract_openai_fees():
+    url = 'https://openai.com/api/pricing/'

-model_fees = json.loads(result.extracted_content)
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url=url,
+            word_count_threshold=1,
+            extraction_strategy=LLMExtractionStrategy(
+                provider="openai/gpt-4o",
+                api_token=os.getenv('OPENAI_API_KEY'),
+                schema=OpenAIModelFee.model_json_schema(),
+                extraction_type="schema",
+                instruction="From the crawled content, extract all mentioned model names along with their "
+                            "fees for input and output tokens. Make sure not to miss anything in the entire content. "
+                            'One extracted model JSON format should look like this: '
+                            '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
+            ),
+            bypass_cache=True,
+        )

-print(len(model_fees))
+    model_fees = json.loads(result.extracted_content)
+    print(f"Number of models extracted: {len(model_fees)}")

-with open(".data/data.json", "w", encoding="utf-8") as f:
-    f.write(result.extracted_content)
+    with open(".data/openai_fees.json", "w", encoding="utf-8") as f:
+        json.dump(model_fees, f, indent=2)
+
+asyncio.run(extract_openai_fees())
 ```

 ## Example 2: Extract Relevant Content
@@ -54,30 +53,80 @@ with open(".data/data.json", "w", encoding="utf-8") as f:
 In this example, we instruct the LLM to extract only content related to technology from the NBC News business page.

 ```python
-crawler = WebCrawler()
-crawler.warmup()
+import os
+import json
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.extraction_strategy import LLMExtractionStrategy

-result = crawler.run(
-        url="https://www.nbcnews.com/business",
-        extraction_strategy=LLMExtractionStrategy(
-            provider="openai/gpt-4o",
-            api_token=os.getenv('OPENAI_API_KEY'),
-            instruction="Extract only content related to technology"
-        ),
-    bypass_cache=True,
-    )
+async def extract_tech_content():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            extraction_strategy=LLMExtractionStrategy(
+                provider="openai/gpt-4o",
+                api_token=os.getenv('OPENAI_API_KEY'),
+                instruction="Extract only content related to technology"
+            ),
+            bypass_cache=True,
+        )

-model_fees = json.loads(result.extracted_content)
+    tech_content = json.loads(result.extracted_content)
+    print(f"Number of tech-related items extracted: {len(tech_content)}")

-print(len(model_fees))
+    with open(".data/tech_content.json", "w", encoding="utf-8") as f:
+        json.dump(tech_content, f, indent=2)

-with open(".data/data.json", "w", encoding="utf-8") as f:
-    f.write(result.extracted_content)
+asyncio.run(extract_tech_content())
+```
+
+## Advanced Usage: Combining JS Execution with LLM Extraction
+
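+This example demonstrates how to combine JavaScript execution with LLM extraction to handle dynamic content. The snippet reuses the imports from Example 2 (`os`, `json`, `asyncio`, `AsyncWebCrawler`, and `LLMExtractionStrategy`):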
+
+```python
+async def extract_dynamic_content():
+    js_code = """
+    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
+    if (loadMoreButton) {
+        loadMoreButton.click();
+        await new Promise(resolve => setTimeout(resolve, 2000));
+    }
+    """
+
+    wait_for = """
+    () => {
+        const articles = document.querySelectorAll('article.tease-card');
+        return articles.length > 10;
+    }
+    """
+
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            js_code=js_code,
+            wait_for=wait_for,
+            css_selector="article.tease-card",
+            extraction_strategy=LLMExtractionStrategy(
+                provider="openai/gpt-4o",
+                api_token=os.getenv('OPENAI_API_KEY'),
+                instruction="Summarize each article, focusing on technology-related content"
+            ),
+            bypass_cache=True,
+        )
+
+    summaries = json.loads(result.extracted_content)
+    print(f"Number of summarized articles: {len(summaries)}")
+
+    with open(".data/tech_summaries.json", "w", encoding="utf-8") as f:
+        json.dump(summaries, f, indent=2)
+
+asyncio.run(extract_dynamic_content())
 ```

 ## Customizing LLM Provider

-Under the hood, Crawl4AI uses the `litellm` library, which allows you to use any LLM provider you want. Just pass the correct model name and API token.
+Crawl4AI uses the `litellm` library under the hood, which allows you to use any LLM provider you want. Just pass the correct model name and API token:

 ```python
 extraction_strategy=LLMExtractionStrategy(
@@ -88,3 +137,43 @@ extraction_strategy=LLMExtractionStrategy(
 ```

 This flexibility allows you to integrate with various LLM providers and tailor the extraction process to your specific needs.
+
+## Error Handling and Retries
+
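+When working with external LLM APIs, it's important to handle potential errors and implement retry logic. Here's an example of how you might do this (in addition to the imports shown, the snippet needs `os`, `json`, `AsyncWebCrawler`, and `LLMExtractionStrategy`, imported as in the earlier examples):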
+
+```python
+import asyncio
+from tenacity import retry, stop_after_attempt, wait_exponential
+
+class LLMExtractionError(Exception):
+    pass
+
+@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+async def extract_with_retry(crawler, url, extraction_strategy):
+    try:
+        result = await crawler.arun(url=url, extraction_strategy=extraction_strategy, bypass_cache=True)
+        return json.loads(result.extracted_content)
+    except Exception as e:
+        raise LLMExtractionError(f"Failed to extract content: {str(e)}")
+
+async def main():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        try:
+            content = await extract_with_retry(
+                crawler,
+                "https://www.example.com",
+                LLMExtractionStrategy(
+                    provider="openai/gpt-4o",
+                    api_token=os.getenv('OPENAI_API_KEY'),
+                    instruction="Extract and summarize main points"
+                )
+            )
+            print("Extracted content:", content)
+        except LLMExtractionError as e:
+            print(f"Extraction failed after retries: {e}")
+
+asyncio.run(main())
+```
+
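+This example uses the third-party `tenacity` library (`pip install tenacity`) to implement a retry mechanism with exponential backoff: up to 3 attempts, with the wait between attempts clamped to between 4 and 10 seconds. This can help handle temporary failures or rate limiting from the LLM API. Note that, by default, `tenacity` raises `tenacity.RetryError` once all attempts are exhausted rather than the last exception; pass `reraise=True` to `@retry` (or catch `tenacity.RetryError`) so that the `except LLMExtractionError` handler in `main()` actually catches the final failure.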