Push async version last changes for merge to main branch
This commit is contained in:
@@ -1,100 +1,110 @@
|
||||
# Hooks & Auth
|
||||
# Hooks & Auth for AsyncWebCrawler
|
||||
|
||||
Crawl4AI allows you to customize the behavior of the web crawler using hooks. Hooks are functions that are called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This example demonstrates how to use various hooks to customize the crawling process.
|
||||
Crawl4AI's AsyncWebCrawler allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions that are called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This example demonstrates how to use various hooks to customize the asynchronous crawling process.
|
||||
|
||||
## Example: Using Crawler Hooks
|
||||
## Example: Using Crawler Hooks with AsyncWebCrawler
|
||||
|
||||
Let's see how we can customize the crawler using hooks! In this example, we'll:
|
||||
Let's see how we can customize the AsyncWebCrawler using hooks! In this example, we'll:
|
||||
|
||||
1. Maximize the browser window and log in to a website when the driver is created.
|
||||
2. Add a custom header before fetching the URL.
|
||||
3. Log the current URL after fetching it.
|
||||
4. Log the length of the HTML before returning it.
|
||||
1. Configure the browser when it's created.
|
||||
2. Add custom headers before navigating to the URL.
|
||||
3. Log the current URL after navigation.
|
||||
4. Perform actions after JavaScript execution.
|
||||
5. Log the length of the HTML before returning it.
|
||||
|
||||
### Hook Definitions
|
||||
|
||||
```python
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.crawler_strategy import *
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
from playwright.async_api import Page, Browser
|
||||
|
||||
def on_driver_created(driver):
|
||||
print("[HOOK] on_driver_created")
|
||||
# Example customization: maximize the window
|
||||
driver.maximize_window()
|
||||
async def on_browser_created(browser: Browser):
|
||||
print("[HOOK] on_browser_created")
|
||||
# Example customization: set browser viewport size
|
||||
context = await browser.new_context(viewport={'width': 1920, 'height': 1080})
|
||||
page = await context.new_page()
|
||||
|
||||
# Example customization: logging in to a hypothetical website
|
||||
driver.get('https://example.com/login')
|
||||
await page.goto('https://example.com/login')
|
||||
await page.fill('input[name="username"]', 'testuser')
|
||||
await page.fill('input[name="password"]', 'password123')
|
||||
await page.click('button[type="submit"]')
|
||||
await page.wait_for_selector('#welcome')
|
||||
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.NAME, 'username'))
|
||||
)
|
||||
driver.find_element(By.NAME, 'username').send_keys('testuser')
|
||||
driver.find_element(By.NAME, 'password').send_keys('password123')
|
||||
driver.find_element(By.NAME, 'login').click()
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.ID, 'welcome'))
|
||||
)
|
||||
# Add a custom cookie
|
||||
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
|
||||
return driver
|
||||
await context.add_cookies([{'name': 'test_cookie', 'value': 'cookie_value', 'url': 'https://example.com'}])
|
||||
|
||||
await page.close()
|
||||
await context.close()
|
||||
|
||||
def before_get_url(driver):
|
||||
print("[HOOK] before_get_url")
|
||||
# Example customization: add a custom header
|
||||
# Enable Network domain for sending headers
|
||||
driver.execute_cdp_cmd('Network.enable', {})
|
||||
# Add a custom header
|
||||
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
|
||||
return driver
|
||||
async def before_goto(page: Page):
|
||||
print("[HOOK] before_goto")
|
||||
# Example customization: add custom headers
|
||||
await page.set_extra_http_headers({'X-Test-Header': 'test'})
|
||||
|
||||
def after_get_url(driver):
|
||||
print("[HOOK] after_get_url")
|
||||
async def after_goto(page: Page):
|
||||
print("[HOOK] after_goto")
|
||||
# Example customization: log the URL
|
||||
print(driver.current_url)
|
||||
return driver
|
||||
print(f"Current URL: {page.url}")
|
||||
|
||||
def before_return_html(driver, html):
|
||||
async def on_execution_started(page: Page):
|
||||
print("[HOOK] on_execution_started")
|
||||
# Example customization: perform actions after JS execution
|
||||
await page.evaluate("console.log('Custom JS executed')")
|
||||
|
||||
async def before_return_html(page: Page, html: str):
|
||||
print("[HOOK] before_return_html")
|
||||
# Example customization: log the HTML
|
||||
print(len(html))
|
||||
return driver
|
||||
# Example customization: log the HTML length
|
||||
print(f"HTML length: {len(html)}")
|
||||
return page
|
||||
```
|
||||
|
||||
### Using the Hooks with the WebCrawler
|
||||
### Using the Hooks with the AsyncWebCrawler
|
||||
|
||||
```python
|
||||
print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||
crawler_strategy.set_hook('on_driver_created', on_driver_created)
|
||||
crawler_strategy.set_hook('before_get_url', before_get_url)
|
||||
crawler_strategy.set_hook('after_get_url', after_get_url)
|
||||
crawler_strategy.set_hook('before_return_html', before_return_html)
|
||||
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||
crawler.warmup()
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
result = crawler.run(url="https://example.com")
|
||||
async def main():
|
||||
print("\n🔗 Using Crawler Hooks: Let's see how we can customize the AsyncWebCrawler using hooks!")
|
||||
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True)
|
||||
crawler_strategy.set_hook('on_browser_created', on_browser_created)
|
||||
crawler_strategy.set_hook('before_goto', before_goto)
|
||||
crawler_strategy.set_hook('after_goto', after_goto)
|
||||
crawler_strategy.set_hook('on_execution_started', on_execution_started)
|
||||
crawler_strategy.set_hook('before_return_html', before_return_html)
|
||||
|
||||
async with AsyncWebCrawler(verbose=True, crawler_strategy=crawler_strategy) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
js_code="window.scrollTo(0, document.body.scrollHeight);",
|
||||
wait_for="footer"
|
||||
)
|
||||
|
||||
print("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||
print(result)
|
||||
print("📦 Crawler Hooks result:")
|
||||
print(result)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Explanation
|
||||
|
||||
- `on_driver_created`: This hook is called when the Selenium driver is created. In this example, it maximizes the window, logs in to a website, and adds a custom cookie.
|
||||
- `before_get_url`: This hook is called right before Selenium fetches the URL. In this example, it adds a custom HTTP header.
|
||||
- `after_get_url`: This hook is called after Selenium fetches the URL. In this example, it logs the current URL.
|
||||
- `before_return_html`: This hook is called before returning the HTML content. In this example, it logs the length of the HTML content.
|
||||
- `on_browser_created`: This hook is called when the Playwright browser is created. It sets up the browser context, logs in to a website, and adds a custom cookie.
|
||||
- `before_goto`: This hook is called right before Playwright navigates to the URL. It adds custom HTTP headers.
|
||||
- `after_goto`: This hook is called after Playwright navigates to the URL. It logs the current URL.
|
||||
- `on_execution_started`: This hook is called after any custom JavaScript is executed. It performs additional JavaScript actions.
|
||||
- `before_return_html`: This hook is called before returning the HTML content. It logs the length of the HTML content.
|
||||
|
||||
### Additional Ideas
|
||||
|
||||
- **Add custom headers to requests**: You can add custom headers to the requests using the `before_get_url` hook.
|
||||
- **Perform safety checks**: Use the hooks to perform safety checks before the crawling process starts.
|
||||
- **Modify the HTML content**: Use the `before_return_html` hook to modify the HTML content before it is returned.
|
||||
- **Log additional information**: Use the hooks to log additional information for debugging or monitoring purposes.
|
||||
- **Handling authentication**: Use the `on_browser_created` hook to handle login processes or set authentication tokens.
|
||||
- **Dynamic header modification**: Modify headers based on the target URL or other conditions in the `before_goto` hook.
|
||||
- **Content verification**: Use the `after_goto` hook to verify that the expected content is present on the page.
|
||||
- **Custom JavaScript injection**: Inject and execute custom JavaScript using the `on_execution_started` hook.
|
||||
- **Content preprocessing**: Modify or analyze the HTML content in the `before_return_html` hook before it's returned.
|
||||
|
||||
By using these hooks, you can customize the behavior of the crawler to suit your specific needs.
|
||||
By using these hooks, you can customize the behavior of the AsyncWebCrawler to suit your specific needs, including handling authentication, modifying requests, and preprocessing content.
|
||||
@@ -8,6 +8,10 @@ Welcome to the examples section of Crawl4AI documentation! In this section, you
|
||||
|
||||
This example demonstrates how to use Crawl4AI to extract information using Large Language Models (LLMs). You will learn how to configure the `LLMExtractionStrategy` to get structured data from web pages.
|
||||
|
||||
### [JSON CSS Extraction](json_css_extraction.md)
|
||||
|
||||
This example demonstrates how to use Crawl4AI to extract structured data without using LLM, and just focusing on page structure. You will learn how to use the `JsonCssExtractionStrategy` to extract data using CSS selectors.
|
||||
|
||||
### [JS Execution & CSS Filtering](js_execution_css_filtering.md)
|
||||
|
||||
Learn how to execute custom JavaScript code and filter data using CSS selectors. This example shows how to perform complex web interactions and extract specific content from web pages.
|
||||
|
||||
@@ -1,44 +1,104 @@
|
||||
# JS Execution & CSS Filtering
|
||||
# JS Execution & CSS Filtering with AsyncWebCrawler
|
||||
|
||||
In this example, we'll demonstrate how to use Crawl4AI to execute JavaScript, filter data with CSS selectors, and use a cosine similarity strategy to extract relevant content. This approach is particularly useful when you need to interact with dynamic content on web pages, such as clicking "Load More" buttons.
|
||||
In this example, we'll demonstrate how to use Crawl4AI's AsyncWebCrawler to execute JavaScript, filter data with CSS selectors, and use a cosine similarity strategy to extract relevant content. This approach is particularly useful when you need to interact with dynamic content on web pages, such as clicking "Load More" buttons.
|
||||
|
||||
## Example: Extracting Structured Data
|
||||
## Example: Extracting Structured Data Asynchronously
|
||||
|
||||
```python
|
||||
# Import necessary modules
|
||||
from crawl4ai import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
from crawl4ai.crawler_strategy import *
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.chunking_strategy import RegexChunking
|
||||
from crawl4ai.extraction_strategy import CosineStrategy
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Define the JavaScript code to click the "Load More" button
|
||||
js_code = ["""
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""]
|
||||
async def main():
|
||||
# Define the JavaScript code to click the "Load More" button
|
||||
js_code = """
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
if (loadMoreButton) {
|
||||
loadMoreButton.click();
|
||||
// Wait for new content to load
|
||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
}
|
||||
"""
|
||||
|
||||
crawler = WebCrawler(verbose=True)
|
||||
crawler.warmup()
|
||||
# Run the crawler with keyword filtering and CSS selector
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js=js_code,
|
||||
css_selector="p",
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="technology",
|
||||
),
|
||||
)
|
||||
# Define a wait_for function to ensure content is loaded
|
||||
wait_for = """
|
||||
() => {
|
||||
const articles = document.querySelectorAll('article.tease-card');
|
||||
return articles.length > 10;
|
||||
}
|
||||
"""
|
||||
|
||||
# Display the extracted result
|
||||
print(result)
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
# Run the crawler with keyword filtering and CSS selector
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js_code=js_code,
|
||||
wait_for=wait_for,
|
||||
css_selector="article.tease-card",
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="technology",
|
||||
),
|
||||
chunking_strategy=RegexChunking(),
|
||||
)
|
||||
|
||||
# Display the extracted result
|
||||
print(result.extracted_content)
|
||||
|
||||
# Run the async function
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Explanation
|
||||
|
||||
1. **JavaScript Execution**: The `js_code` variable contains JavaScript code that simulates clicking a "Load More" button. This is useful for loading additional content dynamically.
|
||||
2. **CSS Selector**: The `css_selector="p"` parameter ensures that only paragraph (`<p>`) tags are extracted from the web page.
|
||||
3. **Extraction Strategy**: The `CosineStrategy` is used with a semantic filter for "technology" to extract relevant content based on cosine similarity.
|
||||
1. **Asynchronous Execution**: We use `AsyncWebCrawler` with async/await syntax for non-blocking execution.
|
||||
|
||||
2. **JavaScript Execution**: The `js_code` variable contains JavaScript code that simulates clicking a "Load More" button and waits for new content to load.
|
||||
|
||||
3. **Wait Condition**: The `wait_for` function ensures that the page has loaded more than 10 articles before proceeding with the extraction.
|
||||
|
||||
4. **CSS Selector**: The `css_selector="article.tease-card"` parameter ensures that only article cards are extracted from the web page.
|
||||
|
||||
5. **Extraction Strategy**: The `CosineStrategy` is used with a semantic filter for "technology" to extract relevant content based on cosine similarity.
|
||||
|
||||
6. **Chunking Strategy**: We use `RegexChunking()` to split the content into manageable chunks for processing.
|
||||
|
||||
## Advanced Usage: Custom Session and Multiple Requests
|
||||
|
||||
For more complex scenarios where you need to maintain state across multiple requests or execute additional JavaScript after the initial page load, you can use a custom session:
|
||||
|
||||
```python
|
||||
async def advanced_crawl():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
# Initial crawl with custom session
|
||||
result1 = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js_code=js_code,
|
||||
wait_for=wait_for,
|
||||
css_selector="article.tease-card",
|
||||
session_id="business_session"
|
||||
)
|
||||
|
||||
# Execute additional JavaScript in the same session
|
||||
result2 = await crawler.crawler_strategy.execute_js(
|
||||
session_id="business_session",
|
||||
js_code="window.scrollTo(0, document.body.scrollHeight);",
|
||||
wait_for_js="() => window.innerHeight + window.scrollY >= document.body.offsetHeight"
|
||||
)
|
||||
|
||||
# Process results
|
||||
print("Initial crawl result:", result1.extracted_content)
|
||||
print("Additional JS execution result:", result2.html)
|
||||
|
||||
asyncio.run(advanced_crawl())
|
||||
```
|
||||
|
||||
This advanced example demonstrates how to:
|
||||
1. Use a custom session to maintain state across requests.
|
||||
2. Execute additional JavaScript after the initial page load.
|
||||
3. Wait for specific conditions using JavaScript functions.
|
||||
|
||||
## Try It Yourself
|
||||
|
||||
This example demonstrates the power and flexibility of Crawl4AI in handling complex web interactions and extracting meaningful data. You can customize the JavaScript code, CSS selectors, and extraction strategies to suit your specific requirements.
|
||||
These examples demonstrate the power and flexibility of Crawl4AI's AsyncWebCrawler in handling complex web interactions and extracting meaningful data asynchronously. You can customize the JavaScript code, CSS selectors, extraction strategies, and waiting conditions to suit your specific requirements.
|
||||
142
docs/md/examples/json_css_extraction.md
Normal file
142
docs/md/examples/json_css_extraction.md
Normal file
@@ -0,0 +1,142 @@
|
||||
# JSON CSS Extraction Strategy with AsyncWebCrawler
|
||||
|
||||
The `JsonCssExtractionStrategy` is a powerful feature of Crawl4AI that allows you to extract structured data from web pages using CSS selectors. This method is particularly useful when you need to extract specific data points from a consistent HTML structure, such as tables or repeated elements. Here's how to use it with the AsyncWebCrawler.
|
||||
|
||||
## Overview
|
||||
|
||||
The `JsonCssExtractionStrategy` works by defining a schema that specifies:
|
||||
1. A base CSS selector for the repeating elements
|
||||
2. Fields to extract from each element, each with its own CSS selector
|
||||
|
||||
This strategy is fast and efficient, as it doesn't rely on external services like LLMs for extraction.
|
||||
|
||||
## Example: Extracting Cryptocurrency Prices from Coinbase
|
||||
|
||||
Let's look at an example that extracts cryptocurrency prices from the Coinbase explore page.
|
||||
|
||||
```python
|
||||
import json
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
async def extract_structured_data_using_css_extractor():
|
||||
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
|
||||
|
||||
# Define the extraction schema
|
||||
schema = {
|
||||
"name": "Coinbase Crypto Prices",
|
||||
"baseSelector": ".cds-tableRow-t45thuk",
|
||||
"fields": [
|
||||
{
|
||||
"name": "crypto",
|
||||
"selector": "td:nth-child(1) h2",
|
||||
"type": "text",
|
||||
},
|
||||
{
|
||||
"name": "symbol",
|
||||
"selector": "td:nth-child(1) p",
|
||||
"type": "text",
|
||||
},
|
||||
{
|
||||
"name": "price",
|
||||
"selector": "td:nth-child(2)",
|
||||
"type": "text",
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
# Create the extraction strategy
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||
|
||||
# Use the AsyncWebCrawler with the extraction strategy
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.coinbase.com/explore",
|
||||
extraction_strategy=extraction_strategy,
|
||||
bypass_cache=True,
|
||||
)
|
||||
|
||||
assert result.success, "Failed to crawl the page"
|
||||
|
||||
# Parse the extracted content
|
||||
crypto_prices = json.loads(result.extracted_content)
|
||||
print(f"Successfully extracted {len(crypto_prices)} cryptocurrency prices")
|
||||
print(json.dumps(crypto_prices[0], indent=2))
|
||||
|
||||
return crypto_prices
|
||||
|
||||
# Run the async function
|
||||
asyncio.run(extract_structured_data_using_css_extractor())
|
||||
```
|
||||
|
||||
## Explanation of the Schema
|
||||
|
||||
The schema defines how to extract the data:
|
||||
|
||||
- `name`: A descriptive name for the extraction task.
|
||||
- `baseSelector`: The CSS selector for the repeating elements (in this case, table rows).
|
||||
- `fields`: An array of fields to extract from each element:
|
||||
- `name`: The name to give the extracted data.
|
||||
- `selector`: The CSS selector to find the specific data within the base element.
|
||||
- `type`: The type of data to extract (usually "text" for textual content).
|
||||
|
||||
## Advantages of JsonCssExtractionStrategy
|
||||
|
||||
1. **Speed**: CSS selectors are fast to execute, making this method efficient for large datasets.
|
||||
2. **Precision**: You can target exactly the elements you need.
|
||||
3. **Structured Output**: The result is already structured as JSON, ready for further processing.
|
||||
4. **No External Dependencies**: Unlike LLM-based strategies, this doesn't require any API calls to external services.
|
||||
|
||||
## Tips for Using JsonCssExtractionStrategy
|
||||
|
||||
1. **Inspect the Page**: Use browser developer tools to identify the correct CSS selectors.
|
||||
2. **Test Selectors**: Verify your selectors in the browser console before using them in the script.
|
||||
3. **Handle Dynamic Content**: If the page uses JavaScript to load content, you may need to combine this with JS execution (see the Advanced Usage section).
|
||||
4. **Error Handling**: Always check the `result.success` flag and handle potential failures.
|
||||
|
||||
## Advanced Usage: Combining with JavaScript Execution
|
||||
|
||||
For pages that load data dynamically, you can combine the `JsonCssExtractionStrategy` with JavaScript execution:
|
||||
|
||||
```python
|
||||
async def extract_dynamic_structured_data():
|
||||
schema = {
|
||||
"name": "Dynamic Crypto Prices",
|
||||
"baseSelector": ".crypto-row",
|
||||
"fields": [
|
||||
{"name": "name", "selector": ".crypto-name", "type": "text"},
|
||||
{"name": "price", "selector": ".crypto-price", "type": "text"},
|
||||
]
|
||||
}
|
||||
|
||||
js_code = """
|
||||
window.scrollTo(0, document.body.scrollHeight);
|
||||
await new Promise(resolve => setTimeout(resolve, 2000)); // Wait for 2 seconds
|
||||
"""
|
||||
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com/crypto-prices",
|
||||
extraction_strategy=extraction_strategy,
|
||||
js_code=js_code,
|
||||
wait_for=".crypto-row:nth-child(20)", # Wait for 20 rows to load
|
||||
bypass_cache=True,
|
||||
)
|
||||
|
||||
crypto_data = json.loads(result.extracted_content)
|
||||
print(f"Extracted {len(crypto_data)} cryptocurrency entries")
|
||||
|
||||
asyncio.run(extract_dynamic_structured_data())
|
||||
```
|
||||
|
||||
This advanced example demonstrates how to:
|
||||
1. Execute JavaScript to trigger dynamic content loading.
|
||||
2. Wait for a specific condition (20 rows loaded) before extraction.
|
||||
3. Extract data from the dynamically loaded content.
|
||||
|
||||
By mastering the `JsonCssExtractionStrategy`, you can efficiently extract structured data from a wide variety of web pages, making it a valuable tool in your web scraping toolkit.
|
||||
|
||||
For more details on schema definitions and advanced extraction strategies, check out the[Advanced JsonCssExtraction](../full_details/advanced_jsoncss_extraction.md).
|
||||
@@ -1,6 +1,6 @@
|
||||
# LLM Extraction
|
||||
# LLM Extraction with AsyncWebCrawler
|
||||
|
||||
Crawl4AI allows you to use Language Models (LLMs) to extract structured data or relevant content from web pages. Below are two examples demonstrating how to use LLMExtractionStrategy for different purposes.
|
||||
Crawl4AI's AsyncWebCrawler allows you to use Language Models (LLMs) to extract structured data or relevant content from web pages asynchronously. Below are two examples demonstrating how to use `LLMExtractionStrategy` for different purposes with the AsyncWebCrawler.
|
||||
|
||||
## Example 1: Extract Structured Data
|
||||
|
||||
@@ -8,17 +8,10 @@ In this example, we use the `LLMExtractionStrategy` to extract structured data (
|
||||
|
||||
```python
|
||||
import os
|
||||
import time
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
from crawl4ai.crawler_strategy import *
|
||||
|
||||
url = r'https://openai.com/api/pricing/'
|
||||
|
||||
crawler = WebCrawler()
|
||||
crawler.warmup()
|
||||
|
||||
import json
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
class OpenAIModelFee(BaseModel):
|
||||
@@ -26,27 +19,33 @@ class OpenAIModelFee(BaseModel):
|
||||
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
|
||||
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
|
||||
|
||||
result = crawler.run(
|
||||
url=url,
|
||||
word_count_threshold=1,
|
||||
extraction_strategy= LLMExtractionStrategy(
|
||||
provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="From the crawled content, extract all mentioned model names along with their "\
|
||||
"fees for input and output tokens. Make sure not to miss anything in the entire content. "\
|
||||
'One extracted model JSON format should look like this: '\
|
||||
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
|
||||
),
|
||||
bypass_cache=True,
|
||||
)
|
||||
async def extract_openai_fees():
|
||||
url = 'https://openai.com/api/pricing/'
|
||||
|
||||
model_fees = json.loads(result.extracted_content)
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
word_count_threshold=1,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="From the crawled content, extract all mentioned model names along with their "
|
||||
"fees for input and output tokens. Make sure not to miss anything in the entire content. "
|
||||
'One extracted model JSON format should look like this: '
|
||||
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
|
||||
),
|
||||
bypass_cache=True,
|
||||
)
|
||||
|
||||
print(len(model_fees))
|
||||
model_fees = json.loads(result.extracted_content)
|
||||
print(f"Number of models extracted: {len(model_fees)}")
|
||||
|
||||
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||
f.write(result.extracted_content)
|
||||
with open(".data/openai_fees.json", "w", encoding="utf-8") as f:
|
||||
json.dump(model_fees, f, indent=2)
|
||||
|
||||
asyncio.run(extract_openai_fees())
|
||||
```
|
||||
|
||||
## Example 2: Extract Relevant Content
|
||||
@@ -54,30 +53,80 @@ with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||
In this example, we instruct the LLM to extract only content related to technology from the NBC News business page.
|
||||
|
||||
```python
|
||||
crawler = WebCrawler()
|
||||
crawler.warmup()
|
||||
import os
|
||||
import json
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="Extract only content related to technology"
|
||||
),
|
||||
bypass_cache=True,
|
||||
)
|
||||
async def extract_tech_content():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="Extract only content related to technology"
|
||||
),
|
||||
bypass_cache=True,
|
||||
)
|
||||
|
||||
model_fees = json.loads(result.extracted_content)
|
||||
tech_content = json.loads(result.extracted_content)
|
||||
print(f"Number of tech-related items extracted: {len(tech_content)}")
|
||||
|
||||
print(len(model_fees))
|
||||
with open(".data/tech_content.json", "w", encoding="utf-8") as f:
|
||||
json.dump(tech_content, f, indent=2)
|
||||
|
||||
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||
f.write(result.extracted_content)
|
||||
asyncio.run(extract_tech_content())
|
||||
```
|
||||
|
||||
## Advanced Usage: Combining JS Execution with LLM Extraction
|
||||
|
||||
This example demonstrates how to combine JavaScript execution with LLM extraction to handle dynamic content:
|
||||
|
||||
```python
|
||||
async def extract_dynamic_content():
|
||||
js_code = """
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
if (loadMoreButton) {
|
||||
loadMoreButton.click();
|
||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
}
|
||||
"""
|
||||
|
||||
wait_for = """
|
||||
() => {
|
||||
const articles = document.querySelectorAll('article.tease-card');
|
||||
return articles.length > 10;
|
||||
}
|
||||
"""
|
||||
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js_code=js_code,
|
||||
wait_for=wait_for,
|
||||
css_selector="article.tease-card",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="Summarize each article, focusing on technology-related content"
|
||||
),
|
||||
bypass_cache=True,
|
||||
)
|
||||
|
||||
summaries = json.loads(result.extracted_content)
|
||||
print(f"Number of summarized articles: {len(summaries)}")
|
||||
|
||||
with open(".data/tech_summaries.json", "w", encoding="utf-8") as f:
|
||||
json.dump(summaries, f, indent=2)
|
||||
|
||||
asyncio.run(extract_dynamic_content())
|
||||
```
|
||||
|
||||
## Customizing LLM Provider
|
||||
|
||||
Under the hood, Crawl4AI uses the `litellm` library, which allows you to use any LLM provider you want. Just pass the correct model name and API token.
|
||||
Crawl4AI uses the `litellm` library under the hood, which allows you to use any LLM provider you want. Just pass the correct model name and API token:
|
||||
|
||||
```python
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
@@ -88,3 +137,43 @@ extraction_strategy=LLMExtractionStrategy(
|
||||
```
|
||||
|
||||
This flexibility allows you to integrate with various LLM providers and tailor the extraction process to your specific needs.
|
||||
|
||||
## Error Handling and Retries
|
||||
|
||||
When working with external LLM APIs, it's important to handle potential errors and implement retry logic. Here's an example of how you might do this:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
|
||||
class LLMExtractionError(Exception):
|
||||
pass
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
||||
async def extract_with_retry(crawler, url, extraction_strategy):
|
||||
try:
|
||||
result = await crawler.arun(url=url, extraction_strategy=extraction_strategy, bypass_cache=True)
|
||||
return json.loads(result.extracted_content)
|
||||
except Exception as e:
|
||||
raise LLMExtractionError(f"Failed to extract content: {str(e)}")
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
try:
|
||||
content = await extract_with_retry(
|
||||
crawler,
|
||||
"https://www.example.com",
|
||||
LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="Extract and summarize main points"
|
||||
)
|
||||
)
|
||||
print("Extracted content:", content)
|
||||
except LLMExtractionError as e:
|
||||
print(f"Extraction failed after retries: {e}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
This example uses the `tenacity` library to implement a retry mechanism with exponential backoff, which can help handle temporary failures or rate limiting from the LLM API.
|
||||
@@ -1,33 +1,32 @@
|
||||
## Research Assistant Example
|
||||
# Research Assistant Example with AsyncWebCrawler
|
||||
|
||||
This example demonstrates how to build a research assistant using `Chainlit` and `Crawl4AI`. The assistant will be capable of crawling web pages for information and answering questions based on the crawled content. Additionally, it integrates speech-to-text functionality for audio inputs.
|
||||
This example demonstrates how to build an advanced research assistant using `Chainlit`, `Crawl4AI`'s `AsyncWebCrawler`, and various AI services. The assistant can crawl web pages asynchronously, answer questions based on the crawled content, and handle audio inputs.
|
||||
|
||||
### Step-by-Step Guide
|
||||
## Step-by-Step Guide
|
||||
|
||||
1. **Install Required Packages**
|
||||
|
||||
Ensure you have the necessary packages installed. You need `chainlit`, `groq`, `requests`, and `openai`.
|
||||
Ensure you have the necessary packages installed:
|
||||
|
||||
```bash
|
||||
pip install chainlit groq requests openai
|
||||
pip install chainlit groq openai crawl4ai
|
||||
```
|
||||
|
||||
2. **Import Libraries**
|
||||
|
||||
Import all the necessary modules and initialize the OpenAI client.
|
||||
|
||||
```python
|
||||
import os
|
||||
import time
|
||||
import asyncio
|
||||
from openai import AsyncOpenAI
|
||||
import chainlit as cl
|
||||
import re
|
||||
import requests
|
||||
from io import BytesIO
|
||||
from chainlit.element import ElementBased
|
||||
from groq import Groq
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.extraction_strategy import NoExtractionStrategy
|
||||
from crawl4ai.chunking_strategy import RegexChunking
|
||||
|
||||
client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
|
||||
|
||||
@@ -37,8 +36,6 @@ This example demonstrates how to build a research assistant using `Chainlit` and
|
||||
|
||||
3. **Set Configuration**
|
||||
|
||||
Define the model settings for the assistant.
|
||||
|
||||
```python
|
||||
settings = {
|
||||
"model": "llama3-8b-8192",
|
||||
@@ -52,35 +49,25 @@ This example demonstrates how to build a research assistant using `Chainlit` and
|
||||
|
||||
4. **Define Utility Functions**
|
||||
|
||||
- **Extract URLs from Text**: Use regex to find URLs in messages.
|
||||
```python
|
||||
def extract_urls(text):
|
||||
url_pattern = re.compile(r'(https?://\S+)')
|
||||
return url_pattern.findall(text)
|
||||
|
||||
```python
|
||||
def extract_urls(text):
|
||||
url_pattern = re.compile(r'(https?://\S+)')
|
||||
return url_pattern.findall(text)
|
||||
```
|
||||
|
||||
- **Crawl URL**: Send a request to `Crawl4AI` to fetch the content of a URL.
|
||||
|
||||
```python
|
||||
def crawl_url(url):
|
||||
data = {
|
||||
"urls": [url],
|
||||
"include_raw_html": True,
|
||||
"word_count_threshold": 10,
|
||||
"extraction_strategy": "NoExtractionStrategy",
|
||||
"chunking_strategy": "RegexChunking"
|
||||
}
|
||||
response = requests.post("https://crawl4ai.com/crawl", json=data)
|
||||
response_data = response.json()
|
||||
response_data = response_data['results'][0]
|
||||
return response_data['markdown']
|
||||
```
|
||||
async def crawl_urls(urls):
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
word_count_threshold=10,
|
||||
extraction_strategy=NoExtractionStrategy(),
|
||||
chunking_strategy=RegexChunking(),
|
||||
bypass_cache=True
|
||||
)
|
||||
return [result.markdown for result in results if result.success]
|
||||
```
|
||||
|
||||
5. **Initialize Chat Start Event**
|
||||
|
||||
Set up the initial chat message and user session.
|
||||
|
||||
```python
|
||||
@cl.on_chat_start
|
||||
async def on_chat_start():
|
||||
@@ -88,15 +75,11 @@ This example demonstrates how to build a research assistant using `Chainlit` and
|
||||
"history": [],
|
||||
"context": {}
|
||||
})
|
||||
await cl.Message(
|
||||
content="Welcome to the chat! How can I assist you today?"
|
||||
).send()
|
||||
await cl.Message(content="Welcome to the chat! How can I assist you today?").send()
|
||||
```
|
||||
|
||||
6. **Handle Incoming Messages**
|
||||
|
||||
Process user messages, extract URLs, and crawl them concurrently. Update the chat history and system message.
|
||||
|
||||
```python
|
||||
@cl.on_message
|
||||
async def on_message(message: cl.Message):
|
||||
@@ -105,19 +88,14 @@ This example demonstrates how to build a research assistant using `Chainlit` and
|
||||
# Extract URLs from the user's message
|
||||
urls = extract_urls(message.content)
|
||||
|
||||
futures = []
|
||||
with ThreadPoolExecutor() as executor:
|
||||
for url in urls:
|
||||
futures.append(executor.submit(crawl_url, url))
|
||||
|
||||
results = [future.result() for future in futures]
|
||||
|
||||
for url, result in zip(urls, results):
|
||||
ref_number = f"REF_{len(user_session['context']) + 1}"
|
||||
user_session["context"][ref_number] = {
|
||||
"url": url,
|
||||
"content": result
|
||||
}
|
||||
if urls:
|
||||
crawled_contents = await crawl_urls(urls)
|
||||
for url, content in zip(urls, crawled_contents):
|
||||
ref_number = f"REF_{len(user_session['context']) + 1}"
|
||||
user_session["context"][ref_number] = {
|
||||
"url": url,
|
||||
"content": content
|
||||
}
|
||||
|
||||
user_session["history"].append({
|
||||
"role": "user",
|
||||
@@ -129,33 +107,24 @@ This example demonstrates how to build a research assistant using `Chainlit` and
|
||||
f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
|
||||
for ref, data in user_session["context"].items()
|
||||
]
|
||||
if context_messages:
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful bot. Use the following context for answering questions. "
|
||||
"Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
|
||||
"If the question requires any information from the provided appendices or context, refer to the sources. "
|
||||
"If not, there is no need to add a references section. "
|
||||
"At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
|
||||
"\n\n".join(context_messages)
|
||||
)
|
||||
}
|
||||
else:
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant."
|
||||
}
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You are a helpful bot. Use the following context for answering questions. "
|
||||
"Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
|
||||
"If the question requires any information from the provided appendices or context, refer to the sources. "
|
||||
"If not, there is no need to add a references section. "
|
||||
"At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
|
||||
"\n\n".join(context_messages)
|
||||
) if context_messages else "You are a helpful assistant."
|
||||
}
|
||||
|
||||
msg = cl.Message(content="")
|
||||
await msg.send()
|
||||
|
||||
# Get response from the LLM
|
||||
stream = await client.chat.completions.create(
|
||||
messages=[
|
||||
system_message,
|
||||
*user_session["history"]
|
||||
],
|
||||
messages=[system_message, *user_session["history"]],
|
||||
stream=True,
|
||||
**settings
|
||||
)
|
||||
@@ -174,18 +143,16 @@ This example demonstrates how to build a research assistant using `Chainlit` and
|
||||
await msg.update()
|
||||
|
||||
# Append the reference section to the assistant's response
|
||||
reference_section = "\n\nReferences:\n"
|
||||
for ref, data in user_session["context"].items():
|
||||
reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n"
|
||||
|
||||
msg.content += reference_section
|
||||
await msg.update()
|
||||
if user_session["context"]:
|
||||
reference_section = "\n\nReferences:\n"
|
||||
for ref, data in user_session["context"].items():
|
||||
reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n"
|
||||
msg.content += reference_section
|
||||
await msg.update()
|
||||
```
|
||||
|
||||
7. **Handle Audio Input**
|
||||
|
||||
Capture and transcribe audio input. Store the audio buffer and transcribe it when the audio ends.
|
||||
|
||||
```python
|
||||
@cl.on_audio_chunk
|
||||
async def on_audio_chunk(chunk: cl.AudioChunk):
|
||||
@@ -194,12 +161,10 @@ This example demonstrates how to build a research assistant using `Chainlit` and
|
||||
buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}"
|
||||
cl.user_session.set("audio_buffer", buffer)
|
||||
cl.user_session.set("audio_mime_type", chunk.mimeType)
|
||||
|
||||
cl.user_session.get("audio_buffer").write(chunk.data)
|
||||
|
||||
@cl.step(type="tool")
|
||||
async def speech_to_text(audio_file):
|
||||
cli = Groq()
|
||||
response = await client.audio.transcriptions.create(
|
||||
model="whisper-large-v3", file=audio_file
|
||||
)
|
||||
@@ -217,32 +182,39 @@ This example demonstrates how to build a research assistant using `Chainlit` and
|
||||
end_time = time.time()
|
||||
print(f"Transcription took {end_time - start_time} seconds")
|
||||
|
||||
user_msg = cl.Message(
|
||||
author="You",
|
||||
type="user_message",
|
||||
content=transcription
|
||||
)
|
||||
user_msg = cl.Message(author="You", type="user_message", content=transcription)
|
||||
await user_msg.send()
|
||||
await on_message(user_msg)
|
||||
```
|
||||
|
||||
8. **Run the Chat Application**
|
||||
|
||||
Start the Chainlit application.
|
||||
|
||||
```python
|
||||
if __name__ == "__main__":
|
||||
from chainlit.cli import run_chainlit
|
||||
run_chainlit(__file__)
|
||||
```
|
||||
|
||||
### Explanation
|
||||
## Explanation
|
||||
|
||||
- **Libraries and Configuration**: Import necessary libraries and configure the OpenAI client.
|
||||
- **Utility Functions**: Define functions to extract URLs and crawl them.
|
||||
- **Chat Start Event**: Initialize chat session and welcome message.
|
||||
- **Message Handling**: Extract URLs, crawl them concurrently, and update chat history and context.
|
||||
- **Audio Handling**: Capture, buffer, and transcribe audio input, then process the transcription as text.
|
||||
- **Running the Application**: Start the Chainlit server to interact with the assistant.
|
||||
- **Libraries and Configuration**: We import necessary libraries, including `AsyncWebCrawler` from `crawl4ai`.
|
||||
- **Utility Functions**:
|
||||
- `extract_urls`: Uses regex to find URLs in messages.
|
||||
- `crawl_urls`: An asynchronous function that uses `AsyncWebCrawler` to fetch content from multiple URLs concurrently.
|
||||
- **Chat Start Event**: Initializes the chat session and sends a welcome message.
|
||||
- **Message Handling**:
|
||||
- Extracts URLs from user messages.
|
||||
- Asynchronously crawls the URLs using `AsyncWebCrawler`.
|
||||
- Updates chat history and context with crawled content.
|
||||
- Generates a response using the LLM, incorporating the crawled context.
|
||||
- **Audio Handling**: Captures, buffers, and transcribes audio input, then processes the transcription as text.
|
||||
- **Running the Application**: Starts the Chainlit server for interaction with the assistant.
|
||||
|
||||
This example showcases how to create an interactive research assistant that can fetch, process, and summarize web content, along with handling audio inputs for a seamless user experience.
|
||||
## Key Improvements
|
||||
|
||||
1. **Asynchronous Web Crawling**: Using `AsyncWebCrawler` allows for efficient, concurrent crawling of multiple URLs.
|
||||
2. **Improved Context Management**: The assistant now maintains a context of crawled content, allowing for more informed responses.
|
||||
3. **Dynamic Reference System**: The assistant can refer to specific sources in its responses and provide a reference section.
|
||||
4. **Seamless Audio Integration**: The ability to handle audio inputs makes the assistant more versatile and user-friendly.
|
||||
|
||||
This updated Research Assistant showcases how to create a powerful, interactive tool that can efficiently fetch and process web content, handle various input types, and provide informed responses based on the gathered information.
|
||||
@@ -1,44 +1,34 @@
|
||||
## Summarization Example
|
||||
# Summarization Example with AsyncWebCrawler
|
||||
|
||||
This example demonstrates how to use `Crawl4AI` to extract a summary from a web page. The goal is to obtain the title, a detailed summary, a brief summary, and a list of keywords from the given page.
|
||||
This example demonstrates how to use Crawl4AI's `AsyncWebCrawler` to extract a summary from a web page asynchronously. The goal is to obtain the title, a detailed summary, a brief summary, and a list of keywords from the given page.
|
||||
|
||||
### Step-by-Step Guide
|
||||
## Step-by-Step Guide
|
||||
|
||||
1. **Import Necessary Modules**
|
||||
|
||||
First, import the necessary modules and classes.
|
||||
First, import the necessary modules and classes:
|
||||
|
||||
```python
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
from crawl4ai.crawler_strategy import *
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from crawl4ai.chunking_strategy import RegexChunking
|
||||
from pydantic import BaseModel, Field
|
||||
```
|
||||
|
||||
2. **Define the URL to be Crawled**
|
||||
|
||||
Set the URL of the web page you want to summarize.
|
||||
Set the URL of the web page you want to summarize:
|
||||
|
||||
```python
|
||||
url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'
|
||||
url = 'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'
|
||||
```
|
||||
|
||||
3. **Initialize the WebCrawler**
|
||||
3. **Define the Data Model**
|
||||
|
||||
Create an instance of the `WebCrawler` and call the `warmup` method.
|
||||
|
||||
```python
|
||||
crawler = WebCrawler()
|
||||
crawler.warmup()
|
||||
```
|
||||
|
||||
4. **Define the Data Model**
|
||||
|
||||
Use Pydantic to define the structure of the extracted data.
|
||||
Use Pydantic to define the structure of the extracted data:
|
||||
|
||||
```python
|
||||
class PageSummary(BaseModel):
|
||||
@@ -48,61 +38,116 @@ This example demonstrates how to use `Crawl4AI` to extract a summary from a web
|
||||
keywords: list = Field(..., description="Keywords assigned to the page.")
|
||||
```
|
||||
|
||||
5. **Run the Crawler**
|
||||
4. **Create the Extraction Strategy**
|
||||
|
||||
Set up and run the crawler with the `LLMExtractionStrategy`. Provide the necessary parameters, including the schema for the extracted data and the instruction for the LLM.
|
||||
Set up the `LLMExtractionStrategy` with the necessary parameters:
|
||||
|
||||
```python
|
||||
result = crawler.run(
|
||||
url=url,
|
||||
word_count_threshold=1,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
schema=PageSummary.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
apply_chunking=False,
|
||||
instruction=(
|
||||
"From the crawled content, extract the following details: "
|
||||
"1. Title of the page "
|
||||
"2. Summary of the page, which is a detailed summary "
|
||||
"3. Brief summary of the page, which is a paragraph text "
|
||||
"4. Keywords assigned to the page, which is a list of keywords. "
|
||||
'The extracted JSON format should look like this: '
|
||||
'{ "title": "Page Title", "summary": "Detailed summary of the page.", '
|
||||
'"brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
|
||||
)
|
||||
),
|
||||
bypass_cache=True,
|
||||
extraction_strategy = LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
schema=PageSummary.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
apply_chunking=False,
|
||||
instruction=(
|
||||
"From the crawled content, extract the following details: "
|
||||
"1. Title of the page "
|
||||
"2. Summary of the page, which is a detailed summary "
|
||||
"3. Brief summary of the page, which is a paragraph text "
|
||||
"4. Keywords assigned to the page, which is a list of keywords. "
|
||||
'The extracted JSON format should look like this: '
|
||||
'{ "title": "Page Title", "summary": "Detailed summary of the page.", '
|
||||
'"brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
6. **Process the Extracted Data**
|
||||
5. **Define the Async Crawl Function**
|
||||
|
||||
Load the extracted content into a JSON object and print it.
|
||||
Create an asynchronous function to run the crawler:
|
||||
|
||||
```python
|
||||
page_summary = json.loads(result.extracted_content)
|
||||
print(page_summary)
|
||||
async def crawl_and_summarize(url):
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
word_count_threshold=1,
|
||||
extraction_strategy=extraction_strategy,
|
||||
chunking_strategy=RegexChunking(),
|
||||
bypass_cache=True,
|
||||
)
|
||||
return result
|
||||
```
|
||||
|
||||
7. **Save the Extracted Data**
|
||||
6. **Run the Crawler and Process Results**
|
||||
|
||||
Save the extracted data to a file for further use.
|
||||
Use asyncio to run the crawler and process the results:
|
||||
|
||||
```python
|
||||
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
|
||||
f.write(result.extracted_content)
|
||||
async def main():
|
||||
result = await crawl_and_summarize(url)
|
||||
|
||||
if result.success:
|
||||
page_summary = json.loads(result.extracted_content)
|
||||
print("Extracted Page Summary:")
|
||||
print(json.dumps(page_summary, indent=2))
|
||||
|
||||
# Save the extracted data
|
||||
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
|
||||
json.dump(page_summary, f, indent=2)
|
||||
print("Page summary saved to .data/page_summary.json")
|
||||
else:
|
||||
print(f"Failed to crawl and summarize the page. Error: {result.error_message}")
|
||||
|
||||
# Run the async main function
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Explanation
|
||||
## Explanation
|
||||
|
||||
- **Importing Modules**: Import the necessary modules, including `WebCrawler` and `LLMExtractionStrategy` from `Crawl4AI`.
|
||||
- **URL Definition**: Set the URL of the web page you want to crawl and summarize.
|
||||
- **WebCrawler Initialization**: Create an instance of `WebCrawler` and call the `warmup` method to prepare the crawler.
|
||||
- **Data Model Definition**: Define the structure of the data you want to extract using Pydantic's `BaseModel`.
|
||||
- **Crawler Execution**: Run the crawler with the `LLMExtractionStrategy`, providing the schema and detailed instructions for the extraction process.
|
||||
- **Data Processing**: Load the extracted content into a JSON object and print it to verify the results.
|
||||
- **Data Saving**: Save the extracted data to a file for further use.
|
||||
- **Importing Modules**: We import the necessary modules, including `AsyncWebCrawler` and `LLMExtractionStrategy` from Crawl4AI.
|
||||
- **URL Definition**: We set the URL of the web page to crawl and summarize.
|
||||
- **Data Model Definition**: We define the structure of the data to extract using Pydantic's `BaseModel`.
|
||||
- **Extraction Strategy Setup**: We create an instance of `LLMExtractionStrategy` with the schema and detailed instructions for the extraction process.
|
||||
- **Async Crawl Function**: We define an asynchronous function `crawl_and_summarize` that uses `AsyncWebCrawler` to perform the crawling and extraction.
|
||||
- **Main Execution**: In the `main` function, we run the crawler, process the results, and save the extracted data.
|
||||
|
||||
This example demonstrates how to harness the power of `Crawl4AI` to perform advanced web crawling and data extraction tasks with minimal code.
|
||||
## Advanced Usage: Crawling Multiple URLs
|
||||
|
||||
To demonstrate the power of `AsyncWebCrawler`, here's how you can summarize multiple pages concurrently:
|
||||
|
||||
```python
|
||||
async def crawl_multiple_urls(urls):
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
tasks = [crawler.arun(
|
||||
url=url,
|
||||
word_count_threshold=1,
|
||||
extraction_strategy=extraction_strategy,
|
||||
chunking_strategy=RegexChunking(),
|
||||
bypass_cache=True
|
||||
) for url in urls]
|
||||
results = await asyncio.gather(*tasks)
|
||||
return results
|
||||
|
||||
async def main():
|
||||
urls = [
|
||||
'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot',
|
||||
'https://marketplace.visualstudio.com/items?itemName=GitHub.copilot',
|
||||
'https://marketplace.visualstudio.com/items?itemName=ms-python.python'
|
||||
]
|
||||
results = await crawl_multiple_urls(urls)
|
||||
|
||||
for i, result in enumerate(results):
|
||||
if result.success:
|
||||
page_summary = json.loads(result.extracted_content)
|
||||
print(f"\nSummary for URL {i+1}:")
|
||||
print(json.dumps(page_summary, indent=2))
|
||||
else:
|
||||
print(f"\nFailed to summarize URL {i+1}. Error: {result.error_message}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
This advanced example shows how to use `AsyncWebCrawler` to efficiently summarize multiple web pages concurrently, significantly reducing the total processing time compared to sequential crawling.
|
||||
|
||||
By leveraging the asynchronous capabilities of Crawl4AI, you can perform advanced web crawling and data extraction tasks with improved efficiency and scalability.
|
||||
Reference in New Issue
Block a user