chore: resolve merge conflicts for v0.4.24

This commit is contained in:
UncleCode
2024-12-31 19:24:03 +08:00
79 changed files with 11903 additions and 7543 deletions

View File

@@ -0,0 +1,114 @@
"""
This example demonstrates how to use JSON CSS extraction to scrape product information
from Amazon search results. It shows how to extract structured data like product titles,
prices, ratings, and other details using CSS selectors.
"""
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json
async def extract_amazon_products():
# Initialize browser config
browser_config = BrowserConfig(
browser_type="chromium",
headless=True
)
# Initialize crawler config with JSON CSS extraction strategy
crawler_config = CrawlerRunConfig(
extraction_strategy=JsonCssExtractionStrategy(
schema={
"name": "Amazon Product Search Results",
"baseSelector": "[data-component-type='s-search-result']",
"fields": [
{
"name": "asin",
"selector": "",
"type": "attribute",
"attribute": "data-asin"
},
{
"name": "title",
"selector": "h2 a span",
"type": "text"
},
{
"name": "url",
"selector": "h2 a",
"type": "attribute",
"attribute": "href"
},
{
"name": "image",
"selector": ".s-image",
"type": "attribute",
"attribute": "src"
},
{
"name": "rating",
"selector": ".a-icon-star-small .a-icon-alt",
"type": "text"
},
{
"name": "reviews_count",
"selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
"type": "text"
},
{
"name": "price",
"selector": ".a-price .a-offscreen",
"type": "text"
},
{
"name": "original_price",
"selector": ".a-price.a-text-price .a-offscreen",
"type": "text"
},
{
"name": "sponsored",
"selector": ".puis-sponsored-label-text",
"type": "exists"
},
{
"name": "delivery_info",
"selector": "[data-cy='delivery-recipe'] .a-color-base",
"type": "text",
"multiple": True
}
]
}
)
)
# Example search URL (you should replace with your actual Amazon URL)
url = "https://www.amazon.com/s?k=Samsung+Galaxy+Tab"
# Use context manager for proper resource handling
async with AsyncWebCrawler(config=browser_config) as crawler:
# Extract the data
result = await crawler.arun(url=url, config=crawler_config)
# Process and print the results
if result and result.extracted_content:
# Parse the JSON string into a list of products
products = json.loads(result.extracted_content)
# Process each product in the list
for product in products:
print("\nProduct Details:")
print(f"ASIN: {product.get('asin')}")
print(f"Title: {product.get('title')}")
print(f"Price: {product.get('price')}")
print(f"Original Price: {product.get('original_price')}")
print(f"Rating: {product.get('rating')}")
print(f"Reviews: {product.get('reviews_count')}")
print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
if product.get('delivery_info'):
print(f"Delivery: {' '.join(product['delivery_info'])}")
print("-" * 80)
if __name__ == "__main__":
import asyncio
asyncio.run(extract_amazon_products())

View File

@@ -0,0 +1,145 @@
"""
This example demonstrates how to use JSON CSS extraction to scrape product information
from Amazon search results. It shows how to extract structured data like product titles,
prices, ratings, and other details using CSS selectors.
"""
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json
from playwright.async_api import Page, BrowserContext
async def extract_amazon_products():
# Initialize browser config
browser_config = BrowserConfig(
# browser_type="chromium",
headless=True
)
# Initialize crawler config with JSON CSS extraction strategy nav-search-submit-button
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=JsonCssExtractionStrategy(
schema={
"name": "Amazon Product Search Results",
"baseSelector": "[data-component-type='s-search-result']",
"fields": [
{
"name": "asin",
"selector": "",
"type": "attribute",
"attribute": "data-asin"
},
{
"name": "title",
"selector": "h2 a span",
"type": "text"
},
{
"name": "url",
"selector": "h2 a",
"type": "attribute",
"attribute": "href"
},
{
"name": "image",
"selector": ".s-image",
"type": "attribute",
"attribute": "src"
},
{
"name": "rating",
"selector": ".a-icon-star-small .a-icon-alt",
"type": "text"
},
{
"name": "reviews_count",
"selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
"type": "text"
},
{
"name": "price",
"selector": ".a-price .a-offscreen",
"type": "text"
},
{
"name": "original_price",
"selector": ".a-price.a-text-price .a-offscreen",
"type": "text"
},
{
"name": "sponsored",
"selector": ".puis-sponsored-label-text",
"type": "exists"
},
{
"name": "delivery_info",
"selector": "[data-cy='delivery-recipe'] .a-color-base",
"type": "text",
"multiple": True
}
]
}
)
)
url = "https://www.amazon.com/"
async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
"""Hook called after navigating to each URL"""
print(f"[HOOK] after_goto - Successfully loaded: {url}")
try:
# Wait for search box to be available
search_box = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000)
# Type the search query
await search_box.fill('Samsung Galaxy Tab')
# Get the search button and prepare for navigation
search_button = await page.wait_for_selector('#nav-search-submit-button', timeout=1000)
# Click with navigation waiting
await search_button.click()
# Wait for search results to load
await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000)
print("[HOOK] Search completed and results loaded!")
except Exception as e:
print(f"[HOOK] Error during search operation: {str(e)}")
return page
# Use context manager for proper resource handling
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler.crawler_strategy.set_hook("after_goto", after_goto)
# Extract the data
result = await crawler.arun(url=url, config=crawler_config)
# Process and print the results
if result and result.extracted_content:
# Parse the JSON string into a list of products
products = json.loads(result.extracted_content)
# Process each product in the list
for product in products:
print("\nProduct Details:")
print(f"ASIN: {product.get('asin')}")
print(f"Title: {product.get('title')}")
print(f"Price: {product.get('price')}")
print(f"Original Price: {product.get('original_price')}")
print(f"Rating: {product.get('rating')}")
print(f"Reviews: {product.get('reviews_count')}")
print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
if product.get('delivery_info'):
print(f"Delivery: {' '.join(product['delivery_info'])}")
print("-" * 80)
if __name__ == "__main__":
import asyncio
asyncio.run(extract_amazon_products())

View File

@@ -0,0 +1,129 @@
"""
This example demonstrates how to use JSON CSS extraction to scrape product information
from Amazon search results. It shows how to extract structured data like product titles,
prices, ratings, and other details using CSS selectors.
"""
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json
from playwright.async_api import Page, BrowserContext
async def extract_amazon_products():
# Initialize browser config
browser_config = BrowserConfig(
# browser_type="chromium",
headless=True
)
js_code_to_search = """
const task = async () => {
document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
document.querySelector('#nav-search-submit-button').click();
}
await task();
"""
js_code_to_search_sync = """
document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
document.querySelector('#nav-search-submit-button').click();
"""
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
js_code = js_code_to_search,
wait_for='css:[data-component-type="s-search-result"]',
extraction_strategy=JsonCssExtractionStrategy(
schema={
"name": "Amazon Product Search Results",
"baseSelector": "[data-component-type='s-search-result']",
"fields": [
{
"name": "asin",
"selector": "",
"type": "attribute",
"attribute": "data-asin"
},
{
"name": "title",
"selector": "h2 a span",
"type": "text"
},
{
"name": "url",
"selector": "h2 a",
"type": "attribute",
"attribute": "href"
},
{
"name": "image",
"selector": ".s-image",
"type": "attribute",
"attribute": "src"
},
{
"name": "rating",
"selector": ".a-icon-star-small .a-icon-alt",
"type": "text"
},
{
"name": "reviews_count",
"selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
"type": "text"
},
{
"name": "price",
"selector": ".a-price .a-offscreen",
"type": "text"
},
{
"name": "original_price",
"selector": ".a-price.a-text-price .a-offscreen",
"type": "text"
},
{
"name": "sponsored",
"selector": ".puis-sponsored-label-text",
"type": "exists"
},
{
"name": "delivery_info",
"selector": "[data-cy='delivery-recipe'] .a-color-base",
"type": "text",
"multiple": True
}
]
}
)
)
# Example search URL (you should replace with your actual Amazon URL)
url = "https://www.amazon.com/"
# Use context manager for proper resource handling
async with AsyncWebCrawler(config=browser_config) as crawler:
# Extract the data
result = await crawler.arun(url=url, config=crawler_config)
# Process and print the results
if result and result.extracted_content:
# Parse the JSON string into a list of products
products = json.loads(result.extracted_content)
# Process each product in the list
for product in products:
print("\nProduct Details:")
print(f"ASIN: {product.get('asin')}")
print(f"Title: {product.get('title')}")
print(f"Price: {product.get('price')}")
print(f"Original Price: {product.get('original_price')}")
print(f"Rating: {product.get('rating')}")
print(f"Reviews: {product.get('reviews_count')}")
print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
if product.get('delivery_info'):
print(f"Delivery: {' '.join(product['delivery_info'])}")
print("-" * 80)
if __name__ == "__main__":
import asyncio
asyncio.run(extract_amazon_products())

View File

@@ -0,0 +1,128 @@
"""
This example demonstrates optimal browser usage patterns in Crawl4AI:
1. Sequential crawling with session reuse
2. Parallel crawling with browser instance reuse
3. Performance optimization settings
"""
import asyncio
import os
from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
async def crawl_sequential(urls: List[str]):
"""
Sequential crawling using session reuse - most efficient for moderate workloads
"""
print("\n=== Sequential Crawling with Session Reuse ===")
# Configure browser with optimized settings
browser_config = BrowserConfig(
headless=True,
browser_args=[
"--disable-gpu", # Disable GPU acceleration
"--disable-dev-shm-usage", # Disable /dev/shm usage
"--no-sandbox", # Required for Docker
],
viewport={
"width": 800,
"height": 600,
}, # Smaller viewport for better performance
)
# Configure crawl settings
crawl_config = CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator(
# content_filter=PruningContentFilter(), In case you need fit_markdown
),
)
# Create single crawler instance
crawler = AsyncWebCrawler(config=browser_config)
await crawler.start()
try:
session_id = "session1" # Use same session for all URLs
for url in urls:
result = await crawler.arun(
url=url,
config=crawl_config,
session_id=session_id, # Reuse same browser tab
)
if result.success:
print(f"Successfully crawled {url}")
print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
finally:
await crawler.close()
async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
"""
Parallel crawling while reusing browser instance - best for large workloads
"""
print("\n=== Parallel Crawling with Browser Reuse ===")
browser_config = BrowserConfig(
headless=True,
browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
viewport={"width": 800, "height": 600},
)
crawl_config = CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator(
# content_filter=PruningContentFilter(), In case you need fit_markdown
),
)
# Create single crawler instance for all parallel tasks
crawler = AsyncWebCrawler(config=browser_config)
await crawler.start()
try:
# Create tasks in batches to control concurrency
for i in range(0, len(urls), max_concurrent):
batch = urls[i : i + max_concurrent]
tasks = []
for j, url in enumerate(batch):
session_id = (
f"parallel_session_{j}" # Different session per concurrent task
)
task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
tasks.append(task)
# Wait for batch to complete
results = await asyncio.gather(*tasks, return_exceptions=True)
# Process results
for url, result in zip(batch, results):
if isinstance(result, Exception):
print(f"Error crawling {url}: {str(result)}")
elif result.success:
print(f"Successfully crawled {url}")
print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
finally:
await crawler.close()
async def main():
# Example URLs
urls = [
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page3",
"https://example.com/page4",
]
# Demo sequential crawling
await crawl_sequential(urls)
# Demo parallel crawling
await crawl_parallel(urls, max_concurrent=2)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,115 @@
"""
Example demonstrating different extraction strategies with various input formats.
This example shows how to:
1. Use different input formats (markdown, HTML, fit_markdown)
2. Work with JSON-based extractors (CSS and XPath)
3. Use LLM-based extraction with different input formats
4. Configure browser and crawler settings properly
"""
import asyncio
import os
from typing import Dict, Any
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import (
LLMExtractionStrategy,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy
)
from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
"""Helper function to run extraction with proper configuration"""
try:
# Configure the crawler run settings
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=strategy,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter() # For fit_markdown support
)
)
# Run the crawler
result = await crawler.arun(url=url, config=config)
if result.success:
print(f"\n=== {name} Results ===")
print(f"Extracted Content: {result.extracted_content}")
print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
print(f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}")
else:
print(f"Error in {name}: Crawl failed")
except Exception as e:
print(f"Error in {name}: {str(e)}")
async def main():
# Example URL (replace with actual URL)
url = "https://example.com/product-page"
# Configure browser settings
browser_config = BrowserConfig(
headless=True,
verbose=True
)
# Initialize extraction strategies
# 1. LLM Extraction with different input formats
markdown_strategy = LLMExtractionStrategy(
provider="openai/gpt-4o-mini",
api_token=os.getenv("OPENAI_API_KEY"),
instruction="Extract product information including name, price, and description"
)
html_strategy = LLMExtractionStrategy(
input_format="html",
provider="openai/gpt-4o-mini",
api_token=os.getenv("OPENAI_API_KEY"),
instruction="Extract product information from HTML including structured data"
)
fit_markdown_strategy = LLMExtractionStrategy(
input_format="fit_markdown",
provider="openai/gpt-4o-mini",
api_token=os.getenv("OPENAI_API_KEY"),
instruction="Extract product information from cleaned markdown"
)
# 2. JSON CSS Extraction (automatically uses HTML input)
css_schema = {
"baseSelector": ".product",
"fields": [
{"name": "title", "selector": "h1.product-title", "type": "text"},
{"name": "price", "selector": ".price", "type": "text"},
{"name": "description", "selector": ".description", "type": "text"}
]
}
css_strategy = JsonCssExtractionStrategy(schema=css_schema)
# 3. JSON XPath Extraction (automatically uses HTML input)
xpath_schema = {
"baseSelector": "//div[@class='product']",
"fields": [
{"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
{"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
{"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"}
]
}
xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)
# Use context manager for proper resource handling
async with AsyncWebCrawler(config=browser_config) as crawler:
# Run all strategies
await run_extraction(crawler, url, markdown_strategy, "Markdown LLM")
await run_extraction(crawler, url, html_strategy, "HTML LLM")
await run_extraction(crawler, url, fit_markdown_strategy, "Fit Markdown LLM")
await run_extraction(crawler, url, css_strategy, "CSS Extraction")
await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -39,8 +39,8 @@ async def main():
f.write(b64decode(result.screenshot))
# Save PDF
if result.pdf_data:
pdf_bytes = b64decode(result.pdf_data)
if result.pdf:
pdf_bytes = b64decode(result.pdf)
with open(os.path.join(__location__, "page.pdf"), "wb") as f:
f.write(pdf_bytes)

View File

@@ -0,0 +1,107 @@
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from playwright.async_api import Page, BrowserContext
async def main():
print("🔗 Hooks Example: Demonstrating different hook use cases")
# Configure browser settings
browser_config = BrowserConfig(
headless=True
)
# Configure crawler settings
crawler_run_config = CrawlerRunConfig(
js_code="window.scrollTo(0, document.body.scrollHeight);",
wait_for="body",
cache_mode=CacheMode.BYPASS
)
# Create crawler instance
crawler = AsyncWebCrawler(config=browser_config)
# Define and set hook functions
async def on_browser_created(browser, context: BrowserContext, **kwargs):
"""Hook called after the browser is created"""
print("[HOOK] on_browser_created - Browser is ready!")
# Example: Set a cookie that will be used for all requests
return browser
async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
"""Hook called after a new page and context are created"""
print("[HOOK] on_page_context_created - New page created!")
# Example: Set default viewport size
await context.add_cookies([{
'name': 'session_id',
'value': 'example_session',
'domain': '.example.com',
'path': '/'
}])
await page.set_viewport_size({"width": 1920, "height": 1080})
return page
async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs):
"""Hook called when the user agent is updated"""
print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
return page
async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
"""Hook called after custom JavaScript execution"""
print("[HOOK] on_execution_started - Custom JS executed!")
return page
async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs):
"""Hook called before navigating to each URL"""
print(f"[HOOK] before_goto - About to visit: {url}")
# Example: Add custom headers for the request
await page.set_extra_http_headers({
"Custom-Header": "my-value"
})
return page
async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
"""Hook called after navigating to each URL"""
print(f"[HOOK] after_goto - Successfully loaded: {url}")
# Example: Wait for a specific element to be loaded
try:
await page.wait_for_selector('.content', timeout=1000)
print("Content element found!")
except:
print("Content element not found, continuing anyway")
return page
async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
"""Hook called before retrieving the HTML content"""
print("[HOOK] before_retrieve_html - About to get HTML content")
# Example: Scroll to bottom to trigger lazy loading
await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
return page
async def before_return_html(page: Page, context: BrowserContext, html:str, **kwargs):
"""Hook called before returning the HTML content"""
print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})")
# Example: You could modify the HTML content here if needed
return page
# Set all the hooks
crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated)
crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
crawler.crawler_strategy.set_hook("before_goto", before_goto)
crawler.crawler_strategy.set_hook("after_goto", after_goto)
crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html)
crawler.crawler_strategy.set_hook("before_return_html", before_return_html)
await crawler.start()
# Example usage: crawl a simple website
url = 'https://example.com'
result = await crawler.arun(url, config=crawler_run_config)
print(f"\nCrawled URL: {result.url}")
print(f"HTML length: {len(result.html)}")
await crawler.close()
if __name__ == "__main__":
import asyncio
asyncio.run(main())

View File

@@ -1,6 +1,8 @@
import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
os.environ['FIRECRAWL_API_KEY'] = "fc-84b370ccfad44beabc686b38f1769692"
sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
import asyncio
import time
@@ -12,7 +14,10 @@ from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai.extraction_strategy import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
@@ -21,128 +26,182 @@ print("GitHub Repository: https://github.com/unclecode/crawl4ai")
print("Twitter: @unclecode")
print("Website: https://crawl4ai.com")
# Basic Example - Simple Crawl
async def simple_crawl():
print("\n--- Basic Usage ---")
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS
)
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=crawler_config
url="https://www.nbcnews.com/business", config=crawler_config
)
print(result.markdown[:500])
async def clean_content():
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
excluded_tags=["nav", "footer", "aside"],
remove_overlay_elements=True,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
),
options={"ignore_links": True},
),
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/Apple",
config=crawler_config,
)
full_markdown_length = len(result.markdown_v2.raw_markdown)
fit_markdown_length = len(result.markdown_v2.fit_markdown)
print(f"Full Markdown Length: {full_markdown_length}")
print(f"Fit Markdown Length: {fit_markdown_length}")
async def link_analysis():
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.ENABLED,
exclude_external_links=True,
exclude_social_media_links=True,
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=crawler_config,
)
print(f"Found {len(result.links['internal'])} internal links")
print(f"Found {len(result.links['external'])} external links")
for link in result.links['internal'][:5]:
print(f"Href: {link['href']}\nText: {link['text']}\n")
# JavaScript Execution Example
async def simple_example_with_running_js_code():
print("\n--- Executing JavaScript and Using CSS Selectors ---")
browser_config = BrowserConfig(
headless=True,
java_script_enabled=True
)
browser_config = BrowserConfig(headless=True, java_script_enabled=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();",
# wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=crawler_config
url="https://www.nbcnews.com/business", config=crawler_config
)
print(result.markdown[:500])
# CSS Selector Example
async def simple_example_with_css_selector():
print("\n--- Using CSS Selectors ---")
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
css_selector=".wide-tease-item__description"
cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description"
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business", config=crawler_config
)
print(result.markdown[:500])
async def media_handling():
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=crawler_config
)
print(result.markdown[:500])
for img in result.media['images'][:5]:
print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")
async def custom_hook_workflow(verbose=True):
async with AsyncWebCrawler() as crawler:
# Set a 'before_goto' hook to run custom code just before navigation
crawler.crawler_strategy.set_hook("before_goto", lambda page, context: print("[Hook] Preparing to navigate..."))
# Perform the crawl operation
result = await crawler.arun(
url="https://crawl4ai.com"
)
print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
# Proxy Example
async def use_proxy():
print("\n--- Using a Proxy ---")
browser_config = BrowserConfig(
headless=True,
proxy="http://your-proxy-url:port"
proxy_config={
"server": "http://proxy.example.com:8080",
"username": "username",
"password": "password",
},
)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS
)
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=crawler_config
url="https://www.nbcnews.com/business", config=crawler_config
)
if result.success:
print(result.markdown[:500])
# Screenshot Example
async def capture_and_save_screenshot(url: str, output_path: str):
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
screenshot=True
)
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url=url,
config=crawler_config
)
result = await crawler.arun(url=url, config=crawler_config)
if result.success and result.screenshot:
import base64
screenshot_data = base64.b64decode(result.screenshot)
with open(output_path, 'wb') as f:
with open(output_path, "wb") as f:
f.write(screenshot_data)
print(f"Screenshot saved successfully to {output_path}")
else:
print("Failed to capture screenshot")
# LLM Extraction Example
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
output_fee: str = Field(
..., description="Fee for output token for the OpenAI model."
)
async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
async def extract_structured_data_using_llm(
provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
print(f"\n--- Extracting Structured Data with {provider} ---")
if api_token is None and provider != "ollama":
print(f"API token is required for {provider}. Skipping this example.")
return
browser_config = BrowserConfig(headless=True)
extra_args = {
"temperature": 0,
"top_p": 0.9,
"max_tokens": 2000
}
extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
if extra_headers:
extra_args["extra_headers"] = extra_headers
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=1,
page_timeout = 80000,
page_timeout=80000,
extraction_strategy=LLMExtractionStrategy(
provider=provider,
api_token=api_token,
@@ -150,17 +209,17 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
Do not miss any models in the entire content.""",
extra_args=extra_args
)
extra_args=extra_args,
),
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://openai.com/api/pricing/",
config=crawler_config
url="https://openai.com/api/pricing/", config=crawler_config
)
print(result.extracted_content)
# CSS Extraction Example
async def extract_structured_data_using_css_extractor():
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
@@ -192,16 +251,13 @@ async def extract_structured_data_using_css_extractor():
"name": "course_icon",
"selector": ".image-92",
"type": "attribute",
"attribute": "src"
}
]
"attribute": "src",
},
],
}
browser_config = BrowserConfig(
headless=True,
java_script_enabled=True
)
browser_config = BrowserConfig(headless=True, java_script_enabled=True)
js_click_tabs = """
(async () => {
const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
@@ -212,23 +268,23 @@ async def extract_structured_data_using_css_extractor():
}
})();
"""
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=JsonCssExtractionStrategy(schema),
js_code=[js_click_tabs]
js_code=[js_click_tabs],
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.kidocode.com/degrees/technology",
config=crawler_config
url="https://www.kidocode.com/degrees/technology", config=crawler_config
)
companies = json.loads(result.extracted_content)
print(f"Successfully extracted {len(companies)} companies")
print(json.dumps(companies[0], indent=2))
# Dynamic Content Examples - Method 1
async def crawl_dynamic_content_pages_method_1():
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
@@ -249,10 +305,7 @@ async def crawl_dynamic_content_pages_method_1():
except Exception as e:
print(f"Warning: New content didn't appear after JavaScript execution: {e}")
browser_config = BrowserConfig(
headless=False,
java_script_enabled=True
)
browser_config = BrowserConfig(headless=False, java_script_enabled=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
@@ -272,7 +325,7 @@ async def crawl_dynamic_content_pages_method_1():
css_selector="li.Box-sc-g0xbh4-0",
js_code=js_next_page if page > 0 else None,
js_only=page > 0,
session_id=session_id
session_id=session_id,
)
result = await crawler.arun(url=url, config=crawler_config)
@@ -286,14 +339,12 @@ async def crawl_dynamic_content_pages_method_1():
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
# Dynamic Content Examples - Method 2
async def crawl_dynamic_content_pages_method_2():
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
browser_config = BrowserConfig(
headless=False,
java_script_enabled=True
)
browser_config = BrowserConfig(headless=False, java_script_enabled=True)
js_next_page_and_wait = """
(async () => {
@@ -343,7 +394,7 @@ async def crawl_dynamic_content_pages_method_2():
extraction_strategy=extraction_strategy,
js_code=js_next_page_and_wait if page > 0 else None,
js_only=page > 0,
session_id=session_id
session_id=session_id,
)
result = await crawler.arun(url=url, config=crawler_config)
@@ -355,88 +406,128 @@ async def crawl_dynamic_content_pages_method_2():
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
async def cosine_similarity_extraction():
crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy(
word_count_threshold=10,
max_dist=0.2, # Maximum distance between two words
linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single)
top_k=3, # Number of top keywords to extract
sim_threshold=0.3, # Similarity threshold for clustering
semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings
verbose=True
),
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
config=crawl_config
)
print(json.loads(result.extracted_content)[:5])
# Browser Comparison
async def crawl_custom_browser_type():
print("\n--- Browser Comparison ---")
# Firefox
browser_config_firefox = BrowserConfig(
browser_type="firefox",
headless=True
)
browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True)
start = time.time()
async with AsyncWebCrawler(config=browser_config_firefox) as crawler:
result = await crawler.arun(
url="https://www.example.com",
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
)
print("Firefox:", time.time() - start)
print(result.markdown[:500])
# WebKit
browser_config_webkit = BrowserConfig(
browser_type="webkit",
headless=True
)
browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True)
start = time.time()
async with AsyncWebCrawler(config=browser_config_webkit) as crawler:
result = await crawler.arun(
url="https://www.example.com",
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
)
print("WebKit:", time.time() - start)
print(result.markdown[:500])
# Chromium (default)
browser_config_chromium = BrowserConfig(
browser_type="chromium",
headless=True
)
browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True)
start = time.time()
async with AsyncWebCrawler(config=browser_config_chromium) as crawler:
result = await crawler.arun(
url="https://www.example.com",
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
)
print("Chromium:", time.time() - start)
print(result.markdown[:500])
# Anti-Bot and User Simulation
async def crawl_with_user_simulation():
browser_config = BrowserConfig(
headless=True,
user_agent_mode="random",
user_agent_generator_config={
"device_type": "mobile",
"os_type": "android"
}
user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
magic=True,
simulate_user=True,
override_navigator=True
override_navigator=True,
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="YOUR-URL-HERE",
config=crawler_config
)
result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
print(result.markdown)
async def ssl_certification():
# Configure crawler to fetch SSL certificate
config = CrawlerRunConfig(
fetch_ssl_certificate=True,
cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url='https://example.com',
config=config
)
if result.success and result.ssl_certificate:
cert = result.ssl_certificate
# 1. Access certificate properties directly
print("\nCertificate Information:")
print(f"Issuer: {cert.issuer.get('CN', '')}")
print(f"Valid until: {cert.valid_until}")
print(f"Fingerprint: {cert.fingerprint}")
# 2. Export certificate in different formats
cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
print("\nCertificate exported to:")
print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers
print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
# Speed Comparison
async def speed_comparison():
print("\n--- Speed Comparison ---")
# Firecrawl comparison
from firecrawl import FirecrawlApp
app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
start = time.time()
scrape_status = app.scrape_url(
'https://www.nbcnews.com/business',
params={'formats': ['markdown', 'html']}
"https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
)
end = time.time()
print("Firecrawl:")
@@ -447,16 +538,15 @@ async def speed_comparison():
# Crawl4AI comparisons
browser_config = BrowserConfig(headless=True)
# Simple crawl
async with AsyncWebCrawler(config=browser_config) as crawler:
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=0
)
cache_mode=CacheMode.BYPASS, word_count_threshold=0
),
)
end = time.time()
print("Crawl4AI (simple crawl):")
@@ -474,12 +564,10 @@ async def speed_comparison():
word_count_threshold=0,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48,
threshold_type="fixed",
min_word_threshold=0
threshold=0.48, threshold_type="fixed", min_word_threshold=0
)
)
)
),
),
)
end = time.time()
print("Crawl4AI (Markdown Plus):")
@@ -489,22 +577,25 @@ async def speed_comparison():
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
print()
# Main execution
async def main():
# Basic examples
# await simple_crawl()
# await simple_example_with_running_js_code()
# await simple_example_with_css_selector()
# Advanced examples
# await extract_structured_data_using_css_extractor()
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
await extract_structured_data_using_llm(
"openai/gpt-4o", os.getenv("OPENAI_API_KEY")
)
# await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2()
# Browser comparisons
# await crawl_custom_browser_type()
# Performance testing
# await speed_comparison()
@@ -514,5 +605,6 @@ async def main():
# os.path.join(__location__, "tmp/example_screenshot.jpg")
# )
if __name__ == "__main__":
asyncio.run(main())
asyncio.run(main())

View File

@@ -627,13 +627,13 @@ async def main():
# }
# await extract_structured_data_using_llm(extra_headers=custom_headers)
await crawl_dynamic_content_pages_method_1()
await crawl_dynamic_content_pages_method_2()
# await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2()
await crawl_dynamic_content_pages_method_3()
await crawl_custom_browser_type()
# await crawl_custom_browser_type()
await speed_comparison()
# await speed_comparison()
if __name__ == "__main__":

View File

@@ -0,0 +1,46 @@
"""Example showing how to work with SSL certificates in Crawl4AI."""
import asyncio
import os
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
# Create tmp directory if it doesn't exist
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
tmp_dir = os.path.join(parent_dir, "tmp")
os.makedirs(tmp_dir, exist_ok=True)
async def main():
# Configure crawler to fetch SSL certificate
config = CrawlerRunConfig(
fetch_ssl_certificate=True,
cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url='https://example.com',
config=config
)
if result.success and result.ssl_certificate:
cert = result.ssl_certificate
# 1. Access certificate properties directly
print("\nCertificate Information:")
print(f"Issuer: {cert.issuer.get('CN', '')}")
print(f"Valid until: {cert.valid_until}")
print(f"Fingerprint: {cert.fingerprint}")
# 2. Export certificate in different formats
cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
print("\nCertificate exported to:")
print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers
print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,281 +0,0 @@
from openai import AsyncOpenAI
from chainlit.types import ThreadDict
import chainlit as cl
from chainlit.input_widget import Select, Switch, Slider
client = AsyncOpenAI()
# Instrument the OpenAI client
cl.instrument_openai()
settings = {
"model": "gpt-3.5-turbo",
"temperature": 0.5,
"max_tokens": 500,
"top_p": 1,
"frequency_penalty": 0,
"presence_penalty": 0,
}
@cl.action_callback("action_button")
async def on_action(action: cl.Action):
print("The user clicked on the action button!")
return "Thank you for clicking on the action button!"
@cl.set_chat_profiles
async def chat_profile():
return [
cl.ChatProfile(
name="GPT-3.5",
markdown_description="The underlying LLM model is **GPT-3.5**.",
icon="https://picsum.photos/200",
),
cl.ChatProfile(
name="GPT-4",
markdown_description="The underlying LLM model is **GPT-4**.",
icon="https://picsum.photos/250",
),
]
@cl.on_chat_start
async def on_chat_start():
settings = await cl.ChatSettings(
[
Select(
id="Model",
label="OpenAI - Model",
values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"],
initial_index=0,
),
Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True),
Slider(
id="Temperature",
label="OpenAI - Temperature",
initial=1,
min=0,
max=2,
step=0.1,
),
Slider(
id="SAI_Steps",
label="Stability AI - Steps",
initial=30,
min=10,
max=150,
step=1,
description="Amount of inference steps performed on image generation.",
),
Slider(
id="SAI_Cfg_Scale",
label="Stability AI - Cfg_Scale",
initial=7,
min=1,
max=35,
step=0.1,
description="Influences how strongly your generation is guided to match your prompt.",
),
Slider(
id="SAI_Width",
label="Stability AI - Image Width",
initial=512,
min=256,
max=2048,
step=64,
tooltip="Measured in pixels",
),
Slider(
id="SAI_Height",
label="Stability AI - Image Height",
initial=512,
min=256,
max=2048,
step=64,
tooltip="Measured in pixels",
),
]
).send()
chat_profile = cl.user_session.get("chat_profile")
await cl.Message(
content=f"starting chat using the {chat_profile} chat profile"
).send()
print("A new chat session has started!")
cl.user_session.set("session", {
"history": [],
"context": []
})
image = cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline")
# Attach the image to the message
await cl.Message(
content="You are such a good girl, aren't you?!",
elements=[image],
).send()
text_content = "Hello, this is a text element."
elements = [
cl.Text(name="simple_text", content=text_content, display="inline")
]
await cl.Message(
content="Check out this text element!",
elements=elements,
).send()
elements = [
cl.Audio(path="./assets/audio.mp3", display="inline"),
]
await cl.Message(
content="Here is an audio file",
elements=elements,
).send()
await cl.Avatar(
name="Tool 1",
url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
).send()
await cl.Message(
content="This message should not have an avatar!", author="Tool 0"
).send()
await cl.Message(
content="This message should have an avatar!", author="Tool 1"
).send()
elements = [
cl.File(
name="quickstart.py",
path="./quickstart.py",
display="inline",
),
]
await cl.Message(
content="This message has a file element", elements=elements
).send()
# Sending an action button within a chatbot message
actions = [
cl.Action(name="action_button", value="example_value", description="Click me!")
]
await cl.Message(content="Interact with this action button:", actions=actions).send()
# res = await cl.AskActionMessage(
# content="Pick an action!",
# actions=[
# cl.Action(name="continue", value="continue", label="✅ Continue"),
# cl.Action(name="cancel", value="cancel", label="❌ Cancel"),
# ],
# ).send()
# if res and res.get("value") == "continue":
# await cl.Message(
# content="Continue!",
# ).send()
# import plotly.graph_objects as go
# fig = go.Figure(
# data=[go.Bar(y=[2, 1, 3])],
# layout_title_text="An example figure",
# )
# elements = [cl.Plotly(name="chart", figure=fig, display="inline")]
# await cl.Message(content="This message has a chart", elements=elements).send()
# Sending a pdf with the local file path
# elements = [
# cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf")
# ]
# cl.Message(content="Look at this local pdf!", elements=elements).send()
@cl.on_settings_update
async def setup_agent(settings):
print("on_settings_update", settings)
@cl.on_stop
def on_stop():
print("The user wants to stop the task!")
@cl.on_chat_end
def on_chat_end():
print("The user disconnected!")
@cl.on_chat_resume
async def on_chat_resume(thread: ThreadDict):
print("The user resumed a previous chat session!")
# @cl.on_message
async def on_message(message: cl.Message):
cl.user_session.get("session")["history"].append({
"role": "user",
"content": message.content
})
response = await client.chat.completions.create(
messages=[
{
"content": "You are a helpful bot",
"role": "system"
},
*cl.user_session.get("session")["history"]
],
**settings
)
# Add assitanr message to the history
cl.user_session.get("session")["history"].append({
"role": "assistant",
"content": response.choices[0].message.content
})
# msg.content = response.choices[0].message.content
# await msg.update()
# await cl.Message(content=response.choices[0].message.content).send()
@cl.on_message
async def on_message(message: cl.Message):
cl.user_session.get("session")["history"].append({
"role": "user",
"content": message.content
})
msg = cl.Message(content="")
await msg.send()
stream = await client.chat.completions.create(
messages=[
{
"content": "You are a helpful bot",
"role": "system"
},
*cl.user_session.get("session")["history"]
],
stream = True,
**settings
)
async for part in stream:
if token := part.choices[0].delta.content or "":
await msg.stream_token(token)
# Add assitanr message to the history
cl.user_session.get("session")["history"].append({
"role": "assistant",
"content": msg.content
})
await msg.update()
if __name__ == "__main__":
from chainlit.cli import run_chainlit
run_chainlit(__file__)

View File

@@ -1,238 +0,0 @@
# Make sure to install the required packageschainlit and groq
import os, time
from openai import AsyncOpenAI
import chainlit as cl
import re
import requests
from io import BytesIO
from chainlit.element import ElementBased
from groq import Groq
# Import threadpools to run the crawl_url function in a separate thread
from concurrent.futures import ThreadPoolExecutor
client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
# Instrument the OpenAI client
cl.instrument_openai()
settings = {
"model": "llama3-8b-8192",
"temperature": 0.5,
"max_tokens": 500,
"top_p": 1,
"frequency_penalty": 0,
"presence_penalty": 0,
}
def extract_urls(text):
url_pattern = re.compile(r'(https?://\S+)')
return url_pattern.findall(text)
def crawl_url(url):
data = {
"urls": [url],
"include_raw_html": True,
"word_count_threshold": 10,
"extraction_strategy": "NoExtractionStrategy",
"chunking_strategy": "RegexChunking"
}
response = requests.post("https://crawl4ai.com/crawl", json=data)
response_data = response.json()
response_data = response_data['results'][0]
return response_data['markdown']
@cl.on_chat_start
async def on_chat_start():
cl.user_session.set("session", {
"history": [],
"context": {}
})
await cl.Message(
content="Welcome to the chat! How can I assist you today?"
).send()
@cl.on_message
async def on_message(message: cl.Message):
user_session = cl.user_session.get("session")
# Extract URLs from the user's message
urls = extract_urls(message.content)
futures = []
with ThreadPoolExecutor() as executor:
for url in urls:
futures.append(executor.submit(crawl_url, url))
results = [future.result() for future in futures]
for url, result in zip(urls, results):
ref_number = f"REF_{len(user_session['context']) + 1}"
user_session["context"][ref_number] = {
"url": url,
"content": result
}
# for url in urls:
# # Crawl the content of each URL and add it to the session context with a reference number
# ref_number = f"REF_{len(user_session['context']) + 1}"
# crawled_content = crawl_url(url)
# user_session["context"][ref_number] = {
# "url": url,
# "content": crawled_content
# }
user_session["history"].append({
"role": "user",
"content": message.content
})
# Create a system message that includes the context
context_messages = [
f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
for ref, data in user_session["context"].items()
]
if context_messages:
system_message = {
"role": "system",
"content": (
"You are a helpful bot. Use the following context for answering questions. "
"Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
"If the question requires any information from the provided appendices or context, refer to the sources. "
"If not, there is no need to add a references section. "
"At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
"\n\n".join(context_messages)
)
}
else:
system_message = {
"role": "system",
"content": "You are a helpful assistant."
}
msg = cl.Message(content="")
await msg.send()
# Get response from the LLM
stream = await client.chat.completions.create(
messages=[
system_message,
*user_session["history"]
],
stream=True,
**settings
)
assistant_response = ""
async for part in stream:
if token := part.choices[0].delta.content:
assistant_response += token
await msg.stream_token(token)
# Add assistant message to the history
user_session["history"].append({
"role": "assistant",
"content": assistant_response
})
await msg.update()
# Append the reference section to the assistant's response
reference_section = "\n\nReferences:\n"
for ref, data in user_session["context"].items():
reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n"
msg.content += reference_section
await msg.update()
@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.AudioChunk):
if chunk.isStart:
buffer = BytesIO()
# This is required for whisper to recognize the file type
buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}"
# Initialize the session for a new audio stream
cl.user_session.set("audio_buffer", buffer)
cl.user_session.set("audio_mime_type", chunk.mimeType)
# Write the chunks to a buffer and transcribe the whole audio at the end
cl.user_session.get("audio_buffer").write(chunk.data)
pass
@cl.step(type="tool")
async def speech_to_text(audio_file):
cli = Groq()
# response = cli.audio.transcriptions.create(
# file=audio_file, #(filename, file.read()),
# model="whisper-large-v3",
# )
response = await client.audio.transcriptions.create(
model="whisper-large-v3", file=audio_file
)
return response.text
@cl.on_audio_end
async def on_audio_end(elements: list[ElementBased]):
# Get the audio buffer from the session
audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
audio_buffer.seek(0) # Move the file pointer to the beginning
audio_file = audio_buffer.read()
audio_mime_type: str = cl.user_session.get("audio_mime_type")
# input_audio_el = cl.Audio(
# mime=audio_mime_type, content=audio_file, name=audio_buffer.name
# )
# await cl.Message(
# author="You",
# type="user_message",
# content="",
# elements=[input_audio_el, *elements]
# ).send()
# answer_message = await cl.Message(content="").send()
start_time = time.time()
whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
transcription = await speech_to_text(whisper_input)
end_time = time.time()
print(f"Transcription took {end_time - start_time} seconds")
user_msg = cl.Message(
author="You",
type="user_message",
content=transcription
)
await user_msg.send()
await on_message(user_msg)
# images = [file for file in elements if "image" in file.mime]
# text_answer = await generate_text_answer(transcription, images)
# output_name, output_audio = await text_to_speech(text_answer, audio_mime_type)
# output_audio_el = cl.Audio(
# name=output_name,
# auto_play=True,
# mime=audio_mime_type,
# content=output_audio,
# )
# answer_message.elements = [output_audio_el]
# answer_message.content = transcription
# await answer_message.update()
if __name__ == "__main__":
from chainlit.cli import run_chainlit
run_chainlit(__file__)