Commit Message:

- Added examples for Amazon product data extraction methods
  - Updated configuration options and enhance documentation
  - Minor refactoring for improved performance and readability
  - Cleaned up version control settings.
This commit is contained in:
UncleCode
2024-12-29 20:05:18 +08:00
parent f2d9912697
commit fb33a24891
27 changed files with 4371 additions and 1408 deletions

View File

@@ -0,0 +1,114 @@
"""
This example demonstrates how to use JSON CSS extraction to scrape product information
from Amazon search results. It shows how to extract structured data like product titles,
prices, ratings, and other details using CSS selectors.
"""
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json
async def extract_amazon_products():
# Initialize browser config
browser_config = BrowserConfig(
browser_type="chromium",
headless=True
)
# Initialize crawler config with JSON CSS extraction strategy
crawler_config = CrawlerRunConfig(
extraction_strategy=JsonCssExtractionStrategy(
schema={
"name": "Amazon Product Search Results",
"baseSelector": "[data-component-type='s-search-result']",
"fields": [
{
"name": "asin",
"selector": "",
"type": "attribute",
"attribute": "data-asin"
},
{
"name": "title",
"selector": "h2 a span",
"type": "text"
},
{
"name": "url",
"selector": "h2 a",
"type": "attribute",
"attribute": "href"
},
{
"name": "image",
"selector": ".s-image",
"type": "attribute",
"attribute": "src"
},
{
"name": "rating",
"selector": ".a-icon-star-small .a-icon-alt",
"type": "text"
},
{
"name": "reviews_count",
"selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
"type": "text"
},
{
"name": "price",
"selector": ".a-price .a-offscreen",
"type": "text"
},
{
"name": "original_price",
"selector": ".a-price.a-text-price .a-offscreen",
"type": "text"
},
{
"name": "sponsored",
"selector": ".puis-sponsored-label-text",
"type": "exists"
},
{
"name": "delivery_info",
"selector": "[data-cy='delivery-recipe'] .a-color-base",
"type": "text",
"multiple": True
}
]
}
)
)
# Example search URL (you should replace with your actual Amazon URL)
url = "https://www.amazon.com/s?k=Samsung+Galaxy+Tab"
# Use context manager for proper resource handling
async with AsyncWebCrawler(config=browser_config) as crawler:
# Extract the data
result = await crawler.arun(url=url, config=crawler_config)
# Process and print the results
if result and result.extracted_content:
# Parse the JSON string into a list of products
products = json.loads(result.extracted_content)
# Process each product in the list
for product in products:
print("\nProduct Details:")
print(f"ASIN: {product.get('asin')}")
print(f"Title: {product.get('title')}")
print(f"Price: {product.get('price')}")
print(f"Original Price: {product.get('original_price')}")
print(f"Rating: {product.get('rating')}")
print(f"Reviews: {product.get('reviews_count')}")
print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
if product.get('delivery_info'):
print(f"Delivery: {' '.join(product['delivery_info'])}")
print("-" * 80)
if __name__ == "__main__":
import asyncio
asyncio.run(extract_amazon_products())

View File

@@ -0,0 +1,145 @@
"""
This example demonstrates how to use JSON CSS extraction to scrape product information
from Amazon search results. It shows how to extract structured data like product titles,
prices, ratings, and other details using CSS selectors.
"""
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json
from playwright.async_api import Page, BrowserContext
async def extract_amazon_products():
# Initialize browser config
browser_config = BrowserConfig(
# browser_type="chromium",
headless=True
)
# Initialize crawler config with JSON CSS extraction strategy nav-search-submit-button
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=JsonCssExtractionStrategy(
schema={
"name": "Amazon Product Search Results",
"baseSelector": "[data-component-type='s-search-result']",
"fields": [
{
"name": "asin",
"selector": "",
"type": "attribute",
"attribute": "data-asin"
},
{
"name": "title",
"selector": "h2 a span",
"type": "text"
},
{
"name": "url",
"selector": "h2 a",
"type": "attribute",
"attribute": "href"
},
{
"name": "image",
"selector": ".s-image",
"type": "attribute",
"attribute": "src"
},
{
"name": "rating",
"selector": ".a-icon-star-small .a-icon-alt",
"type": "text"
},
{
"name": "reviews_count",
"selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
"type": "text"
},
{
"name": "price",
"selector": ".a-price .a-offscreen",
"type": "text"
},
{
"name": "original_price",
"selector": ".a-price.a-text-price .a-offscreen",
"type": "text"
},
{
"name": "sponsored",
"selector": ".puis-sponsored-label-text",
"type": "exists"
},
{
"name": "delivery_info",
"selector": "[data-cy='delivery-recipe'] .a-color-base",
"type": "text",
"multiple": True
}
]
}
)
)
url = "https://www.amazon.com/"
async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
"""Hook called after navigating to each URL"""
print(f"[HOOK] after_goto - Successfully loaded: {url}")
try:
# Wait for search box to be available
search_box = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000)
# Type the search query
await search_box.fill('Samsung Galaxy Tab')
# Get the search button and prepare for navigation
search_button = await page.wait_for_selector('#nav-search-submit-button', timeout=1000)
# Click with navigation waiting
await search_button.click()
# Wait for search results to load
await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000)
print("[HOOK] Search completed and results loaded!")
except Exception as e:
print(f"[HOOK] Error during search operation: {str(e)}")
return page
# Use context manager for proper resource handling
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler.crawler_strategy.set_hook("after_goto", after_goto)
# Extract the data
result = await crawler.arun(url=url, config=crawler_config)
# Process and print the results
if result and result.extracted_content:
# Parse the JSON string into a list of products
products = json.loads(result.extracted_content)
# Process each product in the list
for product in products:
print("\nProduct Details:")
print(f"ASIN: {product.get('asin')}")
print(f"Title: {product.get('title')}")
print(f"Price: {product.get('price')}")
print(f"Original Price: {product.get('original_price')}")
print(f"Rating: {product.get('rating')}")
print(f"Reviews: {product.get('reviews_count')}")
print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
if product.get('delivery_info'):
print(f"Delivery: {' '.join(product['delivery_info'])}")
print("-" * 80)
if __name__ == "__main__":
import asyncio
asyncio.run(extract_amazon_products())

View File

@@ -0,0 +1,129 @@
"""
This example demonstrates how to use JSON CSS extraction to scrape product information
from Amazon search results. It shows how to extract structured data like product titles,
prices, ratings, and other details using CSS selectors.
"""
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json
from playwright.async_api import Page, BrowserContext
async def extract_amazon_products():
# Initialize browser config
browser_config = BrowserConfig(
# browser_type="chromium",
headless=True
)
js_code_to_search = """
const task = async () => {
document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
document.querySelector('#nav-search-submit-button').click();
}
await task();
"""
js_code_to_search_sync = """
document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
document.querySelector('#nav-search-submit-button').click();
"""
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
js_code = js_code_to_search,
wait_for='css:[data-component-type="s-search-result"]',
extraction_strategy=JsonCssExtractionStrategy(
schema={
"name": "Amazon Product Search Results",
"baseSelector": "[data-component-type='s-search-result']",
"fields": [
{
"name": "asin",
"selector": "",
"type": "attribute",
"attribute": "data-asin"
},
{
"name": "title",
"selector": "h2 a span",
"type": "text"
},
{
"name": "url",
"selector": "h2 a",
"type": "attribute",
"attribute": "href"
},
{
"name": "image",
"selector": ".s-image",
"type": "attribute",
"attribute": "src"
},
{
"name": "rating",
"selector": ".a-icon-star-small .a-icon-alt",
"type": "text"
},
{
"name": "reviews_count",
"selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
"type": "text"
},
{
"name": "price",
"selector": ".a-price .a-offscreen",
"type": "text"
},
{
"name": "original_price",
"selector": ".a-price.a-text-price .a-offscreen",
"type": "text"
},
{
"name": "sponsored",
"selector": ".puis-sponsored-label-text",
"type": "exists"
},
{
"name": "delivery_info",
"selector": "[data-cy='delivery-recipe'] .a-color-base",
"type": "text",
"multiple": True
}
]
}
)
)
# Example search URL (you should replace with your actual Amazon URL)
url = "https://www.amazon.com/"
# Use context manager for proper resource handling
async with AsyncWebCrawler(config=browser_config) as crawler:
# Extract the data
result = await crawler.arun(url=url, config=crawler_config)
# Process and print the results
if result and result.extracted_content:
# Parse the JSON string into a list of products
products = json.loads(result.extracted_content)
# Process each product in the list
for product in products:
print("\nProduct Details:")
print(f"ASIN: {product.get('asin')}")
print(f"Title: {product.get('title')}")
print(f"Price: {product.get('price')}")
print(f"Original Price: {product.get('original_price')}")
print(f"Rating: {product.get('rating')}")
print(f"Reviews: {product.get('reviews_count')}")
print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
if product.get('delivery_info'):
print(f"Delivery: {' '.join(product['delivery_info'])}")
print("-" * 80)
if __name__ == "__main__":
import asyncio
asyncio.run(extract_amazon_products())

View File

@@ -1,6 +1,8 @@
import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
os.environ['FIRECRAWL_API_KEY'] = "fc-84b370ccfad44beabc686b38f1769692"
sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
import asyncio
import time
@@ -12,7 +14,10 @@ from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai.extraction_strategy import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
@@ -21,128 +26,182 @@ print("GitHub Repository: https://github.com/unclecode/crawl4ai")
print("Twitter: @unclecode")
print("Website: https://crawl4ai.com")
# Basic Example - Simple Crawl
async def simple_crawl():
print("\n--- Basic Usage ---")
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS
)
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=crawler_config
url="https://www.nbcnews.com/business", config=crawler_config
)
print(result.markdown[:500])
async def clean_content():
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
excluded_tags=["nav", "footer", "aside"],
remove_overlay_elements=True,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
),
options={"ignore_links": True},
),
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/Apple",
config=crawler_config,
)
full_markdown_length = len(result.markdown_v2.raw_markdown)
fit_markdown_length = len(result.markdown_v2.fit_markdown)
print(f"Full Markdown Length: {full_markdown_length}")
print(f"Fit Markdown Length: {fit_markdown_length}")
async def link_analysis():
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.ENABLED,
exclude_external_links=True,
exclude_social_media_links=True,
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=crawler_config,
)
print(f"Found {len(result.links['internal'])} internal links")
print(f"Found {len(result.links['external'])} external links")
for link in result.links['internal'][:5]:
print(f"Href: {link['href']}\nText: {link['text']}\n")
# JavaScript Execution Example
async def simple_example_with_running_js_code():
print("\n--- Executing JavaScript and Using CSS Selectors ---")
browser_config = BrowserConfig(
headless=True,
java_script_enabled=True
)
browser_config = BrowserConfig(headless=True, java_script_enabled=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();",
# wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=crawler_config
url="https://www.nbcnews.com/business", config=crawler_config
)
print(result.markdown[:500])
# CSS Selector Example
async def simple_example_with_css_selector():
print("\n--- Using CSS Selectors ---")
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
css_selector=".wide-tease-item__description"
cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description"
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business", config=crawler_config
)
print(result.markdown[:500])
async def media_handling():
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=crawler_config
)
print(result.markdown[:500])
for img in result.media['images'][:5]:
print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")
async def custom_hook_workflow(verbose=True):
async with AsyncWebCrawler() as crawler:
# Set a 'before_goto' hook to run custom code just before navigation
crawler.crawler_strategy.set_hook("before_goto", lambda page, context: print("[Hook] Preparing to navigate..."))
# Perform the crawl operation
result = await crawler.arun(
url="https://crawl4ai.com"
)
print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
# Proxy Example
async def use_proxy():
print("\n--- Using a Proxy ---")
browser_config = BrowserConfig(
headless=True,
proxy="http://your-proxy-url:port"
proxy_config={
"server": "http://proxy.example.com:8080",
"username": "username",
"password": "password",
},
)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS
)
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=crawler_config
url="https://www.nbcnews.com/business", config=crawler_config
)
if result.success:
print(result.markdown[:500])
# Screenshot Example
async def capture_and_save_screenshot(url: str, output_path: str):
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
screenshot=True
)
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url=url,
config=crawler_config
)
result = await crawler.arun(url=url, config=crawler_config)
if result.success and result.screenshot:
import base64
screenshot_data = base64.b64decode(result.screenshot)
with open(output_path, 'wb') as f:
with open(output_path, "wb") as f:
f.write(screenshot_data)
print(f"Screenshot saved successfully to {output_path}")
else:
print("Failed to capture screenshot")
# LLM Extraction Example
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
output_fee: str = Field(
..., description="Fee for output token for the OpenAI model."
)
async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
async def extract_structured_data_using_llm(
provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
print(f"\n--- Extracting Structured Data with {provider} ---")
if api_token is None and provider != "ollama":
print(f"API token is required for {provider}. Skipping this example.")
return
browser_config = BrowserConfig(headless=True)
extra_args = {
"temperature": 0,
"top_p": 0.9,
"max_tokens": 2000
}
extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
if extra_headers:
extra_args["extra_headers"] = extra_headers
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=1,
page_timeout = 80000,
page_timeout=80000,
extraction_strategy=LLMExtractionStrategy(
provider=provider,
api_token=api_token,
@@ -150,17 +209,17 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
Do not miss any models in the entire content.""",
extra_args=extra_args
)
extra_args=extra_args,
),
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://openai.com/api/pricing/",
config=crawler_config
url="https://openai.com/api/pricing/", config=crawler_config
)
print(result.extracted_content)
# CSS Extraction Example
async def extract_structured_data_using_css_extractor():
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
@@ -192,16 +251,13 @@ async def extract_structured_data_using_css_extractor():
"name": "course_icon",
"selector": ".image-92",
"type": "attribute",
"attribute": "src"
}
]
"attribute": "src",
},
],
}
browser_config = BrowserConfig(
headless=True,
java_script_enabled=True
)
browser_config = BrowserConfig(headless=True, java_script_enabled=True)
js_click_tabs = """
(async () => {
const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
@@ -212,23 +268,23 @@ async def extract_structured_data_using_css_extractor():
}
})();
"""
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=JsonCssExtractionStrategy(schema),
js_code=[js_click_tabs]
js_code=[js_click_tabs],
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://www.kidocode.com/degrees/technology",
config=crawler_config
url="https://www.kidocode.com/degrees/technology", config=crawler_config
)
companies = json.loads(result.extracted_content)
print(f"Successfully extracted {len(companies)} companies")
print(json.dumps(companies[0], indent=2))
# Dynamic Content Examples - Method 1
async def crawl_dynamic_content_pages_method_1():
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
@@ -249,10 +305,7 @@ async def crawl_dynamic_content_pages_method_1():
except Exception as e:
print(f"Warning: New content didn't appear after JavaScript execution: {e}")
browser_config = BrowserConfig(
headless=False,
java_script_enabled=True
)
browser_config = BrowserConfig(headless=False, java_script_enabled=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
@@ -272,7 +325,7 @@ async def crawl_dynamic_content_pages_method_1():
css_selector="li.Box-sc-g0xbh4-0",
js_code=js_next_page if page > 0 else None,
js_only=page > 0,
session_id=session_id
session_id=session_id,
)
result = await crawler.arun(url=url, config=crawler_config)
@@ -286,14 +339,12 @@ async def crawl_dynamic_content_pages_method_1():
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
# Dynamic Content Examples - Method 2
async def crawl_dynamic_content_pages_method_2():
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
browser_config = BrowserConfig(
headless=False,
java_script_enabled=True
)
browser_config = BrowserConfig(headless=False, java_script_enabled=True)
js_next_page_and_wait = """
(async () => {
@@ -343,7 +394,7 @@ async def crawl_dynamic_content_pages_method_2():
extraction_strategy=extraction_strategy,
js_code=js_next_page_and_wait if page > 0 else None,
js_only=page > 0,
session_id=session_id
session_id=session_id,
)
result = await crawler.arun(url=url, config=crawler_config)
@@ -355,88 +406,128 @@ async def crawl_dynamic_content_pages_method_2():
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
async def cosine_similarity_extraction():
crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy(
word_count_threshold=10,
max_dist=0.2, # Maximum distance between two words
linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single)
top_k=3, # Number of top keywords to extract
sim_threshold=0.3, # Similarity threshold for clustering
semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings
verbose=True
),
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
config=crawl_config
)
print(json.loads(result.extracted_content)[:5])
# Browser Comparison
async def crawl_custom_browser_type():
print("\n--- Browser Comparison ---")
# Firefox
browser_config_firefox = BrowserConfig(
browser_type="firefox",
headless=True
)
browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True)
start = time.time()
async with AsyncWebCrawler(config=browser_config_firefox) as crawler:
result = await crawler.arun(
url="https://www.example.com",
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
)
print("Firefox:", time.time() - start)
print(result.markdown[:500])
# WebKit
browser_config_webkit = BrowserConfig(
browser_type="webkit",
headless=True
)
browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True)
start = time.time()
async with AsyncWebCrawler(config=browser_config_webkit) as crawler:
result = await crawler.arun(
url="https://www.example.com",
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
)
print("WebKit:", time.time() - start)
print(result.markdown[:500])
# Chromium (default)
browser_config_chromium = BrowserConfig(
browser_type="chromium",
headless=True
)
browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True)
start = time.time()
async with AsyncWebCrawler(config=browser_config_chromium) as crawler:
result = await crawler.arun(
url="https://www.example.com",
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
)
print("Chromium:", time.time() - start)
print(result.markdown[:500])
# Anti-Bot and User Simulation
async def crawl_with_user_simulation():
browser_config = BrowserConfig(
headless=True,
user_agent_mode="random",
user_agent_generator_config={
"device_type": "mobile",
"os_type": "android"
}
user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
magic=True,
simulate_user=True,
override_navigator=True
override_navigator=True,
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="YOUR-URL-HERE",
config=crawler_config
)
result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
print(result.markdown)
async def ssl_certification():
# Configure crawler to fetch SSL certificate
config = CrawlerRunConfig(
fetch_ssl_certificate=True,
cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url='https://example.com',
config=config
)
if result.success and result.ssl_certificate:
cert = result.ssl_certificate
# 1. Access certificate properties directly
print("\nCertificate Information:")
print(f"Issuer: {cert.issuer.get('CN', '')}")
print(f"Valid until: {cert.valid_until}")
print(f"Fingerprint: {cert.fingerprint}")
# 2. Export certificate in different formats
cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
print("\nCertificate exported to:")
print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers
print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
# Speed Comparison
async def speed_comparison():
print("\n--- Speed Comparison ---")
# Firecrawl comparison
from firecrawl import FirecrawlApp
app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
start = time.time()
scrape_status = app.scrape_url(
'https://www.nbcnews.com/business',
params={'formats': ['markdown', 'html']}
"https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
)
end = time.time()
print("Firecrawl:")
@@ -447,16 +538,15 @@ async def speed_comparison():
# Crawl4AI comparisons
browser_config = BrowserConfig(headless=True)
# Simple crawl
async with AsyncWebCrawler(config=browser_config) as crawler:
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=0
)
cache_mode=CacheMode.BYPASS, word_count_threshold=0
),
)
end = time.time()
print("Crawl4AI (simple crawl):")
@@ -474,12 +564,10 @@ async def speed_comparison():
word_count_threshold=0,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48,
threshold_type="fixed",
min_word_threshold=0
threshold=0.48, threshold_type="fixed", min_word_threshold=0
)
)
)
),
),
)
end = time.time()
print("Crawl4AI (Markdown Plus):")
@@ -489,22 +577,25 @@ async def speed_comparison():
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
print()
# Main execution
async def main():
# Basic examples
# await simple_crawl()
# await simple_example_with_running_js_code()
# await simple_example_with_css_selector()
# Advanced examples
# await extract_structured_data_using_css_extractor()
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
await extract_structured_data_using_llm(
"openai/gpt-4o", os.getenv("OPENAI_API_KEY")
)
# await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2()
# Browser comparisons
# await crawl_custom_browser_type()
# Performance testing
# await speed_comparison()
@@ -514,5 +605,6 @@ async def main():
# os.path.join(__location__, "tmp/example_screenshot.jpg")
# )
if __name__ == "__main__":
asyncio.run(main())
asyncio.run(main())

View File

@@ -627,13 +627,13 @@ async def main():
# }
# await extract_structured_data_using_llm(extra_headers=custom_headers)
await crawl_dynamic_content_pages_method_1()
await crawl_dynamic_content_pages_method_2()
# await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2()
await crawl_dynamic_content_pages_method_3()
await crawl_custom_browser_type()
# await crawl_custom_browser_type()
await speed_comparison()
# await speed_comparison()
if __name__ == "__main__":

View File

@@ -1,281 +0,0 @@
from openai import AsyncOpenAI
from chainlit.types import ThreadDict
import chainlit as cl
from chainlit.input_widget import Select, Switch, Slider
client = AsyncOpenAI()
# Instrument the OpenAI client
cl.instrument_openai()
settings = {
"model": "gpt-3.5-turbo",
"temperature": 0.5,
"max_tokens": 500,
"top_p": 1,
"frequency_penalty": 0,
"presence_penalty": 0,
}
@cl.action_callback("action_button")
async def on_action(action: cl.Action):
print("The user clicked on the action button!")
return "Thank you for clicking on the action button!"
@cl.set_chat_profiles
async def chat_profile():
return [
cl.ChatProfile(
name="GPT-3.5",
markdown_description="The underlying LLM model is **GPT-3.5**.",
icon="https://picsum.photos/200",
),
cl.ChatProfile(
name="GPT-4",
markdown_description="The underlying LLM model is **GPT-4**.",
icon="https://picsum.photos/250",
),
]
@cl.on_chat_start
async def on_chat_start():
settings = await cl.ChatSettings(
[
Select(
id="Model",
label="OpenAI - Model",
values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"],
initial_index=0,
),
Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True),
Slider(
id="Temperature",
label="OpenAI - Temperature",
initial=1,
min=0,
max=2,
step=0.1,
),
Slider(
id="SAI_Steps",
label="Stability AI - Steps",
initial=30,
min=10,
max=150,
step=1,
description="Amount of inference steps performed on image generation.",
),
Slider(
id="SAI_Cfg_Scale",
label="Stability AI - Cfg_Scale",
initial=7,
min=1,
max=35,
step=0.1,
description="Influences how strongly your generation is guided to match your prompt.",
),
Slider(
id="SAI_Width",
label="Stability AI - Image Width",
initial=512,
min=256,
max=2048,
step=64,
tooltip="Measured in pixels",
),
Slider(
id="SAI_Height",
label="Stability AI - Image Height",
initial=512,
min=256,
max=2048,
step=64,
tooltip="Measured in pixels",
),
]
).send()
chat_profile = cl.user_session.get("chat_profile")
await cl.Message(
content=f"starting chat using the {chat_profile} chat profile"
).send()
print("A new chat session has started!")
cl.user_session.set("session", {
"history": [],
"context": []
})
image = cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline")
# Attach the image to the message
await cl.Message(
content="You are such a good girl, aren't you?!",
elements=[image],
).send()
text_content = "Hello, this is a text element."
elements = [
cl.Text(name="simple_text", content=text_content, display="inline")
]
await cl.Message(
content="Check out this text element!",
elements=elements,
).send()
elements = [
cl.Audio(path="./assets/audio.mp3", display="inline"),
]
await cl.Message(
content="Here is an audio file",
elements=elements,
).send()
await cl.Avatar(
name="Tool 1",
url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
).send()
await cl.Message(
content="This message should not have an avatar!", author="Tool 0"
).send()
await cl.Message(
content="This message should have an avatar!", author="Tool 1"
).send()
elements = [
cl.File(
name="quickstart.py",
path="./quickstart.py",
display="inline",
),
]
await cl.Message(
content="This message has a file element", elements=elements
).send()
# Sending an action button within a chatbot message
actions = [
cl.Action(name="action_button", value="example_value", description="Click me!")
]
await cl.Message(content="Interact with this action button:", actions=actions).send()
# res = await cl.AskActionMessage(
# content="Pick an action!",
# actions=[
# cl.Action(name="continue", value="continue", label="✅ Continue"),
# cl.Action(name="cancel", value="cancel", label="❌ Cancel"),
# ],
# ).send()
# if res and res.get("value") == "continue":
# await cl.Message(
# content="Continue!",
# ).send()
# import plotly.graph_objects as go
# fig = go.Figure(
# data=[go.Bar(y=[2, 1, 3])],
# layout_title_text="An example figure",
# )
# elements = [cl.Plotly(name="chart", figure=fig, display="inline")]
# await cl.Message(content="This message has a chart", elements=elements).send()
# Sending a pdf with the local file path
# elements = [
# cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf")
# ]
# cl.Message(content="Look at this local pdf!", elements=elements).send()
@cl.on_settings_update
async def setup_agent(settings):
print("on_settings_update", settings)
@cl.on_stop
def on_stop():
print("The user wants to stop the task!")
@cl.on_chat_end
def on_chat_end():
print("The user disconnected!")
@cl.on_chat_resume
async def on_chat_resume(thread: ThreadDict):
print("The user resumed a previous chat session!")
# @cl.on_message
async def on_message(message: cl.Message):
cl.user_session.get("session")["history"].append({
"role": "user",
"content": message.content
})
response = await client.chat.completions.create(
messages=[
{
"content": "You are a helpful bot",
"role": "system"
},
*cl.user_session.get("session")["history"]
],
**settings
)
# Add assitanr message to the history
cl.user_session.get("session")["history"].append({
"role": "assistant",
"content": response.choices[0].message.content
})
# msg.content = response.choices[0].message.content
# await msg.update()
# await cl.Message(content=response.choices[0].message.content).send()
@cl.on_message
async def on_message(message: cl.Message):
cl.user_session.get("session")["history"].append({
"role": "user",
"content": message.content
})
msg = cl.Message(content="")
await msg.send()
stream = await client.chat.completions.create(
messages=[
{
"content": "You are a helpful bot",
"role": "system"
},
*cl.user_session.get("session")["history"]
],
stream = True,
**settings
)
async for part in stream:
if token := part.choices[0].delta.content or "":
await msg.stream_token(token)
# Add assitanr message to the history
cl.user_session.get("session")["history"].append({
"role": "assistant",
"content": msg.content
})
await msg.update()
if __name__ == "__main__":
from chainlit.cli import run_chainlit
run_chainlit(__file__)

View File

@@ -1,238 +0,0 @@
# Make sure to install the required packageschainlit and groq
import os, time
from openai import AsyncOpenAI
import chainlit as cl
import re
import requests
from io import BytesIO
from chainlit.element import ElementBased
from groq import Groq
# Import threadpools to run the crawl_url function in a separate thread
from concurrent.futures import ThreadPoolExecutor
client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
# Instrument the OpenAI client
cl.instrument_openai()
settings = {
"model": "llama3-8b-8192",
"temperature": 0.5,
"max_tokens": 500,
"top_p": 1,
"frequency_penalty": 0,
"presence_penalty": 0,
}
def extract_urls(text):
url_pattern = re.compile(r'(https?://\S+)')
return url_pattern.findall(text)
def crawl_url(url):
data = {
"urls": [url],
"include_raw_html": True,
"word_count_threshold": 10,
"extraction_strategy": "NoExtractionStrategy",
"chunking_strategy": "RegexChunking"
}
response = requests.post("https://crawl4ai.com/crawl", json=data)
response_data = response.json()
response_data = response_data['results'][0]
return response_data['markdown']
@cl.on_chat_start
async def on_chat_start():
cl.user_session.set("session", {
"history": [],
"context": {}
})
await cl.Message(
content="Welcome to the chat! How can I assist you today?"
).send()
@cl.on_message
async def on_message(message: cl.Message):
user_session = cl.user_session.get("session")
# Extract URLs from the user's message
urls = extract_urls(message.content)
futures = []
with ThreadPoolExecutor() as executor:
for url in urls:
futures.append(executor.submit(crawl_url, url))
results = [future.result() for future in futures]
for url, result in zip(urls, results):
ref_number = f"REF_{len(user_session['context']) + 1}"
user_session["context"][ref_number] = {
"url": url,
"content": result
}
# for url in urls:
# # Crawl the content of each URL and add it to the session context with a reference number
# ref_number = f"REF_{len(user_session['context']) + 1}"
# crawled_content = crawl_url(url)
# user_session["context"][ref_number] = {
# "url": url,
# "content": crawled_content
# }
user_session["history"].append({
"role": "user",
"content": message.content
})
# Create a system message that includes the context
context_messages = [
f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
for ref, data in user_session["context"].items()
]
if context_messages:
system_message = {
"role": "system",
"content": (
"You are a helpful bot. Use the following context for answering questions. "
"Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
"If the question requires any information from the provided appendices or context, refer to the sources. "
"If not, there is no need to add a references section. "
"At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
"\n\n".join(context_messages)
)
}
else:
system_message = {
"role": "system",
"content": "You are a helpful assistant."
}
msg = cl.Message(content="")
await msg.send()
# Get response from the LLM
stream = await client.chat.completions.create(
messages=[
system_message,
*user_session["history"]
],
stream=True,
**settings
)
assistant_response = ""
async for part in stream:
if token := part.choices[0].delta.content:
assistant_response += token
await msg.stream_token(token)
# Add assistant message to the history
user_session["history"].append({
"role": "assistant",
"content": assistant_response
})
await msg.update()
# Append the reference section to the assistant's response
reference_section = "\n\nReferences:\n"
for ref, data in user_session["context"].items():
reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n"
msg.content += reference_section
await msg.update()
@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.AudioChunk):
if chunk.isStart:
buffer = BytesIO()
# This is required for whisper to recognize the file type
buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}"
# Initialize the session for a new audio stream
cl.user_session.set("audio_buffer", buffer)
cl.user_session.set("audio_mime_type", chunk.mimeType)
# Write the chunks to a buffer and transcribe the whole audio at the end
cl.user_session.get("audio_buffer").write(chunk.data)
pass
@cl.step(type="tool")
async def speech_to_text(audio_file):
cli = Groq()
# response = cli.audio.transcriptions.create(
# file=audio_file, #(filename, file.read()),
# model="whisper-large-v3",
# )
response = await client.audio.transcriptions.create(
model="whisper-large-v3", file=audio_file
)
return response.text
@cl.on_audio_end
async def on_audio_end(elements: list[ElementBased]):
# Get the audio buffer from the session
audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
audio_buffer.seek(0) # Move the file pointer to the beginning
audio_file = audio_buffer.read()
audio_mime_type: str = cl.user_session.get("audio_mime_type")
# input_audio_el = cl.Audio(
# mime=audio_mime_type, content=audio_file, name=audio_buffer.name
# )
# await cl.Message(
# author="You",
# type="user_message",
# content="",
# elements=[input_audio_el, *elements]
# ).send()
# answer_message = await cl.Message(content="").send()
start_time = time.time()
whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
transcription = await speech_to_text(whisper_input)
end_time = time.time()
print(f"Transcription took {end_time - start_time} seconds")
user_msg = cl.Message(
author="You",
type="user_message",
content=transcription
)
await user_msg.send()
await on_message(user_msg)
# images = [file for file in elements if "image" in file.mime]
# text_answer = await generate_text_answer(transcription, images)
# output_name, output_audio = await text_to_speech(text_answer, audio_mime_type)
# output_audio_el = cl.Audio(
# name=output_name,
# auto_play=True,
# mime=audio_mime_type,
# content=output_audio,
# )
# answer_message.elements = [output_audio_el]
# answer_message.content = transcription
# await answer_message.update()
if __name__ == "__main__":
from chainlit.cli import run_chainlit
run_chainlit(__file__)

View File

@@ -2,20 +2,39 @@
Crawl4AI, the **#1 trending GitHub repository**, streamlines web content extraction into AI-ready formats. Perfect for AI assistants, semantic search engines, or data pipelines, Crawl4AI transforms raw HTML into structured Markdown or JSON effortlessly. Integrate with LLMs, open-source models, or your own retrieval-augmented generation workflows.
**Key Links:**
- **Website:** [https://crawl4ai.com](https://crawl4ai.com)
- **GitHub:** [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
- **Colab Notebook:** [Try on Google Colab](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing)
- **Quickstart Code Example:** [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py)
- **Examples Folder:** [Crawl4AI Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples)
**What Crawl4AI is not:**
Crawl4AI is not a replacement for traditional web scraping libraries, Selenium, or Playwright. It's not designed as a general-purpose web automation tool. Instead, Crawl4AI has a specific, focused goal:
- To generate perfect, AI-friendly data (particularly for LLMs) from web content
- To maximize speed and efficiency in data extraction and processing
- To operate at scale, from Raspberry Pi to cloud infrastructures
Crawl4AI is engineered with a "scale-first" mindset, aiming to handle millions of links while maintaining exceptional performance. It's super efficient and fast, optimized to:
1. Transform raw web content into structured, LLM-ready formats (Markdown/JSON)
2. Implement intelligent extraction strategies to reduce reliance on costly API calls
3. Provide a streamlined pipeline for AI data preparation and ingestion
In essence, Crawl4AI bridges the gap between web content and AI systems, focusing on delivering high-quality, processed data rather than offering broad web automation capabilities.
**Key Links:**
- **Website:** [https://crawl4ai.com](https://crawl4ai.com)
- **GitHub:** [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
- **Colab Notebook:** [Try on Google Colab](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing)
- **Quickstart Code Example:** [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py)
- **Examples Folder:** [Crawl4AI Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples)
---
## Table of Contents
- [Crawl4AI Quick Start Guide: Your All-in-One AI-Ready Web Crawling \& AI Integration Solution](#crawl4ai-quick-start-guide-your-all-in-one-ai-ready-web-crawling--ai-integration-solution)
- [Table of Contents](#table-of-contents)
- [1. Introduction \& Key Concepts](#1-introduction--key-concepts)
- [2. Installation \& Environment Setup](#2-installation--environment-setup)
- [Test Your Installation](#test-your-installation)
- [3. Core Concepts \& Configuration](#3-core-concepts--configuration)
- [4. Basic Crawling \& Simple Extraction](#4-basic-crawling--simple-extraction)
- [5. Markdown Generation \& AI-Optimized Output](#5-markdown-generation--ai-optimized-output)
@@ -38,15 +57,17 @@ Crawl4AI, the **#1 trending GitHub repository**, streamlines web content extract
---
## 1. Introduction & Key Concepts
Crawl4AI transforms websites into structured, AI-friendly data. It efficiently handles large-scale crawling, integrates with both proprietary and open-source LLMs, and optimizes content for semantic search or RAG pipelines.
**Quick Test:**
```python
import asyncio
from crawl4ai import AsyncWebCrawler
async def test_run():
async with AsyncWebCrawler(verbose=True) as crawler:
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com")
print(result.markdown)
@@ -60,12 +81,41 @@ If you see Markdown output, everything is working!
---
## 2. Installation & Environment Setup
```bash
# Install the package
pip install crawl4ai
crawl4ai-setup
playwright install chromium
# Install Playwright with system dependencies (recommended)
playwright install --with-deps # Installs all browsers
# Or install specific browsers:
playwright install --with-deps chrome # Recommended for Colab/Linux
playwright install --with-deps firefox
playwright install --with-deps webkit
playwright install --with-deps chromium
# Keep Playwright updated periodically
playwright install
```
> **Note**: For Google Colab and some Linux environments, use `chrome` instead of `chromium` - it tends to work more reliably.
### Test Your Installation
Try these one-liners:
```python
# Visible browser test
python -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); browser = p.chromium.launch(headless=False); page = browser.new_page(); page.goto('https://example.com'); input('Press Enter to close...')"
# Headless test (for servers/CI)
python -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); browser = p.chromium.launch(headless=True); page = browser.new_page(); page.goto('https://example.com'); print(f'Title: {page.title()}'); browser.close()"
```
You should see a browser window (in visible test) loading example.com. If you get errors, try with Firefox using `playwright install --with-deps firefox`.
**Try in Colab:**
[Open Colab Notebook](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing)
@@ -74,16 +124,19 @@ playwright install chromium
---
## 3. Core Concepts & Configuration
Use `AsyncWebCrawler`, `CrawlerRunConfig`, and `BrowserConfig` to control crawling.
**Example config:**
```python
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
browser_config = BrowserConfig(
headless=True,
viewport_width=1920,
viewport_height=1080,
verbose=True,
viewport_width=1080,
viewport_height=600,
text_mode=False,
ignore_https_errors=True,
java_script_enabled=True
@@ -97,7 +150,7 @@ run_config = CrawlerRunConfig(
wait_for="css:.article-loaded",
page_timeout=60000,
delay_before_return_html=1.0,
mean_delay=0.1,
mean_delay=0.1,
max_range=0.3,
process_iframes=True,
remove_overlay_elements=True,
@@ -115,15 +168,17 @@ run_config = CrawlerRunConfig(
```
**Prefixes:**
- `http://` or `https://` for live pages
- `file://local.html` for local
- `raw:<html>` for raw HTML strings
- `http://` or `https://` for live pages
- `file://local.html` for local
- `raw:<html>` for raw HTML strings
**More info:** [See /docs/async_webcrawler](#) or [3_async_webcrawler.ex.md](https://github.com/unclecode/crawl4ai/blob/main/async_webcrawler.ex.md)
---
## 4. Basic Crawling & Simple Extraction
```python
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun("https://news.example.com/article", config=run_config)
@@ -137,13 +192,15 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
## 5. Markdown Generation & AI-Optimized Output
After crawling, `result.markdown_v2` provides:
- `raw_markdown`: Unfiltered markdown
- `markdown_with_citations`: Links as references at the bottom
- `references_markdown`: A separate list of reference links
- `fit_markdown`: Filtered, relevant markdown (e.g., after BM25)
- `fit_html`: The HTML used to produce `fit_markdown`
- `raw_markdown`: Unfiltered markdown
- `markdown_with_citations`: Links as references at the bottom
- `references_markdown`: A separate list of reference links
- `fit_markdown`: Filtered, relevant markdown (e.g., after BM25)
- `fit_html`: The HTML used to produce `fit_markdown`
**Example:**
```python
print("RAW:", result.markdown_v2.raw_markdown[:200])
print("CITED:", result.markdown_v2.markdown_with_citations[:200])
@@ -158,9 +215,11 @@ For AI training, `fit_markdown` focuses on the most relevant content.
---
## 6. Structured Data Extraction (CSS, XPath, LLM)
Extract JSON data without LLMs:
**CSS:**
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
@@ -176,6 +235,7 @@ run_config.extraction_strategy = JsonCssExtractionStrategy(schema)
```
**XPath:**
```python
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
@@ -195,6 +255,7 @@ run_config.extraction_strategy = JsonXPathExtractionStrategy(xpath_schema)
---
## 7. Advanced Extraction: LLM & Open-Source Models
Use LLMExtractionStrategy for complex tasks. Works with OpenAI or open-source models (e.g., Ollama).
```python
@@ -217,7 +278,9 @@ run_config.extraction_strategy = LLMExtractionStrategy(
---
## 8. Page Interactions, JS Execution, & Dynamic Content
Insert `js_code` and use `wait_for` to ensure content loads. Example:
```python
run_config.js_code = """
(async () => {
@@ -233,6 +296,7 @@ run_config.wait_for = "css:.item-loaded"
---
## 9. Media, Links, & Metadata Handling
`result.media["images"]`: List of images with `src`, `score`, `alt`. Score indicates relevance.
`result.media["videos"]`, `result.media["audios"]` similarly hold media info.
@@ -242,6 +306,7 @@ run_config.wait_for = "css:.item-loaded"
`result.metadata`: Title, description, keywords, author.
**Example:**
```python
# Images
for img in result.media["images"]:
@@ -263,30 +328,37 @@ print("Description:", result.metadata["description"])
## 10. Authentication & Identity Preservation
### Manual Setup via User Data Directory
1. **Open Chrome with a custom user data dir:**
```bash
"C:\Program Files\Google\Chrome\Application\chrome.exe" --user-data-dir="C:\MyChromeProfile"
```
On macOS:
```bash
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --user-data-dir="/Users/username/ChromeProfiles/MyProfile"
```
```bash
"C:\Program Files\Google\Chrome\Application\chrome.exe" --user-data-dir="C:\MyChromeProfile"
```
On macOS:
```bash
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --user-data-dir="/Users/username/ChromeProfiles/MyProfile"
```
2. **Log in to sites, solve CAPTCHAs, adjust settings manually.**
The browser saves cookies/localStorage in that directory.
3. **Use `user_data_dir` in `BrowserConfig`:**
```python
browser_config = BrowserConfig(
headless=True,
user_data_dir="/Users/username/ChromeProfiles/MyProfile"
)
```
Now the crawler starts with those cookies, sessions, etc.
```python
browser_config = BrowserConfig(
headless=True,
user_data_dir="/Users/username/ChromeProfiles/MyProfile"
)
```
Now the crawler starts with those cookies, sessions, etc.
### Using `storage_state`
Alternatively, export and reuse storage states:
```python
browser_config = BrowserConfig(
headless=True,
@@ -301,7 +373,9 @@ No repeated logins needed.
---
## 11. Proxy & Security Enhancements
Use `proxy_config` for authenticated proxies:
```python
browser_config.proxy_config = {
"server": "http://proxy.example.com:8080",
@@ -317,6 +391,7 @@ Combine with `headers` or `ignore_https_errors` as needed.
---
## 12. Screenshots, PDFs & File Downloads
Enable `screenshot=True` or `pdf=True` in `CrawlerRunConfig`:
```python
@@ -325,6 +400,7 @@ run_config.pdf = True
```
After crawling:
```python
if result.screenshot:
with open("page.png", "wb") as f:
@@ -336,6 +412,7 @@ if result.pdf:
```
**File Downloads:**
```python
browser_config.accept_downloads = True
browser_config.downloads_path = "./downloads"
@@ -351,7 +428,9 @@ Also [10_file_download.md](https://github.com/unclecode/crawl4ai/blob/main/file_
---
## 13. Caching & Performance Optimization
Set `cache_mode` to reuse fetch results:
```python
from crawl4ai import CacheMode
run_config.cache_mode = CacheMode.ENABLED
@@ -364,11 +443,13 @@ Adjust delays, increase concurrency, or use `text_mode=True` for faster extracti
---
## 14. Hooks for Custom Logic
Hooks let you run code at specific lifecycle events without creating pages manually in `on_browser_created`.
Use `on_page_context_created` to apply routing or modify page contexts before crawling the URL:
**Example Hook:**
```python
async def on_page_context_created_hook(context, page, **kwargs):
# Block all images to speed up load
@@ -388,21 +469,25 @@ This hook is clean and doesnt create a separate page itself—it just modifie
---
## 15. Dockerization & Scaling
Use Docker images:
- AMD64 basic:
- AMD64 basic:
```bash
docker pull unclecode/crawl4ai:basic-amd64
docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64
```
- ARM64 for M1/M2:
- ARM64 for M1/M2:
```bash
docker pull unclecode/crawl4ai:basic-arm64
docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64
```
- GPU support:
- GPU support:
```bash
docker pull unclecode/crawl4ai:gpu-amd64
docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu-amd64
@@ -415,25 +500,28 @@ Scale with load balancers or Kubernetes.
---
## 16. Troubleshooting & Common Pitfalls
- Empty results? Relax filters, check selectors.
- Timeouts? Increase `page_timeout` or refine `wait_for`.
- CAPTCHAs? Use `user_data_dir` or `storage_state` after manual solving.
- JS errors? Try headful mode for debugging.
- Empty results? Relax filters, check selectors.
- Timeouts? Increase `page_timeout` or refine `wait_for`.
- CAPTCHAs? Use `user_data_dir` or `storage_state` after manual solving.
- JS errors? Try headful mode for debugging.
Check [examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) & [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) for more code.
---
## 17. Comprehensive End-to-End Example
Combine hooks, JS execution, PDF saving, LLM extraction—see [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) for a full example.
---
## 18. Further Resources & Community
- **Docs:** [https://crawl4ai.com](https://crawl4ai.com)
- **Issues & PRs:** [https://github.com/unclecode/crawl4ai/issues](https://github.com/unclecode/crawl4ai/issues)
- **Docs:** [https://crawl4ai.com](https://crawl4ai.com)
- **Issues & PRs:** [https://github.com/unclecode/crawl4ai/issues](https://github.com/unclecode/crawl4ai/issues)
Follow [@unclecode](https://x.com/unclecode) for news & community updates.
**Happy Crawling!**
Leverage Crawl4AI to feed your AI models with clean, structured web data today.
Leverage Crawl4AI to feed your AI models with clean, structured web data today.

View File

@@ -65,7 +65,7 @@
#### `viewport_width` and `viewport_height`
- **Description**: Sets the default browser viewport dimensions.
- Default: `1920` (width), `1080` (height)
- Default: `1080` (width), `600` (height)
- **Use Case**:
- Adjust for crawling responsive layouts or specific device emulations.
@@ -134,6 +134,19 @@
- **Use Case**:
- Use for advanced browser configurations like WebRTC or GPU tuning.
#### `verbose`
- **Description**: Enable verbose logging of browser operations.
- Default: `True`
- **Use Case**:
- Enable for detailed logging during development and debugging.
- Disable in production for better performance.
#### `sleep_on_close`
- **Description**: Adds a delay before closing the browser.
- Default: `False`
- **Use Case**:
- Enable when you need to ensure all browser operations are complete before closing.
## CrawlerRunConfig
The `CrawlerRunConfig` class centralizes parameters for controlling crawl operations. This configuration covers content extraction, page interactions, caching, and runtime behaviors. Below is an exhaustive breakdown of parameters and their best-use scenarios.
@@ -341,3 +354,37 @@ The `CrawlerRunConfig` class centralizes parameters for controlling crawl operat
- **Use Case**:
- Enable when debugging JavaScript errors on pages.
##### `parser_type`
- **Description**: Type of parser to use for HTML parsing.
- Default: `"lxml"`
- **Use Case**:
- Use when specific HTML parsing requirements are needed.
- `"lxml"` provides good performance and standards compliance.
##### `prettiify`
- **Description**: Apply `fast_format_html` to produce prettified HTML output.
- Default: `False`
- **Use Case**:
- Enable for better readability of extracted HTML content.
- Useful during development and debugging.
##### `fetch_ssl_certificate`
- **Description**: Fetch and store SSL certificate information during crawling.
- Default: `False`
- **Use Case**:
- Enable when SSL certificate analysis is required.
- Useful for security audits and certificate validation.
##### `url`
- **Description**: Target URL for the crawl operation.
- Default: `None`
- **Use Case**:
- Set when initializing a crawler for a specific URL.
- Can be overridden during actual crawl operations.
##### `log_console`
- **Description**: Log browser console messages during crawling.
- Default: `False`
- **Use Case**:
- Enable to capture JavaScript console output.
- Useful for debugging JavaScript-heavy pages.

View File

@@ -3,6 +3,7 @@
This document provides a comprehensive, human-oriented overview of the `AsyncWebCrawler` class and related components from the `crawl4ai` package. It explains the motivations behind asynchronous crawling, shows how to configure and run crawls, and provides examples for advanced features like dynamic content handling, extraction strategies, caching, containerization, and troubleshooting.
## Introduction
[EDIT: This is not a good way to introduce the library. The library excels at generating crawl data in the form of markdown or extracted JSON as quickly as possible. It is designed to be efficient in terms of memory and CPU usage. Users should choose this library because it generates markdown suitable for large language models and AI. Additionally, it can create structured data, which is beneficial because it supports attaching large language models to generate structured data. It also includes techniques like JSON CSS and JSON XPath extraction, allowing users to define patterns and extract data quickly. One of the library's strengths is its ability to work everywhere. It can crawl any website by offering various capabilities, such as connecting to a remote browser or using persistent data. This feature allows developers to create their own identity on websites where they have authentication access, enabling them to crawl without being mistakenly identified as a bot. This is a better way to introduce the library. In these documents, we discuss the main object, the main class, Asinggull crawlers, and all the functionalities we can achieve with this Asinggull crawler.]
Crawling websites can be slow if done sequentially, especially when handling large numbers of URLs or rendering dynamic pages. Asynchronous crawling helps you run multiple operations concurrently, improving throughput and performance. The `AsyncWebCrawler` class leverages asynchronous I/O and browser automation tools to fetch content efficiently, handle complex DOM interactions, and extract structured data.

View File

@@ -74,9 +74,10 @@ The Markdown generation process transforms raw HTML into a structured format. At
```python
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
options={
"ignore_links": True,

View File

@@ -310,22 +310,6 @@ response = requests.post("http://localhost:11235/crawl", json=request)
> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure!
## Usage Examples 📝
### Basic Crawling