## Extraction Strategies
Powerful data extraction from web pages using LLM-based intelligent parsing or fast schema/pattern-based approaches.
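All strategies share the same workflow: build a strategy, attach it to a `CrawlerRunConfig`, run `arun()`, and parse `result.extracted_content` as JSON. A minimal sketch of that shared pattern (the URL and schema here are placeholders for illustration, not a real site):

```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def minimal_extraction():
    # Any extraction strategy plugs into CrawlerRunConfig the same way
    strategy = JsonCssExtractionStrategy({
        "name": "Headlines",  # illustrative schema, not tied to a real page
        "baseSelector": "article",
        "fields": [{"name": "title", "selector": "h2", "type": "text"}]
    })

    config = CrawlerRunConfig(extraction_strategy=strategy)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        if result.success:
            # Every strategy returns its results as a JSON string
            print(json.loads(result.extracted_content))

asyncio.run(minimal_extraction())
```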
### LLM-Based Extraction - Intelligent Content Understanding
```python
import os
import asyncio
import json
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy

# Define structured data model
class Product(BaseModel):
    name: str = Field(description="Product name")
    price: str = Field(description="Product price")
    description: str = Field(description="Product description")
    features: List[str] = Field(description="List of product features")
    rating: float = Field(description="Product rating out of 5")

# Configure LLM provider
llm_config = LLMConfig(
    provider="openai/gpt-4o-mini",  # or "ollama/llama3.3", "anthropic/claude-3-5-sonnet"
    api_token=os.getenv("OPENAI_API_KEY"),  # or "env:OPENAI_API_KEY"
    temperature=0.1,
    max_tokens=2000
)

# Create LLM extraction strategy
llm_strategy = LLMExtractionStrategy(
    llm_config=llm_config,
    schema=Product.model_json_schema(),
    extraction_type="schema",  # or "block" for freeform text
    instruction="""
    Extract product information from the webpage content.
    Focus on finding complete product details including:
    - Product name and price
    - Detailed description
    - All listed features
    - Customer rating if available
    Return valid JSON array of products.
    """,
    chunk_token_threshold=1200,  # Split content if too large
    overlap_rate=0.1,            # 10% overlap between chunks
    apply_chunking=True,         # Enable automatic chunking
    input_format="markdown",     # "html", "fit_markdown", or "markdown"
    extra_args={"temperature": 0.0, "max_tokens": 800},
    verbose=True
)

async def extract_with_llm():
    browser_config = BrowserConfig(headless=True)

    crawl_config = CrawlerRunConfig(
        extraction_strategy=llm_strategy,
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=10
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com/products",
            config=crawl_config
        )

        if result.success:
            # Parse extracted JSON
            products = json.loads(result.extracted_content)
            print(f"Extracted {len(products)} products")

            for product in products[:3]:  # Show first 3
                print(f"Product: {product['name']}")
                print(f"Price: {product['price']}")
                print(f"Rating: {product.get('rating', 'N/A')}")

            # Show token usage and cost
            llm_strategy.show_usage()
        else:
            print(f"Extraction failed: {result.error_message}")

asyncio.run(extract_with_llm())
```
### LLM Strategy Advanced Configuration
```python
# Multiple provider configurations
providers = {
    "openai": LLMConfig(
        provider="openai/gpt-4o",
        api_token="env:OPENAI_API_KEY",
        temperature=0.1
    ),
    "anthropic": LLMConfig(
        provider="anthropic/claude-3-5-sonnet-20240620",
        api_token="env:ANTHROPIC_API_KEY",
        max_tokens=4000
    ),
    "ollama": LLMConfig(
        provider="ollama/llama3.3",
        api_token=None,  # Not needed for Ollama
        base_url="http://localhost:11434"
    ),
    "groq": LLMConfig(
        provider="groq/llama3-70b-8192",
        api_token="env:GROQ_API_KEY"
    )
}

# Advanced chunking for large content
large_content_strategy = LLMExtractionStrategy(
    llm_config=providers["openai"],
    schema=YourModel.model_json_schema(),  # your own Pydantic model
    extraction_type="schema",
    instruction="Extract detailed information...",

    # Chunking parameters
    chunk_token_threshold=2000,  # Larger chunks for complex content
    overlap_rate=0.15,           # More overlap for context preservation
    apply_chunking=True,

    # Input format selection
    input_format="fit_markdown",  # Use filtered content if available

    # LLM parameters
    extra_args={
        "temperature": 0.0,  # Deterministic output
        "top_p": 0.9,
        "frequency_penalty": 0.1,
        "presence_penalty": 0.1,
        "max_tokens": 1500
    },
    verbose=True
)

# Knowledge graph extraction
class Entity(BaseModel):
    name: str
    type: str  # "person", "organization", "location", etc.
    description: str

class Relationship(BaseModel):
    source: str
    target: str
    relationship: str
    confidence: float

class KnowledgeGraph(BaseModel):
    entities: List[Entity]
    relationships: List[Relationship]
    summary: str

knowledge_strategy = LLMExtractionStrategy(
    llm_config=providers["anthropic"],
    schema=KnowledgeGraph.model_json_schema(),
    extraction_type="schema",
    instruction="""
    Create a knowledge graph from the content by:
    1. Identifying key entities (people, organizations, locations, concepts)
    2. Finding relationships between entities
    3. Providing confidence scores for relationships
    4. Summarizing the main topics
    """,
    input_format="html",  # Use HTML for better structure preservation
    apply_chunking=True,
    chunk_token_threshold=1500
)
```
### JSON CSS Extraction - Fast Schema-Based Extraction
```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# Basic CSS extraction schema
simple_schema = {
    "name": "Product Listings",
    "baseSelector": "div.product-card",
    "fields": [
        {
            "name": "title",
            "selector": "h2.product-title",
            "type": "text"
        },
        {
            "name": "price",
            "selector": ".price",
            "type": "text"
        },
        {
            "name": "image_url",
            "selector": "img.product-image",
            "type": "attribute",
            "attribute": "src"
        },
        {
            "name": "product_url",
            "selector": "a.product-link",
            "type": "attribute",
            "attribute": "href"
        }
    ]
}

# Complex nested schema with multiple data types
complex_schema = {
    "name": "E-commerce Product Catalog",
    "baseSelector": "div.category",
    "baseFields": [
        {
            "name": "category_id",
            "type": "attribute",
            "attribute": "data-category-id"
        },
        {
            "name": "category_url",
            "type": "attribute",
            "attribute": "data-url"
        }
    ],
    "fields": [
        {
            "name": "category_name",
            "selector": "h2.category-title",
            "type": "text"
        },
        {
            "name": "products",
            "selector": "div.product",
            "type": "nested_list",  # Array of complex objects
            "fields": [
                {
                    "name": "name",
                    "selector": "h3.product-name",
                    "type": "text",
                    "default": "Unknown Product"
                },
                {
                    "name": "price",
                    "selector": "span.price",
                    "type": "text"
                },
                {
                    "name": "details",
                    "selector": "div.product-details",
                    "type": "nested",  # Single complex object
                    "fields": [
                        {
                            "name": "brand",
                            "selector": "span.brand",
                            "type": "text"
                        },
                        {
                            "name": "model",
                            "selector": "span.model",
                            "type": "text"
                        },
                        {
                            "name": "specs",
                            "selector": "div.specifications",
                            "type": "html"  # Preserve HTML structure
                        }
                    ]
                },
                {
                    "name": "features",
                    "selector": "ul.features li",
                    "type": "list",  # Simple array of strings
                    "fields": [
                        {"name": "feature", "type": "text"}
                    ]
                },
                {
                    "name": "reviews",
                    "selector": "div.review",
                    "type": "nested_list",
                    "fields": [
                        {
                            "name": "reviewer",
                            "selector": "span.reviewer-name",
                            "type": "text"
                        },
                        {
                            "name": "rating",
                            "selector": "span.rating",
                            "type": "attribute",
                            "attribute": "data-rating"
                        },
                        {
                            "name": "comment",
                            "selector": "p.review-text",
                            "type": "text"
                        },
                        {
                            "name": "date",
                            "selector": "time.review-date",
                            "type": "attribute",
                            "attribute": "datetime"
                        }
                    ]
                }
            ]
        }
    ]
}

async def extract_with_css_schema():
    strategy = JsonCssExtractionStrategy(complex_schema, verbose=True)

    config = CrawlerRunConfig(
        extraction_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
        # Enable dynamic content loading if needed
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="css:.product:nth-child(10)",  # Wait for products to load
        process_iframes=True
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/catalog",
            config=config
        )

        if result.success:
            data = json.loads(result.extracted_content)
            print(f"Extracted {len(data)} categories")

            for category in data:
                print(f"Category: {category['category_name']}")
                print(f"Products: {len(category.get('products', []))}")

                # Show first product details
                if category.get('products'):
                    product = category['products'][0]
                    print(f"  First product: {product.get('name')}")
                    print(f"  Features: {len(product.get('features', []))}")
                    print(f"  Reviews: {len(product.get('reviews', []))}")

asyncio.run(extract_with_css_schema())
```
### Automatic Schema Generation - One-Time LLM, Unlimited Use
```python
import json
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def generate_and_use_schema():
    """
    1. Use LLM once to generate schema from sample HTML
    2. Cache the schema for reuse
    3. Use cached schema for fast extraction without LLM calls
    """

    cache_dir = Path("./schema_cache")
    cache_dir.mkdir(exist_ok=True)
    schema_file = cache_dir / "ecommerce_schema.json"

    # Step 1: Generate or load cached schema
    if schema_file.exists():
        schema = json.loads(schema_file.read_text())
        print("Using cached schema")
    else:
        print("Generating schema using LLM...")

        # Configure LLM for schema generation
        llm_config = LLMConfig(
            provider="openai/gpt-4o",  # or "ollama/llama3.3" for local
            api_token="env:OPENAI_API_KEY"
        )

        # Get sample HTML from target site
        async with AsyncWebCrawler() as crawler:
            sample_result = await crawler.arun(
                url="https://example.com/products",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
            sample_html = sample_result.cleaned_html[:5000]  # Use first 5k chars

        # Generate schema automatically (ONE-TIME LLM COST)
        schema = JsonCssExtractionStrategy.generate_schema(
            html=sample_html,
            schema_type="css",
            llm_config=llm_config,
            instruction="Extract product information including name, price, description, and features"
        )

        # Cache schema for future use (NO MORE LLM CALLS)
        schema_file.write_text(json.dumps(schema, indent=2))
        print("Schema generated and cached")

    # Step 2: Use schema for fast extraction (NO LLM CALLS)
    strategy = JsonCssExtractionStrategy(schema, verbose=True)

    config = CrawlerRunConfig(
        extraction_strategy=strategy,
        cache_mode=CacheMode.BYPASS
    )

    # Step 3: Extract from multiple pages using same schema
    urls = [
        "https://example.com/products",
        "https://example.com/electronics",
        "https://example.com/books"
    ]

    async with AsyncWebCrawler() as crawler:
        for url in urls:
            result = await crawler.arun(url=url, config=config)

            if result.success:
                data = json.loads(result.extracted_content)
                print(f"{url}: Extracted {len(data)} items")
            else:
                print(f"{url}: Failed - {result.error_message}")

asyncio.run(generate_and_use_schema())
```
### XPath Extraction Strategy
```python
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy

# XPath-based schema (alternative to CSS)
xpath_schema = {
    "name": "News Articles",
    "baseSelector": "//article[@class='news-item']",
    "baseFields": [
        {
            "name": "article_id",
            "type": "attribute",
            "attribute": "data-id"
        }
    ],
    "fields": [
        {
            "name": "headline",
            "selector": ".//h2[@class='headline']",
            "type": "text"
        },
        {
            "name": "author",
            "selector": ".//span[@class='author']/text()",
            "type": "text"
        },
        {
            "name": "publish_date",
            "selector": ".//time/@datetime",
            "type": "text"
        },
        {
            "name": "content",
            "selector": ".//div[@class='article-body']",
            "type": "html"
        },
        {
            "name": "tags",
            "selector": ".//div[@class='tags']/span[@class='tag']",
            "type": "list",
            "fields": [
                {"name": "tag", "type": "text"}
            ]
        }
    ]
}

# Generate XPath schema automatically
async def generate_xpath_schema():
    llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None)

    sample_html = """
    <article class="news-item" data-id="123">
        <h2 class="headline">Breaking News</h2>
        <span class="author">John Doe</span>
        <time datetime="2024-01-01">Today</time>
        <div class="article-body"><p>Content here...</p></div>
    </article>
    """

    schema = JsonXPathExtractionStrategy.generate_schema(
        html=sample_html,
        schema_type="xpath",
        llm_config=llm_config
    )

    return schema

# Use XPath strategy
xpath_strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
```
### Regex Extraction Strategy - Pattern-Based Fast Extraction
```python
import asyncio
import json
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy

# Built-in patterns for common data types
async def extract_with_builtin_patterns():
    # Use multiple built-in patterns
    strategy = RegexExtractionStrategy(
        pattern=(
            RegexExtractionStrategy.Email |
            RegexExtractionStrategy.PhoneUS |
            RegexExtractionStrategy.Url |
            RegexExtractionStrategy.Currency |
            RegexExtractionStrategy.DateIso
        )
    )

    config = CrawlerRunConfig(extraction_strategy=strategy)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/contact",
            config=config
        )

        if result.success:
            matches = json.loads(result.extracted_content)

            # Group by pattern type
            by_type = {}
            for match in matches:
                label = match['label']
                if label not in by_type:
                    by_type[label] = []
                by_type[label].append(match['value'])

            for pattern_type, values in by_type.items():
                print(f"{pattern_type}: {len(values)} matches")
                for value in values[:3]:  # Show first 3
                    print(f"  {value}")

# Custom regex patterns
custom_patterns = {
    "product_code": r"SKU-\d{4,6}",
    "discount": r"\d{1,2}%\s*off",
    "model_number": r"Model:\s*([A-Z0-9-]+)"
}

async def extract_with_custom_patterns():
    strategy = RegexExtractionStrategy(custom=custom_patterns)

    config = CrawlerRunConfig(extraction_strategy=strategy)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/products",
            config=config
        )

        if result.success:
            data = json.loads(result.extracted_content)
            for item in data:
                print(f"{item['label']}: {item['value']}")

# LLM-generated patterns (one-time cost)
async def generate_custom_patterns():
    cache_file = Path("./patterns/price_patterns.json")

    if cache_file.exists():
        patterns = json.loads(cache_file.read_text())
    else:
        llm_config = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token="env:OPENAI_API_KEY"
        )

        # Get sample content
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://example.com/pricing")
            sample_html = result.cleaned_html

        # Generate optimized patterns
        patterns = RegexExtractionStrategy.generate_pattern(
            label="pricing_info",
            html=sample_html,
            query="Extract all pricing information including discounts and special offers",
            llm_config=llm_config
        )

        # Cache for reuse
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        cache_file.write_text(json.dumps(patterns, indent=2))

    # Use cached patterns (no more LLM calls)
    strategy = RegexExtractionStrategy(custom=patterns)
    return strategy

asyncio.run(extract_with_builtin_patterns())
asyncio.run(extract_with_custom_patterns())
```
### Complete Extraction Workflow - Combining Strategies
```python
import asyncio
import json
import time
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    RegexExtractionStrategy,
    LLMExtractionStrategy
)

async def multi_strategy_extraction():
    """
    Demonstrate using multiple extraction strategies in sequence:
    1. Fast regex for common patterns
    2. Schema-based for structured data
    3. LLM for complex reasoning
    """

    browser_config = BrowserConfig(headless=True)

    # Strategy 1: Fast regex extraction
    regex_strategy = RegexExtractionStrategy(
        pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS
    )

    # Strategy 2: Schema-based structured extraction
    product_schema = {
        "name": "Products",
        "baseSelector": "div.product",
        "fields": [
            {"name": "name", "selector": "h3", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
            {"name": "rating", "selector": ".rating", "type": "attribute", "attribute": "data-rating"}
        ]
    }
    css_strategy = JsonCssExtractionStrategy(product_schema)

    # Strategy 3: LLM for complex analysis
    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"),
        schema={
            "type": "object",
            "properties": {
                "sentiment": {"type": "string"},
                "key_topics": {"type": "array", "items": {"type": "string"}},
                "summary": {"type": "string"}
            }
        },
        extraction_type="schema",
        instruction="Analyze the content sentiment, extract key topics, and provide a summary"
    )

    url = "https://example.com/product-reviews"

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Extract contact info with regex
        regex_config = CrawlerRunConfig(extraction_strategy=regex_strategy)
        regex_result = await crawler.arun(url=url, config=regex_config)

        # Extract structured product data
        css_config = CrawlerRunConfig(extraction_strategy=css_strategy)
        css_result = await crawler.arun(url=url, config=css_config)

        # Extract insights with LLM
        llm_run_config = CrawlerRunConfig(extraction_strategy=llm_strategy)
        llm_result = await crawler.arun(url=url, config=llm_run_config)

        # Combine results
        results = {
            "contacts": json.loads(regex_result.extracted_content) if regex_result.success else [],
            "products": json.loads(css_result.extracted_content) if css_result.success else [],
            "analysis": json.loads(llm_result.extracted_content) if llm_result.success else {}
        }

        print(f"Found {len(results['contacts'])} contact entries")
        print(f"Found {len(results['products'])} products")
        print(f"Sentiment: {results['analysis'].get('sentiment', 'N/A')}")

        return results

# Performance comparison
async def compare_extraction_performance():
    """Compare speed and accuracy of different strategies"""

    url = "https://example.com/large-catalog"

    strategies = {
        "regex": RegexExtractionStrategy(pattern=RegexExtractionStrategy.Currency),
        "css": JsonCssExtractionStrategy({
            "name": "Prices",
            "baseSelector": ".price",
            "fields": [{"name": "amount", "selector": "span", "type": "text"}]
        }),
        "llm": LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"),
            instruction="Extract all prices from the content",
            extraction_type="block"
        )
    }

    async with AsyncWebCrawler() as crawler:
        for name, strategy in strategies.items():
            start_time = time.time()

            config = CrawlerRunConfig(extraction_strategy=strategy)
            result = await crawler.arun(url=url, config=config)

            duration = time.time() - start_time

            if result.success:
                data = json.loads(result.extracted_content)
                print(f"{name}: {len(data)} items in {duration:.2f}s")
            else:
                print(f"{name}: Failed in {duration:.2f}s")

asyncio.run(multi_strategy_extraction())
asyncio.run(compare_extraction_performance())
```
### Best Practices and Strategy Selection
```python
# Strategy selection guide
def choose_extraction_strategy(use_case):
    """
    Guide for selecting the right extraction strategy
    """

    strategies = {
        # Fast pattern matching for common data types
        "contact_info": RegexExtractionStrategy(
            pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS
        ),

        # Structured data from consistent HTML
        "product_catalogs": JsonCssExtractionStrategy,

        # Complex reasoning and semantic understanding
        "content_analysis": LLMExtractionStrategy,

        # Mixed approach for comprehensive extraction
        "complete_site_analysis": "multi_strategy"
    }

    recommendations = {
        "speed_priority": "Use RegexExtractionStrategy for simple patterns, JsonCssExtractionStrategy for structured data",
        "accuracy_priority": "Use LLMExtractionStrategy for complex content, JsonCssExtractionStrategy for predictable structure",
        "cost_priority": "Avoid LLM strategies; use schema generation once, then JsonCssExtractionStrategy",
        "scale_priority": "Cache schemas, use regex for simple patterns, avoid LLM for high-volume extraction"
    }

    return recommendations.get(use_case, "Combine strategies based on content complexity")

# Error handling and validation
async def robust_extraction():
    strategies = [
        RegexExtractionStrategy(pattern=RegexExtractionStrategy.Email),
        JsonCssExtractionStrategy(simple_schema),  # schema defined in the CSS section above
        # LLM as fallback for complex cases
    ]

    async with AsyncWebCrawler() as crawler:
        for strategy in strategies:
            try:
                config = CrawlerRunConfig(extraction_strategy=strategy)
                result = await crawler.arun(url="https://example.com", config=config)

                if result.success and result.extracted_content:
                    data = json.loads(result.extracted_content)
                    if data:  # Validate non-empty results
                        print(f"Success with {strategy.__class__.__name__}")
                        return data

            except Exception as e:
                print(f"Strategy {strategy.__class__.__name__} failed: {e}")
                continue

    print("All strategies failed")
    return None
```
**📖 Learn more:** [LLM Strategies Deep Dive](https://docs.crawl4ai.com/extraction/llm-strategies/), [Schema-Based Extraction](https://docs.crawl4ai.com/extraction/no-llm-strategies/), [Regex Patterns](https://docs.crawl4ai.com/extraction/no-llm-strategies/#regexextractionstrategy), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/)