feat: add Script Builder to Chrome Extension and reorganize LLM context files

This commit introduces significant enhancements to the Crawl4AI ecosystem:

  Chrome Extension - Script Builder (Alpha):
  - Add recording functionality to capture user interactions (clicks, typing, scrolling)
  - Implement smart event grouping for cleaner script generation
  - Support export to both JavaScript and C4A script formats
  - Add timeline view for visualizing and editing recorded actions
  - Include wait commands (time-based and element-based)
  - Add saved flows functionality for reusing automation scripts
  - Update UI with consistent dark terminal theme (Dank Mono font, green/pink accents)
  - Release new extension versions: v1.1.0, v1.2.0, v1.2.1

  LLM Context Builder Improvements:
  - Reorganize context files from llmtxt/ to llm.txt/ with better structure
  - Separate diagram templates from text content (diagrams/ and txt/ subdirectories)
  - Add comprehensive context files for all major Crawl4AI components
  - Improve file naming convention for better discoverability

  Documentation Updates:
  - Update apps index page to match main documentation theme
  - Standardize color scheme: "Available" tags use primary color (#50ffff)
  - Change "Coming Soon" tags to dark gray for better visual hierarchy
  - Add interactive two-column layout for extension landing page
  - Include code examples for both Schema Builder and Script Builder features

  Technical Improvements:
  - Enhance event capture mechanism with better element selection
  - Add support for contenteditable elements and complex form interactions
  - Implement proper scroll event handling for both window and element scrolling
  - Add meta key support for keyboard shortcuts
  - Improve selector generation for more reliable element targeting

  The Script Builder is released as Alpha, acknowledging potential bugs while providing
  early access to this powerful automation recording feature.
This commit is contained in:
UncleCode
2025-06-08 22:02:12 +08:00
parent 926592649e
commit 40640badad
72 changed files with 28600 additions and 100986 deletions

View File

@@ -0,0 +1,788 @@
## Extraction Strategies
Powerful data extraction from web pages using LLM-based intelligent parsing or fast schema/pattern-based approaches.
### LLM-Based Extraction - Intelligent Content Understanding
```python
import os
import asyncio
import json
from pydantic import BaseModel, Field
from typing import List
# CacheMode is required by the CacheMode.BYPASS setting used in the crawl config
from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
# Define structured data model
class Product(BaseModel):
    """One product record; every field is required and carries a description."""

    name: str = Field(..., description="Product name")
    price: str = Field(..., description="Product price")
    description: str = Field(..., description="Product description")
    features: List[str] = Field(..., description="List of product features")
    rating: float = Field(..., description="Product rating out of 5")
# Configure LLM provider
llm_config = LLMConfig(
    provider="openai/gpt-4o-mini",  # or "ollama/llama3.3", "anthropic/claude-3-5-sonnet"
    api_token=os.getenv("OPENAI_API_KEY"),  # or "env:OPENAI_API_KEY"
    temperature=0.1,
    max_tokens=2000
)
# Create LLM extraction strategy
# The JSON schema generated from the Product model is passed as the target structure.
# NOTE(review): temperature and max_tokens are set both on llm_config (0.1 / 2000)
# and in extra_args (0.0 / 800) — confirm which one takes precedence.
llm_strategy = LLMExtractionStrategy(
    llm_config=llm_config,
    schema=Product.model_json_schema(),
    extraction_type="schema",  # or "block" for freeform text
    instruction="""
Extract product information from the webpage content.
Focus on finding complete product details including:
- Product name and price
- Detailed description
- All listed features
- Customer rating if available
Return valid JSON array of products.
""",
    chunk_token_threshold=1200,  # Split content if too large
    overlap_rate=0.1,  # 10% overlap between chunks
    apply_chunking=True,  # Enable automatic chunking
    input_format="markdown",  # "html", "fit_markdown", or "markdown"
    extra_args={"temperature": 0.0, "max_tokens": 800},
    verbose=True
)
async def extract_with_llm():
    """Crawl the example products page with ``llm_strategy`` and print the outcome.

    On success, prints the number of extracted products, the first three of
    them, and the strategy's usage report; otherwise prints the error message.
    """
    run_config = CrawlerRunConfig(
        extraction_strategy=llm_strategy,
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=10,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun(url="https://example.com/products", config=run_config)

        if not result.success:
            print(f"Extraction failed: {result.error_message}")
            return

        # Parse extracted JSON
        products = json.loads(result.extracted_content)
        print(f"Extracted {len(products)} products")
        for item in products[:3]:  # Show first 3
            print(f"Product: {item['name']}")
            print(f"Price: {item['price']}")
            print(f"Rating: {item.get('rating', 'N/A')}")
        # Show token usage and cost
        llm_strategy.show_usage()


asyncio.run(extract_with_llm())
```
### LLM Strategy Advanced Configuration
```python
# Multiple provider configurations
# Lookup table of LLMConfig objects keyed by a short provider name, so a
# strategy can pick one with providers["openai"], providers["anthropic"], etc.
# NOTE(review): the "env:NAME" token form presumably reads the key from the
# environment variable NAME — confirm against the LLMConfig documentation.
providers = {
    "openai": LLMConfig(
        provider="openai/gpt-4o",
        api_token="env:OPENAI_API_KEY",
        temperature=0.1
    ),
    "anthropic": LLMConfig(
        provider="anthropic/claude-3-5-sonnet-20240620",
        api_token="env:ANTHROPIC_API_KEY",
        max_tokens=4000
    ),
    "ollama": LLMConfig(
        provider="ollama/llama3.3",
        api_token=None,  # Not needed for Ollama
        base_url="http://localhost:11434"
    ),
    "groq": LLMConfig(
        provider="groq/llama3-70b-8192",
        api_token="env:GROQ_API_KEY"
    )
}
# Advanced chunking for large content
# NOTE: YourModel is a placeholder that is not defined in this snippet —
# substitute your own pydantic model class before running.
large_content_strategy = LLMExtractionStrategy(
    llm_config=providers["openai"],
    schema=YourModel.model_json_schema(),
    extraction_type="schema",
    instruction="Extract detailed information...",
    # Chunking parameters
    chunk_token_threshold=2000,  # Larger chunks for complex content
    overlap_rate=0.15,  # More overlap for context preservation
    apply_chunking=True,
    # Input format selection
    input_format="fit_markdown",  # Use filtered content if available
    # LLM parameters
    extra_args={
        "temperature": 0.0,  # Deterministic output
        "top_p": 0.9,
        "frequency_penalty": 0.1,
        "presence_penalty": 0.1,
        "max_tokens": 1500
    },
    verbose=True
)
# Knowledge graph extraction
class Entity(BaseModel):
    """A single node of the knowledge graph."""

    name: str
    type: str  # "person", "organization", "location", etc.
    description: str


class Relationship(BaseModel):
    """A labelled link from one entity (source) to another (target)."""

    source: str
    target: str
    relationship: str
    confidence: float


class KnowledgeGraph(BaseModel):
    """Top-level extraction result: entities, their relationships and a summary."""

    entities: List[Entity]
    relationships: List[Relationship]
    summary: str
# Strategy that asks the "anthropic" provider to fill the KnowledgeGraph schema
# from raw HTML input, chunked at 1500 tokens.
knowledge_strategy = LLMExtractionStrategy(
    llm_config=providers["anthropic"],
    schema=KnowledgeGraph.model_json_schema(),
    extraction_type="schema",
    instruction="""
Create a knowledge graph from the content by:
1. Identifying key entities (people, organizations, locations, concepts)
2. Finding relationships between entities
3. Providing confidence scores for relationships
4. Summarizing the main topics
""",
    input_format="html",  # Use HTML for better structure preservation
    apply_chunking=True,
    chunk_token_threshold=1500
)
```
### JSON CSS Extraction - Fast Schema-Based Extraction
```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
# Basic CSS extraction schema
# One record per element matching baseSelector; each field is looked up by
# its own selector and read as text or as the named attribute.
simple_schema = {
    "name": "Product Listings",
    "baseSelector": "div.product-card",
    "fields": [
        {"name": "title", "selector": "h2.product-title", "type": "text"},
        {"name": "price", "selector": ".price", "type": "text"},
        {"name": "image_url", "selector": "img.product-image", "type": "attribute", "attribute": "src"},
        {"name": "product_url", "selector": "a.product-link", "type": "attribute", "attribute": "href"},
    ],
}
# Complex nested schema with multiple data types
complex_schema = {
    "name": "E-commerce Product Catalog",
    "baseSelector": "div.category",
    # baseFields carry no selector of their own, only an attribute name
    "baseFields": [
        {"name": "category_id", "type": "attribute", "attribute": "data-category-id"},
        {"name": "category_url", "type": "attribute", "attribute": "data-url"},
    ],
    "fields": [
        {"name": "category_name", "selector": "h2.category-title", "type": "text"},
        {
            "name": "products",
            "selector": "div.product",
            "type": "nested_list",  # Array of complex objects
            "fields": [
                {"name": "name", "selector": "h3.product-name", "type": "text", "default": "Unknown Product"},
                {"name": "price", "selector": "span.price", "type": "text"},
                {
                    "name": "details",
                    "selector": "div.product-details",
                    "type": "nested",  # Single complex object
                    "fields": [
                        {"name": "brand", "selector": "span.brand", "type": "text"},
                        {"name": "model", "selector": "span.model", "type": "text"},
                        # Preserve HTML structure
                        {"name": "specs", "selector": "div.specifications", "type": "html"},
                    ],
                },
                {
                    "name": "features",
                    "selector": "ul.features li",
                    "type": "list",  # Simple array of strings
                    "fields": [{"name": "feature", "type": "text"}],
                },
                {
                    "name": "reviews",
                    "selector": "div.review",
                    "type": "nested_list",
                    "fields": [
                        {"name": "reviewer", "selector": "span.reviewer-name", "type": "text"},
                        {"name": "rating", "selector": "span.rating", "type": "attribute", "attribute": "data-rating"},
                        {"name": "comment", "selector": "p.review-text", "type": "text"},
                        {"name": "date", "selector": "time.review-date", "type": "attribute", "attribute": "datetime"},
                    ],
                },
            ],
        },
    ],
}
async def extract_with_css_schema():
    """Crawl the example catalog with ``complex_schema`` and print a summary per category."""
    run_config = CrawlerRunConfig(
        extraction_strategy=JsonCssExtractionStrategy(complex_schema, verbose=True),
        cache_mode=CacheMode.BYPASS,
        # Enable dynamic content loading if needed
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="css:.product:nth-child(10)",  # Wait for products to load
        process_iframes=True,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com/catalog", config=run_config)

    if not result.success:
        return

    categories = json.loads(result.extracted_content)
    print(f"Extracted {len(categories)} categories")
    for entry in categories:
        print(f"Category: {entry['category_name']}")
        listed = entry.get('products', [])
        print(f"Products: {len(listed)}")
        if not listed:
            continue
        # Show first product details
        first = listed[0]
        print(f" First product: {first.get('name')}")
        print(f" Features: {len(first.get('features', []))}")
        print(f" Reviews: {len(first.get('reviews', []))}")


asyncio.run(extract_with_css_schema())
```
### Automatic Schema Generation - One-Time LLM, Unlimited Use
```python
import json
import asyncio
from pathlib import Path
# CacheMode is required by the CacheMode.BYPASS settings used in this example
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
async def generate_and_use_schema():
    """
    Generate an extraction schema with an LLM once, cache it, and reuse it.

    1. Use LLM once to generate schema from sample HTML
    2. Cache the schema for reuse
    3. Use cached schema for fast extraction without LLM calls

    Raises:
        RuntimeError: if the sample page needed for schema generation could
            not be crawled.
    """
    cache_dir = Path("./schema_cache")
    cache_dir.mkdir(exist_ok=True)
    schema_file = cache_dir / "ecommerce_schema.json"

    # Step 1: Generate or load cached schema
    if schema_file.exists():
        # read_text opens and closes the file; a bare open() would leak the handle
        schema = json.loads(schema_file.read_text(encoding="utf-8"))
        print("Using cached schema")
    else:
        print("Generating schema using LLM...")
        # Configure LLM for schema generation
        llm_config = LLMConfig(
            provider="openai/gpt-4o",  # or "ollama/llama3.3" for local
            api_token="env:OPENAI_API_KEY"
        )
        # Get sample HTML from target site
        async with AsyncWebCrawler() as crawler:
            sample_result = await crawler.arun(
                url="https://example.com/products",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
        # Check success before touching the HTML, like the extraction crawls do
        if not sample_result.success:
            raise RuntimeError(
                f"Could not fetch sample HTML: {sample_result.error_message}"
            )
        sample_html = sample_result.cleaned_html[:5000]  # Use first 5k chars

        # Generate schema automatically (ONE-TIME LLM COST)
        schema = JsonCssExtractionStrategy.generate_schema(
            html=sample_html,
            schema_type="css",
            llm_config=llm_config,
            instruction="Extract product information including name, price, description, and features"
        )
        # Cache schema for future use (NO MORE LLM CALLS)
        schema_file.write_text(json.dumps(schema, indent=2), encoding="utf-8")
        print("Schema generated and cached")

    # Step 2: Use schema for fast extraction (NO LLM CALLS)
    strategy = JsonCssExtractionStrategy(schema, verbose=True)
    config = CrawlerRunConfig(
        extraction_strategy=strategy,
        cache_mode=CacheMode.BYPASS
    )

    # Step 3: Extract from multiple pages using same schema
    urls = [
        "https://example.com/products",
        "https://example.com/electronics",
        "https://example.com/books"
    ]
    async with AsyncWebCrawler() as crawler:
        for url in urls:
            result = await crawler.arun(url=url, config=config)
            if result.success:
                data = json.loads(result.extracted_content)
                print(f"{url}: Extracted {len(data)} items")
            else:
                print(f"{url}: Failed - {result.error_message}")


asyncio.run(generate_and_use_schema())
```
### XPath Extraction Strategy
```python
# LLMConfig is needed by generate_xpath_schema()
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
# XPath-based schema (alternative to CSS)
# Every selector targets an element node; the field "type" then decides what
# is read from it (text, inner HTML, or a named attribute). Selectors ending
# in /text() or /@attr return plain strings instead of elements, so text and
# attribute values are requested through "type"/"attribute" instead.
xpath_schema = {
    "name": "News Articles",
    "baseSelector": "//article[@class='news-item']",
    "baseFields": [
        {"name": "article_id", "type": "attribute", "attribute": "data-id"},
    ],
    "fields": [
        {"name": "headline", "selector": ".//h2[@class='headline']", "type": "text"},
        {"name": "author", "selector": ".//span[@class='author']", "type": "text"},
        {
            "name": "publish_date",
            "selector": ".//time",
            "type": "attribute",
            "attribute": "datetime",
        },
        {"name": "content", "selector": ".//div[@class='article-body']", "type": "html"},
        {
            "name": "tags",
            "selector": ".//div[@class='tags']/span[@class='tag']",
            "type": "list",
            "fields": [{"name": "tag", "type": "text"}],
        },
    ],
}
# Generate XPath schema automatically
async def generate_xpath_schema():
    """Return an XPath schema generated from a sample article by the ollama/llama3.3 provider."""
    article_html = """
<article class="news-item" data-id="123">
<h2 class="headline">Breaking News</h2>
<span class="author">John Doe</span>
<time datetime="2024-01-01">Today</time>
<div class="article-body"><p>Content here...</p></div>
</article>
"""
    return JsonXPathExtractionStrategy.generate_schema(
        html=article_html,
        schema_type="xpath",
        llm_config=LLMConfig(provider="ollama/llama3.3", api_token=None),
    )


# Use XPath strategy
xpath_strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
```
### Regex Extraction Strategy - Pattern-Based Fast Extraction
```python
import asyncio
import json
from pathlib import Path
# The examples in this snippet also use the crawler, run config and LLM config
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy
# Built-in patterns for common data types
async def extract_with_builtin_patterns():
    """Run several built-in regex patterns on the contact page and print matches grouped by label."""
    # Use multiple built-in patterns
    combined = (
        RegexExtractionStrategy.Email
        | RegexExtractionStrategy.PhoneUS
        | RegexExtractionStrategy.Url
        | RegexExtractionStrategy.Currency
        | RegexExtractionStrategy.DateIso
    )
    run_config = CrawlerRunConfig(
        extraction_strategy=RegexExtractionStrategy(pattern=combined)
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com/contact", config=run_config)

    if not result.success:
        return

    # Group by pattern type
    grouped = {}
    for hit in json.loads(result.extracted_content):
        grouped.setdefault(hit['label'], []).append(hit['value'])

    for pattern_type, values in grouped.items():
        print(f"{pattern_type}: {len(values)} matches")
        for value in values[:3]:  # Show first 3
            print(f" {value}")
# Custom regex patterns
custom_patterns = dict(
    # "SKU-" followed by four to six digits
    product_code=r"SKU-\d{4,6}",
    # one or two digits, "%", optional whitespace, then "off"
    discount=r"\d{1,2}%\s*off",
    # captures the upper-case/digit/hyphen code after "Model:"
    model_number=r"Model:\s*([A-Z0-9-]+)",
)
async def extract_with_custom_patterns():
    """Apply ``custom_patterns`` to the products page and print each label/value pair."""
    run_config = CrawlerRunConfig(
        extraction_strategy=RegexExtractionStrategy(custom=custom_patterns)
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com/products", config=run_config)

    if not result.success:
        return

    for hit in json.loads(result.extracted_content):
        print(f"{hit['label']}: {hit['value']}")
# LLM-generated patterns (one-time cost)
async def generate_custom_patterns():
    """Return a RegexExtractionStrategy built from LLM-generated, disk-cached patterns.

    The patterns are loaded from ``./patterns/price_patterns.json`` when that
    file exists; otherwise they are generated from the pricing page's cleaned
    HTML and written to the cache file.

    Raises:
        RuntimeError: if the sample pricing page could not be crawled.
    """
    cache_file = Path("./patterns/price_patterns.json")
    if cache_file.exists():
        # read_text opens and closes the file; a bare open() would leak the handle
        patterns = json.loads(cache_file.read_text(encoding="utf-8"))
    else:
        llm_config = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token="env:OPENAI_API_KEY"
        )
        # Get sample content
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://example.com/pricing")
        if not result.success:
            raise RuntimeError(
                f"Could not fetch sample HTML: {result.error_message}"
            )
        sample_html = result.cleaned_html

        # Generate optimized patterns
        patterns = RegexExtractionStrategy.generate_pattern(
            label="pricing_info",
            html=sample_html,
            query="Extract all pricing information including discounts and special offers",
            llm_config=llm_config
        )
        # Cache for reuse
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        cache_file.write_text(json.dumps(patterns, indent=2), encoding="utf-8")

    # Use cached patterns (no more LLM calls)
    strategy = RegexExtractionStrategy(custom=patterns)
    return strategy


asyncio.run(extract_with_builtin_patterns())
asyncio.run(extract_with_custom_patterns())
```
### Complete Extraction Workflow - Combining Strategies
```python
import asyncio
import json
# LLMConfig is needed to configure the LLM strategies in this snippet
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    RegexExtractionStrategy,
    LLMExtractionStrategy
)
def _load_extracted(result, default):
    """Return the parsed JSON payload of *result*, or *default* if the crawl failed or extracted nothing."""
    if not (result.success and result.extracted_content):
        return default
    return json.loads(result.extracted_content)


async def multi_strategy_extraction():
    """
    Demonstrate using multiple extraction strategies in sequence:
    1. Fast regex for common patterns
    2. Schema-based for structured data
    3. LLM for complex reasoning

    Returns:
        dict with the keys "contacts" (regex matches), "products"
        (schema-extracted records) and "analysis" (LLM output as a dict).
    """
    browser_config = BrowserConfig(headless=True)

    # Strategy 1: Fast regex extraction
    regex_strategy = RegexExtractionStrategy(
        pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS
    )

    # Strategy 2: Schema-based structured extraction
    product_schema = {
        "name": "Products",
        "baseSelector": "div.product",
        "fields": [
            {"name": "name", "selector": "h3", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
            {"name": "rating", "selector": ".rating", "type": "attribute", "attribute": "data-rating"}
        ]
    }
    css_strategy = JsonCssExtractionStrategy(product_schema)

    # Strategy 3: LLM for complex analysis
    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"),
        schema={
            "type": "object",
            "properties": {
                "sentiment": {"type": "string"},
                "key_topics": {"type": "array", "items": {"type": "string"}},
                "summary": {"type": "string"}
            }
        },
        extraction_type="schema",
        instruction="Analyze the content sentiment, extract key topics, and provide a summary"
    )

    url = "https://example.com/product-reviews"
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Extract contact info with regex
        regex_run = CrawlerRunConfig(extraction_strategy=regex_strategy)
        regex_result = await crawler.arun(url=url, config=regex_run)
        # Extract structured product data
        css_run = CrawlerRunConfig(extraction_strategy=css_strategy)
        css_result = await crawler.arun(url=url, config=css_run)
        # Extract insights with LLM (a run config, not an LLMConfig)
        llm_run = CrawlerRunConfig(extraction_strategy=llm_strategy)
        llm_result = await crawler.arun(url=url, config=llm_run)

    analysis = _load_extracted(llm_result, {})
    # The LLM payload may be a list of extracted blocks rather than a single
    # object; use the first dict so the .get() lookup below cannot fail.
    if isinstance(analysis, list):
        analysis = next((block for block in analysis if isinstance(block, dict)), {})

    # Combine results
    results = {
        "contacts": _load_extracted(regex_result, []),
        "products": _load_extracted(css_result, []),
        "analysis": analysis
    }
    print(f"Found {len(results['contacts'])} contact entries")
    print(f"Found {len(results['products'])} products")
    print(f"Sentiment: {results['analysis'].get('sentiment', 'N/A')}")
    return results
# Performance comparison
async def compare_extraction_performance():
    """Time each strategy on the same URL and print item count and duration.

    Only elapsed time and the number of extracted items are reported; the
    correctness of the extracted data is not evaluated.
    """
    import time

    url = "https://example.com/large-catalog"
    strategies = {
        "regex": RegexExtractionStrategy(pattern=RegexExtractionStrategy.Currency),
        "css": JsonCssExtractionStrategy({
            "name": "Prices",
            "baseSelector": ".price",
            "fields": [{"name": "amount", "selector": "span", "type": "text"}]
        }),
        "llm": LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"),
            instruction="Extract all prices from the content",
            extraction_type="block"
        )
    }
    async with AsyncWebCrawler() as crawler:
        for name, strategy in strategies.items():
            # perf_counter is monotonic, so system clock adjustments cannot skew the interval
            start_time = time.perf_counter()
            config = CrawlerRunConfig(extraction_strategy=strategy)
            result = await crawler.arun(url=url, config=config)
            duration = time.perf_counter() - start_time
            if result.success:
                # A successful crawl with nothing extracted counts as zero items
                data = json.loads(result.extracted_content) if result.extracted_content else []
                print(f"{name}: {len(data)} items in {duration:.2f}s")
            else:
                print(f"{name}: Failed in {duration:.2f}s")


asyncio.run(multi_strategy_extraction())
asyncio.run(compare_extraction_performance())
```
### Best Practices and Strategy Selection
```python
# Strategy selection guide
def choose_extraction_strategy(use_case):
    """
    Guide for selecting the right extraction strategy

    Typical pairings of task and strategy:
      - contact_info: RegexExtractionStrategy with the Email | PhoneUS
        patterns (fast pattern matching for common data types)
      - product_catalogs: JsonCssExtractionStrategy (structured data from
        consistent HTML)
      - content_analysis: LLMExtractionStrategy (complex reasoning and
        semantic understanding)
      - complete_site_analysis: a multi-strategy combination

    Args:
        use_case: one of "speed_priority", "accuracy_priority",
            "cost_priority" or "scale_priority".

    Returns:
        The recommendation text for ``use_case``, or a generic fallback
        recommendation when the key is not recognized.
    """
    recommendations = {
        "speed_priority": "Use RegexExtractionStrategy for simple patterns, JsonCssExtractionStrategy for structured data",
        "accuracy_priority": "Use LLMExtractionStrategy for complex content, JsonCssExtractionStrategy for predictable structure",
        "cost_priority": "Avoid LLM strategies, use schema generation once then JsonCssExtractionStrategy",
        "scale_priority": "Cache schemas, use regex for simple patterns, avoid LLM for high-volume extraction"
    }
    return recommendations.get(use_case, "Combine strategies based on content complexity")
# Error handling and validation
async def robust_extraction():
    """Try each strategy in order and return the first non-empty parsed result, else None."""
    fallback_chain = [
        RegexExtractionStrategy(pattern=RegexExtractionStrategy.Email),
        JsonCssExtractionStrategy(simple_schema),
        # LLM as fallback for complex cases
    ]
    async with AsyncWebCrawler() as crawler:
        for strategy in fallback_chain:
            strategy_name = strategy.__class__.__name__
            try:
                run_config = CrawlerRunConfig(extraction_strategy=strategy)
                result = await crawler.arun(url="https://example.com", config=run_config)
                if not (result.success and result.extracted_content):
                    continue
                data = json.loads(result.extracted_content)
                if data:  # Validate non-empty results
                    print(f"Success with {strategy_name}")
                    return data
            except Exception as e:
                print(f"Strategy {strategy_name} failed: {e}")
                continue
    print("All strategies failed")
    return None
```
**📖 Learn more:** [LLM Strategies Deep Dive](https://docs.crawl4ai.com/extraction/llm-strategies/), [Schema-Based Extraction](https://docs.crawl4ai.com/extraction/no-llm-strategies/), [Regex Patterns](https://docs.crawl4ai.com/extraction/no-llm-strategies/#regexextractionstrategy), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/)