"""
|
|
🚀 Crawl4AI v0.7.0 Release Demo
|
|
================================
|
|
This demo showcases all major features introduced in v0.7.0 release.
|
|
|
|
Major Features:
|
|
1. ✅ Adaptive Crawling - Intelligent crawling with confidence tracking
|
|
2. ✅ Virtual Scroll Support - Handle infinite scroll pages
|
|
3. ✅ Link Preview - Advanced link analysis with 3-layer scoring
|
|
4. ✅ URL Seeder - Smart URL discovery and filtering
|
|
5. ✅ C4A Script - Domain-specific language for web automation
|
|
6. ✅ Chrome Extension Updates - Click2Crawl and instant schema extraction
|
|
7. ✅ PDF Parsing Support - Extract content from PDF documents
|
|
8. ✅ Nightly Builds - Automated nightly releases
|
|
|
|
Run this demo to see all features in action!
|
|
"""
|
|
|
|
import asyncio
import json
import time

from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich import box

from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
    BrowserConfig,
    CacheMode,
    AdaptiveCrawler,
    AdaptiveConfig,
    AsyncUrlSeeder,
    SeedingConfig,
    c4a_compile,
    CompilationResult,
)
from crawl4ai.async_configs import VirtualScrollConfig, LinkPreviewConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

console = Console()


def print_section(title: str, description: str = ""):
    """Print a section header"""
    console.print(f"\n[bold cyan]{'=' * 60}[/bold cyan]")
    console.print(f"[bold yellow]{title}[/bold yellow]")
    if description:
        console.print(f"[dim]{description}[/dim]")
    console.print(f"[bold cyan]{'=' * 60}[/bold cyan]\n")


async def demo_1_adaptive_crawling():
    """Demo 1: Adaptive Crawling - Intelligent content extraction"""
    print_section(
        "Demo 1: Adaptive Crawling",
        "Intelligently learns and adapts to website patterns"
    )

    # Create an adaptive crawler with a custom configuration
    config = AdaptiveConfig(
        strategy="statistical",  # or "embedding"
        confidence_threshold=0.7,
        max_pages=10,
        top_k_links=3,
        min_gain_threshold=0.1
    )

    # Example: Learn from a documentation site
    console.print("[cyan]Learning from documentation page patterns...[/cyan]")

    async with AsyncWebCrawler() as crawler:
        adaptive = AdaptiveCrawler(crawler, config)

        # Start the adaptive crawl
        console.print("[cyan]Starting adaptive crawl...[/cyan]")
        result = await adaptive.digest(
            start_url="https://docs.python.org/3/",
            query="python decorators tutorial"
        )

        console.print("[green]✓ Adaptive crawl completed[/green]")
        console.print(f"  - Confidence Level: {adaptive.confidence:.0%}")
        console.print(f"  - Pages Crawled: {len(result.crawled_urls)}")
        console.print(f"  - Knowledge Base: {len(adaptive.state.knowledge_base)} documents")

        # Get the most relevant content
        relevant = adaptive.get_relevant_content(top_k=3)
        if relevant:
            console.print("\nMost relevant pages:")
            for i, page in enumerate(relevant, 1):
                console.print(f"  {i}. {page['url']} (relevance: {page['score']:.2%})")


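# NOTE: A rough sketch of the stopping logic digest() appears to apply,
# inferred from the AdaptiveConfig parameters above (illustrative pseudocode
# only, not the library's actual implementation):
#
#     while adaptive.confidence < confidence_threshold and pages < max_pages:
#         follow the top_k_links most promising links and score the new pages
#         if information gain < min_gain_threshold:
#             break  # diminishing returns - stop early
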
async def demo_2_virtual_scroll():
    """Demo 2: Virtual Scroll - Handle infinite scroll pages"""
    print_section(
        "Demo 2: Virtual Scroll Support",
        "Capture content from modern infinite scroll pages"
    )

    # Configure virtual scroll - using body as the container for example.com
    scroll_config = VirtualScrollConfig(
        container_selector="body",     # using body since example.com has a simple structure
        scroll_count=3,                # just 3 scrolls for the demo
        scroll_by="container_height",  # or "page_height" or a pixel value
        wait_after_scroll=0.5          # wait 500ms after each scroll
    )

    config = CrawlerRunConfig(
        virtual_scroll_config=scroll_config,
        cache_mode=CacheMode.BYPASS,
        wait_until="networkidle"
    )

    console.print("[cyan]Virtual Scroll Configuration:[/cyan]")
    console.print(f"  - Container: {scroll_config.container_selector}")
    console.print(f"  - Scroll count: {scroll_config.scroll_count}")
    console.print(f"  - Scroll by: {scroll_config.scroll_by}")
    console.print(f"  - Wait after scroll: {scroll_config.wait_after_scroll}s")

    console.print("\n[dim]Note: Using example.com for the demo - in production, use this[/dim]")
    console.print("[dim]with actual infinite scroll pages like social media feeds.[/dim]\n")

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://example.com",
            config=config
        )

        if result.success:
            console.print("[green]✓ Virtual scroll executed successfully![/green]")
            console.print(f"  - Content length: {len(result.markdown)} chars")

    # Show how to use this with real infinite scroll sites
    console.print("\n[yellow]Example for real infinite scroll sites:[/yellow]")
    console.print("""
    # For Twitter-like feeds:
    scroll_config = VirtualScrollConfig(
        container_selector="[data-testid='primaryColumn']",
        scroll_count=20,
        scroll_by="container_height",
        wait_after_scroll=1.0
    )

    # For Instagram-like grids:
    scroll_config = VirtualScrollConfig(
        container_selector="main article",
        scroll_count=15,
        scroll_by=1000,  # fixed pixel amount
        wait_after_scroll=1.5
    )""")


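# NOTE: Virtual scroll is aimed at virtualized lists (feeds that recycle DOM
# nodes as you scroll), not merely long pages: the point is that content is
# captured after each scroll step, so items removed from the DOM by the
# virtualizer are not lost. This reading is inferred from the demo text above;
# consult the official docs for the exact capture/merge semantics.
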
async def demo_3_link_preview():
    """Demo 3: Link Preview with 3-layer scoring"""
    print_section(
        "Demo 3: Link Preview & Scoring",
        "Advanced link analysis with intrinsic, contextual, and total scoring"
    )

    # Configure link preview
    link_config = LinkPreviewConfig(
        include_internal=True,
        include_external=False,
        max_links=10,
        concurrency=5,
        query="python tutorial",  # for contextual scoring
        score_threshold=0.3,
        verbose=True
    )

    config = CrawlerRunConfig(
        link_preview_config=link_config,
        score_links=True,  # enable intrinsic scoring
        cache_mode=CacheMode.BYPASS
    )

    console.print("[cyan]Analyzing links with the 3-layer scoring system...[/cyan]")

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://docs.python.org/3/", config=config)

        if result.success and result.links:
            # Keep only links that received a total score (0.0 is a valid score)
            internal_links = result.links.get("internal", [])
            scored_links = [link for link in internal_links if link.get("total_score") is not None]
            scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)

            # Build a scoring table
            table = Table(title="Link Scoring Results", box=box.ROUNDED)
            table.add_column("Link Text", style="cyan", width=40)
            table.add_column("Intrinsic Score", justify="center")
            table.add_column("Contextual Score", justify="center")
            table.add_column("Total Score", justify="center", style="bold green")

            for link in scored_links[:5]:
                text = link.get("text", "No text")[:40]
                table.add_row(
                    text,
                    f"{link.get('intrinsic_score', 0):.1f}/10",
                    f"{link.get('contextual_score', 0):.2f}/1",
                    f"{link.get('total_score', 0):.3f}"
                )

            console.print(table)


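# NOTE: The three layers, as reflected in the table columns above: the
# intrinsic score (0-10) appears to rate a link from its own signals (URL
# structure, anchor text), the contextual score (0-1) rates relevance of the
# fetched link preview against the configured query, and the total score
# combines the two for ranking. The exact weighting is an implementation
# detail of the library.
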
async def demo_4_url_seeder():
    """Demo 4: URL Seeder - Smart URL discovery"""
    print_section(
        "Demo 4: URL Seeder",
        "Intelligent URL discovery and filtering"
    )

    # Configure seeding
    seeding_config = SeedingConfig(
        source="cc+sitemap",   # Common Crawl + sitemap, or "crawl"
        pattern="*tutorial*",  # URL pattern filter
        max_urls=50,
        extract_head=True,     # fetch <head> metadata
        query="python programming",  # for relevance scoring
        scoring_method="bm25",
        score_threshold=0.2,
        force=True
    )

    console.print("[cyan]URL Seeder Configuration:[/cyan]")
    console.print(f"  - Source: {seeding_config.source}")
    console.print(f"  - Pattern: {seeding_config.pattern}")
    console.print(f"  - Max URLs: {seeding_config.max_urls}")
    console.print(f"  - Query: {seeding_config.query}")
    console.print(f"  - Scoring: {seeding_config.scoring_method}")

    # Use the URL seeder to discover URLs
    async with AsyncUrlSeeder() as seeder:
        console.print("\n[cyan]Discovering URLs from the Python docs...[/cyan]")
        urls = await seeder.urls("docs.python.org", seeding_config)

        console.print(f"\n[green]✓ Discovered {len(urls)} URLs[/green]")
        for i, url_info in enumerate(urls[:5], 1):
            console.print(f"  {i}. {url_info['url']}")
            if url_info.get("relevance_score"):
                console.print(f"     Relevance: {url_info['relevance_score']:.3f}")


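# NOTE: A typical follow-up (sketch only): feed the discovered URLs into a
# batch crawl via arun_many(). The dict shape of each url_info matches what
# the demo prints above; treat this as illustrative rather than verified:
#
#     async with AsyncWebCrawler() as crawler:
#         results = await crawler.arun_many(
#             [u["url"] for u in urls],
#             config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED),
#         )
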
async def demo_5_c4a_script():
    """Demo 5: C4A Script - Domain-specific language"""
    print_section(
        "Demo 5: C4A Script Language",
        "Domain-specific language for web automation"
    )

    # Example C4A script
    c4a_script = """
    # Simple C4A script example
    WAIT `body` 3
    IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
    CLICK `.search-button`
    TYPE "python tutorial"
    PRESS Enter
    WAIT `.results` 5
    """

    console.print("[cyan]C4A Script Example:[/cyan]")
    console.print(Panel(c4a_script, title="script.c4a", border_style="blue"))

    # Compile the script
    compilation_result = c4a_compile(c4a_script)

    if compilation_result.success:
        console.print("[green]✓ Script compiled successfully![/green]")
        console.print(f"  - Generated {len(compilation_result.js_code)} JavaScript statements")
        console.print("\nFirst 3 JS statements:")
        for stmt in compilation_result.js_code[:3]:
            console.print(f"  • {stmt}")
    else:
        console.print("[red]✗ Script compilation failed[/red]")
        if compilation_result.first_error:
            error = compilation_result.first_error
            console.print(f"  Error at line {error.line}: {error.message}")


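# NOTE: A natural next step (sketch only): the compiled JavaScript can drive a
# real crawl, since CrawlerRunConfig accepts js_code. Something like the
# following should work - illustrative, not verified against this release:
#
#     config = CrawlerRunConfig(js_code=compilation_result.js_code)
#     result = await crawler.arun("https://example.com", config=config)
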
async def demo_6_css_extraction():
    """Demo 6: Enhanced CSS/JSON extraction"""
    print_section(
        "Demo 6: Enhanced Extraction",
        "Improved CSS selector and JSON extraction"
    )

    # Define the extraction schema
    schema = {
        "name": "Example Page Data",
        "baseSelector": "body",
        "fields": [
            {
                "name": "title",
                "selector": "h1",
                "type": "text"
            },
            {
                "name": "paragraphs",
                "selector": "p",
                "type": "list",
                "fields": [
                    {"name": "text", "type": "text"}
                ]
            }
        ]
    }

    extraction_strategy = JsonCssExtractionStrategy(schema)

    console.print("[cyan]Extraction Schema:[/cyan]")
    console.print(json.dumps(schema, indent=2))

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://example.com",
            config=CrawlerRunConfig(
                extraction_strategy=extraction_strategy,
                cache_mode=CacheMode.BYPASS
            )
        )

        if result.success and result.extracted_content:
            console.print("\n[green]✓ Content extracted successfully![/green]")
            console.print(f"Extracted: {json.dumps(json.loads(result.extracted_content), indent=2)[:200]}...")


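# NOTE: How the schema above maps to output: baseSelector scopes extraction to
# one root element per match, each "text" field yields a string, and a "list"
# field yields one object per element matching its selector - so this schema
# should produce something like:
#
#     [{"title": "...", "paragraphs": [{"text": "..."}, ...]}]
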
async def demo_7_performance_improvements():
    """Demo 7: Performance improvements"""
    print_section(
        "Demo 7: Performance Improvements",
        "Faster crawling with better resource management"
    )

    # Performance-optimized configuration
    config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,   # use caching
        wait_until="domcontentloaded",  # faster than networkidle
        page_timeout=10000,             # 10-second timeout
        exclude_external_links=True,
        exclude_social_media_links=True,
        exclude_external_images=True
    )

    console.print("[cyan]Performance Configuration:[/cyan]")
    console.print("  - Cache: ENABLED")
    console.print("  - Wait: domcontentloaded (faster)")
    console.print("  - Timeout: 10s")
    console.print("  - Excluding: external links, images, social media")

    # Measure the crawl time
    start_time = time.time()

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)

    elapsed = time.time() - start_time

    if result.success:
        console.print(f"\n[green]✓ Page crawled in {elapsed:.2f} seconds[/green]")


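# NOTE: With CacheMode.ENABLED, a repeat crawl of the same URL should be
# served from the local cache - running demo 7 twice and comparing the two
# elapsed times is an easy way to see the effect (assuming the default cache
# location is writable).
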
async def main():
    """Run all demos"""
    console.print(Panel(
        "[bold cyan]Crawl4AI v0.7.0 Release Demo[/bold cyan]\n\n"
        "This demo showcases the major features introduced in v0.7.0.\n"
        "Each demo is self-contained and demonstrates a specific feature.",
        title="Welcome",
        border_style="blue"
    ))

    demos = [
        demo_1_adaptive_crawling,
        demo_2_virtual_scroll,
        demo_3_link_preview,
        demo_4_url_seeder,
        demo_5_c4a_script,
        demo_6_css_extraction,
        demo_7_performance_improvements
    ]

    for i, demo in enumerate(demos, 1):
        try:
            await demo()
            if i < len(demos):
                console.print("\n[dim]Press Enter to continue to the next demo...[/dim]")
                input()
        except Exception as e:
            console.print(f"[red]Error in demo: {e}[/red]")
            continue

    console.print(Panel(
        "[bold green]Demo Complete![/bold green]\n\n"
        "Thank you for trying Crawl4AI v0.7.0!\n"
        "For more examples and documentation, visit:\n"
        "https://github.com/unclecode/crawl4ai",
        title="Complete",
        border_style="green"
    ))


if __name__ == "__main__":
    asyncio.run(main())