The crawl4ai showcase script has been significantly expanded to include more detailed examples and demonstrations. This includes live code examples, more detailed explanations, and a new real-world example. A new file, uv.lock, has also been added.
1584 lines
62 KiB
Python
1584 lines
62 KiB
Python
"""
|
|
🚀 Crawl4AI v0.7.0 Feature Showcase
|
|
=====================================
|
|
This demo showcases the major features introduced in v0.7.0:
|
|
1. Link Preview/Peek - Advanced link analysis with 3-layer scoring
|
|
2. Adaptive Crawling - Intelligent crawling with confidence tracking
|
|
3. Virtual Scroll - Capture content from modern infinite scroll pages
|
|
4. C4A Script - Domain-specific language for web automation
|
|
5. URL Seeder - Smart URL discovery and filtering
|
|
6. LLM Context Builder - 3D context for AI assistants
|
|
|
|
Let's explore each feature with practical examples!
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import time
|
|
import re
|
|
from typing import List, Dict
|
|
from rich.console import Console
|
|
from rich.table import Table
|
|
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
from rich.panel import Panel
|
|
from rich.syntax import Syntax
|
|
from rich.layout import Layout
|
|
from rich.live import Live
|
|
from rich import box
|
|
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
|
|
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
|
from crawl4ai.async_configs import LinkPreviewConfig, VirtualScrollConfig
|
|
from crawl4ai import c4a_compile, CompilationResult
|
|
|
|
# Initialize Rich console for beautiful output
# Shared, module-level Console instance used by every demo function below
# for styled terminal rendering (tables, panels, progress spinners).
console = Console()
|
|
|
|
|
|
def print_banner(title: str, subtitle: str = ""):
    """Render a section banner on the shared console.

    Prints a cyan ruled line, the title centered in yellow, an optional
    centered subtitle in dim white, and a closing ruled line.
    """
    rule = '=' * 80
    console.print(f"\n[bold cyan]{rule}[/bold cyan]")
    console.print(f"[bold yellow]{title.center(80)}[/bold yellow]")
    if subtitle:
        console.print(f"[dim white]{subtitle.center(80)}[/dim white]")
    console.print(f"[bold cyan]{rule}[/bold cyan]\n")
|
|
|
|
|
|
def create_score_bar(score: float, max_score: float = 10.0) -> str:
    """Create a visual progress bar for scores.

    Returns a rich-markup string: a 20-cell bar (filled proportionally to
    ``score / max_score``), colored by how full it is, followed by the
    numeric "score/max_score" label.

    Fixes two issues in the original:
    - Color thresholds compared the raw score against 7/4 regardless of
      ``max_score``, so contextual scores on a 0-1 scale always rendered
      red even at 0.95. Thresholds now apply to the fill ratio (0.7/0.4),
      which is identical behavior for the default max_score=10.0.
    - The fill ratio is clamped to [0, 1] so out-of-range scores cannot
      produce a bar longer or shorter than 20 cells.
    """
    # Clamp to [0, 1]; guard against a zero max_score as well.
    ratio = max(0.0, min(score / max_score, 1.0)) if max_score else 0.0
    filled = int(ratio * 20)
    bar = "█" * filled + "░" * (20 - filled)
    color = "green" if ratio >= 0.7 else "yellow" if ratio >= 0.4 else "red"
    return f"[{color}]{bar}[/] {score:.2f}/{max_score}"
|
|
|
|
|
|
async def link_preview_demo(auto_mode: bool = False):
    """
    🔗 Link Preview/Peek Demo
    Showcases the 3-layer scoring system for intelligent link analysis.

    Runs three sub-demos:
      1. Score links on the Python asyncio docs against a query and render
         the top results in a table.
      2. "Research assistant" flow against scikit-learn.org with two
         different queries (the first intentionally finds little).
      3. A static walkthrough of how intrinsic, contextual, and total
         scores combine.

    Args:
        auto_mode: When True, skip the interactive "Press Enter" pauses
            and sleep briefly instead (for unattended runs).

    NOTE(review): performs live network crawls via AsyncWebCrawler, so
    results depend on the remote sites being reachable.
    """
    print_banner(
        "🔗 LINK PREVIEW & INTELLIGENT SCORING",
        "Advanced link analysis with intrinsic, contextual, and total scoring"
    )

    # Explain the feature
    console.print(Panel(
        "[bold]What is Link Preview?[/bold]\n\n"
        "Link Preview analyzes links on a page with a sophisticated 3-layer scoring system:\n\n"
        "• [cyan]Intrinsic Score[/cyan]: Quality based on link text, position, and attributes (0-10)\n"
        "• [magenta]Contextual Score[/magenta]: Relevance to your query using semantic analysis (0-1)\n"
        "• [green]Total Score[/green]: Combined score for intelligent prioritization\n\n"
        "This helps you find the most relevant and high-quality links automatically!",
        title="Feature Overview",
        border_style="blue"
    ))

    # Brief pause so the viewer can read the overview panel
    await asyncio.sleep(2)

    # Demo 1: Basic link analysis with visual scoring
    console.print("\n[bold yellow]Demo 1: Analyzing Python Documentation Links[/bold yellow]\n")

    query = "async await coroutines tutorial"
    console.print(f"[cyan]🔍 Query:[/cyan] [bold]{query}[/bold]")
    console.print("[dim]Looking for links related to asynchronous programming...[/dim]\n")

    config = CrawlerRunConfig(
        link_preview_config=LinkPreviewConfig(
            include_internal=True,
            include_external=False,
            max_links=10,
            concurrency=5,
            query=query,  # Our search context
            verbose=False  # We'll handle the display
        ),
        score_links=True,
        only_text=True
    )

    # Create a progress display
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console
    ) as progress:
        task = progress.add_task("[cyan]Crawling and analyzing links...", total=None)

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://docs.python.org/3/library/asyncio.html", config=config)

        progress.remove_task(task)

    if result.success:
        # Extract links with scores
        links = result.links.get("internal", [])
        # Keep only links that were actually analyzed (have head data and a score)
        scored_links = [l for l in links if l.get("head_data") and l.get("total_score")]

        # Sort by total score
        scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)

        # Create a beautiful table for results
        table = Table(
            title="🎯 Top Scored Links",
            box=box.ROUNDED,
            show_lines=True,
            title_style="bold magenta"
        )

        table.add_column("Rank", style="cyan", width=6)
        table.add_column("Link Text", style="white", width=40)
        table.add_column("Intrinsic Score", width=25)
        table.add_column("Contextual Score", width=25)
        table.add_column("Total Score", style="bold", width=15)

        for i, link in enumerate(scored_links[:5], 1):
            intrinsic = link.get('intrinsic_score', 0)
            contextual = link.get('contextual_score', 0)
            total = link.get('total_score', 0)

            # Get link text and title (truncate long anchor text for the table)
            text = link.get('text', '')[:35] + "..." if len(link.get('text', '')) > 35 else link.get('text', '')
            title = link.get('head_data', {}).get('title', 'No title')[:40]

            table.add_row(
                f"#{i}",
                text or title,
                create_score_bar(intrinsic, 10.0),
                create_score_bar(contextual, 1.0),
                f"[bold green]{total:.3f}[/bold green]"
            )

        console.print(table)

        # Show what makes a high-scoring link
        if scored_links:
            best_link = scored_links[0]
            console.print(f"\n[bold green]🏆 Best Match:[/bold green]")
            console.print(f"URL: [link]{best_link['href']}[/link]")
            console.print(f"Title: {best_link.get('head_data', {}).get('title', 'N/A')}")

            desc = best_link.get('head_data', {}).get('meta', {}).get('description', '')
            if desc:
                console.print(f"Description: [dim]{desc[:100]}...[/dim]")

    # Pause between demos unless running unattended
    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 2...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 2: Research Assistant Mode
    console.print("\n[bold yellow]Demo 2: Research Assistant - Finding Machine Learning Resources[/bold yellow]\n")

    # First query - will find no results
    query1 = "deep learning neural networks beginners tutorial"
    console.print(f"[cyan]🔍 Query 1:[/cyan] [bold]{query1}[/bold]")
    console.print("[dim]Note: scikit-learn focuses on traditional ML, not deep learning[/dim]\n")

    # Configure for research mode
    research_config = CrawlerRunConfig(
        link_preview_config=LinkPreviewConfig(
            include_internal=True,
            include_external=True,
            query=query1,
            max_links=20,
            score_threshold=0.3,  # Only high-relevance links
            concurrency=10
        ),
        score_links=True
    )

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console
    ) as progress:
        task = progress.add_task("[cyan]Discovering learning resources...", total=None)

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://scikit-learn.org/stable/", config=research_config)

        progress.remove_task(task)

    if result.success:
        all_links = result.links.get("internal", []) + result.links.get("external", [])
        # Filter for links with actual scores
        relevant_links = [l for l in all_links if l.get("total_score") is not None and l.get("total_score") > 0.3]
        relevant_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)

        console.print(f"[bold green]📚 Found {len(relevant_links)} highly relevant resources![/bold green]\n")

        # Group by score ranges
        excellent = [l for l in relevant_links if l.get("total_score", 0) > 0.7]
        good = [l for l in relevant_links if 0.5 <= l.get("total_score", 0) <= 0.7]
        fair = [l for l in relevant_links if 0.3 <= l.get("total_score", 0) < 0.5]

        if excellent:
            console.print("[bold green]⭐⭐⭐ Excellent Matches:[/bold green]")
            for link in excellent[:3]:
                title = link.get('head_data', {}).get('title', link.get('text', 'No title'))
                console.print(f" • {title[:60]}... [dim]({link.get('total_score', 0):.2f})[/dim]")

        if good:
            console.print("\n[yellow]⭐⭐ Good Matches:[/yellow]")
            for link in good[:3]:
                title = link.get('head_data', {}).get('title', link.get('text', 'No title'))
                console.print(f" • {title[:60]}... [dim]({link.get('total_score', 0):.2f})[/dim]")

    # Second query - will find results
    console.print("\n[bold cyan]Let's try a more relevant query for scikit-learn:[/bold cyan]\n")

    query2 = "machine learning classification tutorial examples"
    console.print(f"[cyan]🔍 Query 2:[/cyan] [bold]{query2}[/bold]")
    console.print("[dim]This should find relevant content about traditional ML[/dim]\n")

    research_config2 = CrawlerRunConfig(
        link_preview_config=LinkPreviewConfig(
            include_internal=True,
            include_external=True,
            query=query2,
            max_links=15,
            score_threshold=0.2,  # Slightly lower threshold
            concurrency=10
        ),
        score_links=True
    )

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console
    ) as progress:
        task = progress.add_task("[cyan]Finding ML tutorials...", total=None)

        async with AsyncWebCrawler() as crawler:
            result2 = await crawler.arun("https://scikit-learn.org/stable/", config=research_config2)

        progress.remove_task(task)

    if result2.success:
        all_links2 = result2.links.get("internal", []) + result2.links.get("external", [])
        relevant_links2 = [l for l in all_links2 if l.get("total_score") is not None and l.get("total_score") > 0.2]
        relevant_links2.sort(key=lambda x: x.get("total_score", 0), reverse=True)

        console.print(f"[bold green]📚 Now found {len(relevant_links2)} relevant resources![/bold green]\n")

        if relevant_links2:
            console.print("[bold]Top relevant links for ML tutorials:[/bold]")
            for i, link in enumerate(relevant_links2[:5], 1):
                title = link.get('head_data', {}).get('title', link.get('text', 'No title'))
                score = link.get('total_score', 0)
                console.print(f"{i}. [{score:.3f}] {title[:70]}...")

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 3...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 3: Live scoring visualization (no crawling; hard-coded examples)
    console.print("\n[bold yellow]Demo 3: Understanding the 3-Layer Scoring System[/bold yellow]\n")

    demo_query = "async programming tutorial"
    console.print(f"[cyan]🔍 Query:[/cyan] [bold]{demo_query}[/bold]")
    console.print("[dim]Let's see how different link types score against this query[/dim]\n")

    # Create a sample link analysis
    sample_links = [
        {
            "text": "Complete Guide to Async Programming",
            "intrinsic": 9.2,
            "contextual": 0.95,
            "factors": ["Strong keywords", "Title position", "Descriptive text"]
        },
        {
            "text": "API Reference",
            "intrinsic": 6.5,
            "contextual": 0.15,
            "factors": ["Common link text", "Navigation menu", "Low relevance"]
        },
        {
            "text": "Click here",
            "intrinsic": 2.1,
            "contextual": 0.05,
            "factors": ["Poor link text", "No context", "Generic anchor"]
        }
    ]

    for link in sample_links:
        # Illustrative blend: 40% normalized intrinsic + 60% contextual
        total = (link["intrinsic"] / 10 * 0.4) + (link["contextual"] * 0.6)

        panel_content = (
            f"[bold]Link Text:[/bold] {link['text']}\n\n"
            f"[cyan]Intrinsic Score:[/cyan] {create_score_bar(link['intrinsic'], 10.0)}\n"
            f"[magenta]Contextual Score:[/magenta] {create_score_bar(link['contextual'], 1.0)}\n"
            f"[green]Total Score:[/green] {total:.3f}\n\n"
            f"[dim]Factors: {', '.join(link['factors'])}[/dim]"
        )

        console.print(Panel(
            panel_content,
            title=f"Link Analysis",
            border_style="blue" if total > 0.7 else "yellow" if total > 0.3 else "red"
        ))
        await asyncio.sleep(1)

    # Summary
    console.print("\n[bold green]✨ Link Preview Benefits:[/bold green]")
    console.print("• Automatically finds the most relevant links for your research")
    console.print("• Saves time by prioritizing high-quality content")
    console.print("• Provides semantic understanding beyond simple keyword matching")
    console.print("• Enables intelligent crawling decisions\n")
|
|
|
|
|
|
async def adaptive_crawling_demo(auto_mode: bool = False):
    """
    🎯 Adaptive Crawling Demo
    Shows intelligent crawling that knows when to stop.

    Runs three sub-demos:
      1. Statistical strategy with a confidence meter and crawl statistics.
      2. Early stopping once a confidence threshold is reached.
      3. Exporting a knowledge base and re-importing it to resume research.

    Args:
        auto_mode: When True, skip the interactive "Press Enter" pauses and
            sleep briefly instead (for unattended runs).

    Fix over the original: per-page timing divided by len(result.crawled_urls)
    unguarded, raising ZeroDivisionError when the crawl yields no pages; the
    divisors are now clamped with max(..., 1).

    NOTE(review): performs live network crawls via AsyncWebCrawler, so
    results depend on the remote sites being reachable.
    """
    print_banner(
        "🎯 ADAPTIVE CRAWLING",
        "Intelligent crawling that knows when it has enough information"
    )

    # Explain the feature
    console.print(Panel(
        "[bold]What is Adaptive Crawling?[/bold]\n\n"
        "Adaptive Crawling intelligently determines when sufficient information has been gathered:\n\n"
        "• [cyan]Confidence Tracking[/cyan]: Monitors how well we understand the topic (0-100%)\n"
        "• [magenta]Smart Exploration[/magenta]: Follows most promising links based on relevance\n"
        "• [green]Early Stopping[/green]: Stops when confidence threshold is reached\n"
        "• [yellow]Two Strategies[/yellow]: Statistical (fast) vs Embedding (semantic)\n\n"
        "Perfect for research tasks where you need 'just enough' information!",
        title="Feature Overview",
        border_style="blue"
    ))

    await asyncio.sleep(2)

    # Demo 1: Basic adaptive crawling with confidence visualization
    console.print("\n[bold yellow]Demo 1: Statistical Strategy - Fast Topic Understanding[/bold yellow]\n")

    query = "Python async web scraping best practices"
    console.print(f"[cyan]🔍 Research Query:[/cyan] [bold]{query}[/bold]")
    console.print(f"[cyan]🎯 Goal:[/cyan] Gather enough information to understand the topic")
    console.print(f"[cyan]📊 Strategy:[/cyan] Statistical (keyword-based, fast)\n")

    # Configure adaptive crawler
    config = AdaptiveConfig(
        strategy="statistical",
        max_pages=3,  # Limit to 3 pages for demo
        confidence_threshold=0.7,  # Stop at 70% confidence
        top_k_links=2,  # Follow top 2 links per page
        min_gain_threshold=0.05  # Need 5% information gain to continue
    )

    async with AsyncWebCrawler(verbose=False) as crawler:
        adaptive = AdaptiveCrawler(crawler, config)

        # Create progress tracking
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console
        ) as progress:

            # Track crawling progress
            crawl_task = progress.add_task("[cyan]Starting adaptive crawl...", total=None)

            # Start crawling
            start_time = time.time()
            result = await adaptive.digest(
                start_url="https://docs.python.org/3/library/asyncio.html",
                query=query
            )
            elapsed = time.time() - start_time

            progress.remove_task(crawl_task)

        # Display results with visual confidence meter
        console.print(f"\n[bold green]✅ Crawling Complete in {elapsed:.1f} seconds![/bold green]\n")

        # Create confidence visualization (20-cell bar, 5% per cell)
        confidence = adaptive.confidence
        conf_percentage = int(confidence * 100)
        conf_bar = "█" * (conf_percentage // 5) + "░" * (20 - conf_percentage // 5)

        console.print(f"[bold]Confidence Level:[/bold] [{('green' if confidence >= 0.7 else 'yellow' if confidence >= 0.5 else 'red')}]{conf_bar}[/] {conf_percentage}%")

        # Show crawl statistics
        stats_table = Table(
            title="📊 Crawl Statistics",
            box=box.ROUNDED,
            show_lines=True
        )

        stats_table.add_column("Metric", style="cyan", width=25)
        stats_table.add_column("Value", style="white", width=20)

        stats_table.add_row("Pages Crawled", str(len(result.crawled_urls)))
        stats_table.add_row("Knowledge Base Size", f"{len(adaptive.state.knowledge_base)} documents")
        # Calculate total content from CrawlResult objects
        total_content = 0
        for doc in adaptive.state.knowledge_base:
            if hasattr(doc, 'markdown') and doc.markdown and hasattr(doc.markdown, 'raw_markdown'):
                total_content += len(doc.markdown.raw_markdown)
        stats_table.add_row("Total Content", f"{total_content:,} chars")
        # max(..., 1) guards against ZeroDivisionError when nothing was crawled
        stats_table.add_row("Time per Page", f"{elapsed / max(len(result.crawled_urls), 1):.2f}s")

        console.print(stats_table)

        # Show top relevant pages
        console.print("\n[bold]🏆 Most Relevant Pages Found:[/bold]")
        relevant_pages = adaptive.get_relevant_content(top_k=3)
        for i, page in enumerate(relevant_pages, 1):
            console.print(f"\n{i}. [bold]{page['url']}[/bold]")
            console.print(f" Relevance: {page['score']:.2%}")

            # Show key information extracted
            content = page['content'] or ""
            if content:
                # Find most relevant sentence
                sentences = [s.strip() for s in content.split('.') if s.strip()]
                if sentences:
                    console.print(f" [dim]Key insight: {sentences[0]}...[/dim]")

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 2...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 2: Early Stopping Demonstration
    console.print("\n[bold yellow]Demo 2: Early Stopping - Stop When We Know Enough[/bold yellow]\n")

    query2 = "Python requests library tutorial"
    console.print(f"[cyan]🔍 Research Query:[/cyan] [bold]{query2}[/bold]")
    console.print(f"[cyan]🎯 Goal:[/cyan] Stop as soon as we reach 60% confidence")
    console.print("[dim]Watch how adaptive crawling stops early when it has enough info[/dim]\n")

    # Configure for early stopping
    early_stop_config = AdaptiveConfig(
        strategy="statistical",
        max_pages=10,  # Allow up to 10, but will stop early
        confidence_threshold=0.6,  # Lower threshold for demo
        top_k_links=2
    )

    async with AsyncWebCrawler(verbose=False) as crawler:
        adaptive_early = AdaptiveCrawler(crawler, early_stop_config)

        # Track progress
        console.print("[cyan]Starting crawl with early stopping enabled...[/cyan]")
        start_time = time.time()

        result = await adaptive_early.digest(
            start_url="https://docs.python-requests.org/en/latest/",
            query=query2
        )

        elapsed = time.time() - start_time

        # Show results
        console.print(f"\n[bold green]✅ Stopped early at {int(adaptive_early.confidence * 100)}% confidence![/bold green]")
        console.print(f"• Crawled only {len(result.crawled_urls)} pages (max was 10)")
        console.print(f"• Saved time: ~{elapsed:.1f}s total")
        # max(..., 1) guards against ZeroDivisionError when nothing was crawled
        console.print(f"• Efficiency: {elapsed / max(len(result.crawled_urls), 1):.1f}s per page\n")

        # Show why it stopped
        if adaptive_early.confidence >= 0.6:
            console.print("[green]✓ Reached confidence threshold - no need to crawl more![/green]")
        else:
            console.print("[yellow]⚠ Hit max pages limit before reaching threshold[/yellow]")

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 3...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 3: Knowledge Base Export/Import
    console.print("\n[bold yellow]Demo 3: Knowledge Base Export & Import[/bold yellow]\n")

    query3 = "Python decorators tutorial"
    console.print(f"[cyan]🔍 Research Query:[/cyan] [bold]{query3}[/bold]")
    console.print("[dim]Build knowledge base, export it, then import for continued research[/dim]\n")

    # First crawl - build knowledge base
    export_config = AdaptiveConfig(
        strategy="statistical",
        max_pages=2,  # Small for demo
        confidence_threshold=0.5
    )

    async with AsyncWebCrawler(verbose=False) as crawler:
        # Phase 1: Initial research
        console.print("[bold]Phase 1: Initial Research[/bold]")
        adaptive1 = AdaptiveCrawler(crawler, export_config)

        result1 = await adaptive1.digest(
            start_url="https://realpython.com/",
            query=query3
        )

        console.print(f"✓ Built knowledge base with {len(adaptive1.state.knowledge_base)} documents")
        console.print(f"✓ Confidence: {int(adaptive1.confidence * 100)}%\n")

        # Export knowledge base
        console.print("[bold]💾 Exporting Knowledge Base:[/bold]")
        kb_export = adaptive1.export_knowledge_base()

        export_stats = {
            "documents": len(kb_export['documents']),
            "urls": len(kb_export['visited_urls']),
            "size": len(json.dumps(kb_export)),
            "confidence": kb_export['confidence']
        }

        # ints are comma-formatted; the confidence float is shown as a percentage
        for key, value in export_stats.items():
            console.print(f"• {key.capitalize()}: {value:,}" if isinstance(value, int) else f"• {key.capitalize()}: {value:.2%}")

        # Phase 2: Import and continue
        console.print("\n[bold]Phase 2: Import & Continue Research[/bold]")
        adaptive2 = AdaptiveCrawler(crawler, export_config)

        # Import the knowledge base
        adaptive2.import_knowledge_base(kb_export)
        console.print(f"✓ Imported {len(adaptive2.state.knowledge_base)} documents")
        console.print(f"✓ Starting confidence: {int(adaptive2.confidence * 100)}%")

        # Continue research from a different starting point
        console.print("\n[cyan]Continuing research from a different angle...[/cyan]")
        result2 = await adaptive2.digest(
            start_url="https://docs.python.org/3/glossary.html#term-decorator",
            query=query3
        )

        console.print(f"\n[bold green]✅ Research Complete![/bold green]")
        console.print(f"• Total documents: {len(adaptive2.state.knowledge_base)}")
        console.print(f"• Final confidence: {int(adaptive2.confidence * 100)}%")
        console.print(f"• Knowledge preserved across sessions!")

    # Summary
    console.print("\n[bold green]✨ Adaptive Crawling Benefits:[/bold green]")
    console.print("• Automatically stops when enough information is gathered")
    console.print("• Follows most promising links based on relevance")
    console.print("• Saves time and resources with intelligent exploration")
    console.print("• Export/import knowledge bases for continued research")
    console.print("• Choose strategy based on needs: speed vs semantic understanding\n")
|
|
|
|
|
|
async def virtual_scroll_demo(auto_mode: bool = False):
    """
    📜 Virtual Scroll Demo
    Shows how to capture content from modern infinite scroll pages.

    Spins up a local HTTP server over the ./assets test pages, then runs
    three sub-demos: (1) Twitter-like feed crawled with and without virtual
    scroll, (2) Instagram-style grid, (3) a syntax-highlighted code
    walkthrough. The server is always stopped in the finally block.

    Args:
        auto_mode: When True, run headless, shorten waits, and skip the
            interactive "Press Enter" pauses.

    Fix over the original: the "Nx more content" line applied the ``:.1f``
    format spec to a conditional that could yield the string 'N/A' (when
    zero tweets were captured without virtual scroll), raising ValueError;
    the ratio is now formatted separately.
    """
    import os
    import http.server
    import socketserver
    import threading
    from pathlib import Path

    print_banner(
        "📜 VIRTUAL SCROLL SUPPORT",
        "Capture all content from pages with DOM recycling"
    )

    # Explain the feature
    console.print(Panel(
        "[bold]What is Virtual Scroll?[/bold]\n\n"
        "Virtual Scroll handles modern web pages that use DOM recycling techniques:\n\n"
        "• [cyan]Twitter/X-like feeds[/cyan]: Content replaced as you scroll\n"
        "• [magenta]Instagram grids[/magenta]: Visual content with virtualization\n"
        "• [green]News feeds[/green]: Mixed content with different behaviors\n"
        "• [yellow]Infinite scroll[/yellow]: Captures everything, not just visible\n\n"
        "Without this, you'd only get the initially visible content!",
        title="Feature Overview",
        border_style="blue"
    ))

    await asyncio.sleep(2)

    # Start test server with HTML examples
    ASSETS_DIR = Path(__file__).parent / "assets"

    class TestServer:
        """Simple HTTP server to serve our test HTML files"""

        def __init__(self, port=8080):
            self.port = port
            self.httpd = None
            self.server_thread = None

        async def start(self):
            """Start the test server"""
            Handler = http.server.SimpleHTTPRequestHandler

            # Save current directory and change to assets directory
            # (SimpleHTTPRequestHandler serves from the process cwd)
            self.original_cwd = os.getcwd()
            os.chdir(ASSETS_DIR)

            # Try to find an available port (walk upward from the default)
            for _ in range(10):
                try:
                    self.httpd = socketserver.TCPServer(("", self.port), Handler)
                    break
                except OSError:
                    self.port += 1

            if self.httpd is None:
                raise RuntimeError("Could not find available port")

            # Daemon thread so a crashed demo never blocks interpreter exit
            self.server_thread = threading.Thread(target=self.httpd.serve_forever)
            self.server_thread.daemon = True
            self.server_thread.start()

            # Give server time to start
            await asyncio.sleep(0.5)

            console.print(f"[green]Test server started on http://localhost:{self.port}[/green]")
            return self.port

        def stop(self):
            """Stop the test server"""
            if self.httpd:
                self.httpd.shutdown()
            # Restore original directory
            if hasattr(self, 'original_cwd'):
                os.chdir(self.original_cwd)

    server = TestServer()
    port = await server.start()

    try:
        # Demo 1: Twitter-like virtual scroll (content REPLACED)
        console.print("\n[bold yellow]Demo 1: Twitter-like Virtual Scroll - Content Replaced[/bold yellow]\n")
        console.print("[cyan]This simulates Twitter/X where only visible tweets exist in DOM[/cyan]\n")

        url = f"http://localhost:{port}/virtual_scroll_twitter_like.html"

        # First, crawl WITHOUT virtual scroll
        console.print("[red]WITHOUT Virtual Scroll:[/red]")

        config_normal = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        # Visible browser in interactive mode so the scrolling can be watched
        browser_config = BrowserConfig(
            headless=False if not auto_mode else True,
            viewport={"width": 1280, "height": 800}
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result_normal = await crawler.arun(url=url, config=config_normal)

        # Count tweets (unique data-tweet-id attributes in the raw HTML)
        tweets_normal = len(set(re.findall(r'data-tweet-id="(\d+)"', result_normal.html)))
        console.print(f"• Captured only {tweets_normal} tweets (initial visible)")
        console.print(f"• HTML size: {len(result_normal.html):,} bytes\n")

        # Then, crawl WITH virtual scroll
        console.print("[green]WITH Virtual Scroll:[/green]")

        virtual_config = VirtualScrollConfig(
            container_selector="#timeline",
            scroll_count=50,
            scroll_by="container_height",
            wait_after_scroll=0.2
        )

        config_virtual = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result_virtual = await crawler.arun(url=url, config=config_virtual)

        tweets_virtual = len(set(re.findall(r'data-tweet-id="(\d+)"', result_virtual.html)))
        console.print(f"• Captured {tweets_virtual} tweets (all content)")
        console.print(f"• HTML size: {len(result_virtual.html):,} bytes")
        # Format the ratio outside the f-string: applying :.1f to the
        # 'N/A' string branch would raise ValueError.
        gain = f"{tweets_virtual / tweets_normal:.1f}" if tweets_normal > 0 else "N/A"
        console.print(f"• [bold]{gain}x more content![/bold]\n")

        if not auto_mode:
            console.print("\n[dim]Press Enter to continue to Demo 2...[/dim]")
            input()
        else:
            await asyncio.sleep(1)

        # Demo 2: Instagram Grid Example
        console.print("\n[bold yellow]Demo 2: Instagram Grid - Visual Grid Layout[/bold yellow]\n")
        console.print("[cyan]This shows how virtual scroll works with grid layouts[/cyan]\n")

        url2 = f"http://localhost:{port}/virtual_scroll_instagram_grid.html"

        # Configure for grid layout
        grid_config = VirtualScrollConfig(
            container_selector=".feed-container",
            scroll_count=100,  # Many scrolls for 999 posts
            scroll_by="container_height",
            wait_after_scroll=0.1 if auto_mode else 0.3
        )

        config = CrawlerRunConfig(
            virtual_scroll_config=grid_config,
            cache_mode=CacheMode.BYPASS,
            screenshot=True  # Take a screenshot
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url=url2, config=config)

        # Count posts in grid
        posts = re.findall(r'data-post-id="(\d+)"', result.html)
        unique_posts = sorted(set(int(id) for id in posts))

        console.print(f"[green]✅ Results:[/green]")
        console.print(f"• Posts captured: {len(unique_posts)} unique posts")
        if unique_posts:
            console.print(f"• Post IDs range: {min(unique_posts)} to {max(unique_posts)}")
            console.print(f"• Expected: 0 to 998 (999 posts total)")

            if len(unique_posts) >= 900:
                console.print(f"• [bold green]SUCCESS! Captured {len(unique_posts)/999*100:.1f}% of all posts[/bold green]")

        if not auto_mode:
            console.print("\n[dim]Press Enter to continue to Demo 3...[/dim]")
            input()
        else:
            await asyncio.sleep(1)

        # Demo 3: Show the actual code
        console.print("\n[bold yellow]Demo 3: The Code - How It Works[/bold yellow]\n")

        # Show the actual implementation
        code = '''# Example: Crawling Twitter-like feed with virtual scroll
url = "http://localhost:8080/virtual_scroll_twitter_like.html"

# Configure virtual scroll
virtual_config = VirtualScrollConfig(
    container_selector="#timeline",  # The scrollable container
    scroll_count=50,                 # Max number of scrolls
    scroll_by="container_height",    # Scroll by container height
    wait_after_scroll=0.3            # Wait 300ms after each scroll
)

config = CrawlerRunConfig(
    virtual_scroll_config=virtual_config,
    cache_mode=CacheMode.BYPASS
)

# Use headless=False to watch it work!
browser_config = BrowserConfig(
    headless=False,
    viewport={"width": 1280, "height": 800}
)

async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun(url=url, config=config)

    # Extract all tweets
    tweets = re.findall(r\'data-tweet-id="(\\d+)"\', result.html)
    unique_tweets = set(tweets)

    print(f"Captured {len(unique_tweets)} unique tweets!")
    print(f"Without virtual scroll: only ~10 tweets")
    print(f"With virtual scroll: all 500 tweets!")'''

        syntax = Syntax(code, "python", theme="monokai", line_numbers=True)
        console.print(Panel(syntax, title="Implementation", border_style="green"))

        # Summary
        console.print("\n[bold green]✨ Virtual Scroll Benefits:[/bold green]")
        console.print("• Captures ALL content, not just initially visible")
        console.print("• Handles Twitter, Instagram, LinkedIn, and more")
        console.print("• Smart scrolling with configurable parameters")
        console.print("• Essential for modern web scraping")
        console.print("• Works with any virtualized content\n")

    finally:
        # Stop the test server even if a demo step raised
        server.stop()
        console.print("[dim]Test server stopped[/dim]")
|
|
|
|
|
|
async def url_seeder_demo(auto_mode: bool = False) -> None:
    """
    🌱 URL Seeder Demo

    Shows intelligent URL discovery and filtering.

    Runs four live demos against realpython.com (requires network access):
      1. Discover every URL listed in the site's sitemap.
      2. Filter the discovered URLs by a glob-style pattern.
      3. Rank pattern-matched URLs by BM25 relevance to a text query.
      4. Full pipeline: discover -> filter/rank -> crawl the top result.

    Args:
        auto_mode: When True, skip the interactive "Press Enter" pauses
            and sleep briefly between demos instead.
    """
    print_banner(
        "🌱 URL SEEDER - INTELLIGENT URL DISCOVERY",
        "Pre-discover and filter URLs before crawling"
    )

    # Explain the feature
    console.print(Panel(
        "[bold]What is URL Seeder?[/bold]\n\n"
        "URL Seeder enables intelligent crawling at scale by pre-discovering URLs:\n\n"
        "• [cyan]Discovery[/cyan]: Find all URLs from sitemaps or by crawling\n"
        "• [magenta]Filtering[/magenta]: Filter by patterns, dates, or content\n"
        "• [green]Ranking[/green]: Score URLs by relevance (BM25 or semantic)\n"
        "• [yellow]Metadata[/yellow]: Extract head data without full crawl\n\n"
        "Perfect for targeted crawling of large websites!",
        title="Feature Overview",
        border_style="blue"
    ))

    # Short pause so the overview panel can be read before output scrolls on.
    await asyncio.sleep(2)

    # Demo 1: Basic URL discovery
    console.print("\n[bold yellow]Demo 1: Discover URLs from Sitemap[/bold yellow]\n")

    # Shared target site for all four demos below.
    target_site = "realpython.com"
    console.print(f"[cyan]🔍 Target:[/cyan] [bold]{target_site}[/bold]")
    console.print("[dim]Let's discover what content is available[/dim]\n")

    async with AsyncUrlSeeder() as seeder:
        # First, see total URLs available
        console.print("[cyan]Discovering ALL URLs from sitemap...[/cyan]")

        # seeder.urls() returns a list of dicts; each entry has at least a
        # 'url' key (and 'head_data'/'relevance_score' when requested later).
        all_urls = await seeder.urls(
            target_site,
            SeedingConfig(source="sitemap")
        )

        console.print(f"[green]✅ Found {len(all_urls)} total URLs![/green]\n")

        # Show URL categories by bucketing on URL path substrings.
        categories = {}
        for url_info in all_urls[:100]:  # Sample first 100
            url = url_info['url']
            if '/tutorials/' in url:
                categories['tutorials'] = categories.get('tutorials', 0) + 1
            elif '/python-' in url:
                categories['python-topics'] = categories.get('python-topics', 0) + 1
            elif '/courses/' in url:
                categories['courses'] = categories.get('courses', 0) + 1
            else:
                categories['other'] = categories.get('other', 0) + 1

        console.print("[bold]URL Categories (sample of first 100):[/bold]")
        # Largest buckets first.
        for cat, count in sorted(categories.items(), key=lambda x: x[1], reverse=True):
            console.print(f"• {cat}: {count} URLs")

    # Pause between demos: wait for the user, or sleep in auto mode.
    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 2...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 2: Pattern filtering
    console.print("\n[bold yellow]Demo 2: Filter URLs by Pattern[/bold yellow]\n")

    # Glob-style URL pattern understood by SeedingConfig.
    pattern = "*python-basics*"
    console.print(f"[cyan]🎯 Pattern:[/cyan] [bold]{pattern}[/bold]")
    console.print("[dim]Finding Python basics tutorials[/dim]\n")

    async with AsyncUrlSeeder() as seeder:
        filtered_urls = await seeder.urls(
            target_site,
            SeedingConfig(
                source="sitemap",
                pattern=pattern,
                max_urls=10
            )
        )

        console.print(f"[green]✅ Found {len(filtered_urls)} Python basics URLs:[/green]\n")

        for i, url_info in enumerate(filtered_urls[:5], 1):
            console.print(f"{i}. {url_info['url']}")

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 3...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 3: Smart search with BM25 ranking
    console.print("\n[bold yellow]Demo 3: Smart Search with BM25 Ranking[/bold yellow]\n")

    # Free-text query scored against page <head> metadata via BM25.
    query = "web scraping beautifulsoup tutorial"
    console.print(f"[cyan]🔍 Query:[/cyan] [bold]{query}[/bold]")
    console.print("[dim]Using BM25 to find most relevant content[/dim]\n")

    async with AsyncUrlSeeder() as seeder:
        # Search with relevance scoring
        results = await seeder.urls(
            target_site,
            SeedingConfig(
                source="sitemap",
                pattern="*beautiful-soup*",  # Find Beautiful Soup pages
                extract_head=True,  # Get metadata
                query=query,
                scoring_method="bm25",
                # No threshold - show all results ranked by BM25
                max_urls=10
            )
        )

        console.print(f"[green]✅ Top {len(results)} most relevant results:[/green]\n")

        # Create a table for results
        table = Table(
            title="🎯 Relevance-Ranked Results",
            box=box.ROUNDED,
            show_lines=True
        )

        table.add_column("Rank", style="cyan", width=6)
        table.add_column("Score", style="yellow", width=8)
        table.add_column("Title", style="white", width=50)
        table.add_column("URL", style="dim", width=40)

        for i, result in enumerate(results[:5], 1):
            score = result.get('relevance_score', 0)
            title = result.get('head_data', {}).get('title', 'No title')[:50]
            # NOTE(review): [-2] assumes the URL ends with a trailing slash
            # (".../slug/"); a slash-less URL would yield the wrong segment.
            url = result['url'].split('/')[-2]  # Just the slug

            table.add_row(
                f"#{i}",
                f"{score:.3f}",
                title,
                f".../{url}/"
            )

        console.print(table)

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 4...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 4: Complete pipeline - Discover → Filter → Crawl
    console.print("\n[bold yellow]Demo 4: Complete Pipeline - Discover → Filter → Crawl[/bold yellow]\n")

    console.print("[cyan]Let's build a complete crawling pipeline:[/cyan]")
    console.print("1. Discover URLs about Python decorators")
    console.print("2. Filter and rank by relevance")
    console.print("3. Crawl top results\n")

    async with AsyncUrlSeeder() as seeder:
        # Step 1: Discover and filter
        console.print("[bold]Step 1: Discovering decorator tutorials...[/bold]")

        decorator_urls = await seeder.urls(
            target_site,
            SeedingConfig(
                source="sitemap",
                pattern="*decorator*",
                extract_head=True,
                query="python decorators tutorial examples",
                scoring_method="bm25",
                max_urls=5
            )
        )

        console.print(f"Found {len(decorator_urls)} relevant URLs\n")

        # Step 2: Show what we'll crawl
        console.print("[bold]Step 2: URLs to crawl (ranked by relevance):[/bold]")
        urls_to_crawl = []
        for i, url_info in enumerate(decorator_urls[:3], 1):
            urls_to_crawl.append(url_info['url'])
            title = url_info.get('head_data', {}).get('title', 'No title')
            console.print(f"{i}. {title[:60]}...")
            console.print(f" [dim]{url_info['url']}[/dim]")

        # Step 3: Crawl them
        console.print("\n[bold]Step 3: Crawling selected URLs...[/bold]")

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                only_text=True,
                cache_mode=CacheMode.BYPASS
            )

            # Crawl just the first URL for demo
            if urls_to_crawl:
                console.print(f"\n[dim]Crawling first URL: {urls_to_crawl[0]}[/dim]")
                result = await crawler.arun(urls_to_crawl[0], config=config)

                if result.success:
                    console.print(f"\n[green]✅ Successfully crawled the page![/green]")
                    console.print("\n[bold]Sample content:[/bold]")
                    # Flatten newlines so the 300-char preview stays on one line.
                    content = result.markdown.raw_markdown[:300].replace('\n', ' ')
                    console.print(f"[dim]{content}...[/dim]")
                else:
                    console.print(f"[red]Failed to crawl: {result.error_message}[/red]")

    # Show code example (display-only snippet, not executed here).
    console.print("\n[bold yellow]Code Example:[/bold yellow]\n")

    code = '''# Complete URL Seeder pipeline
async with AsyncUrlSeeder() as seeder:
    # 1. Discover and filter URLs
    urls = await seeder.urls(
        "example.com",
        SeedingConfig(
            source="sitemap",            # or "crawl"
            pattern="*tutorial*",        # URL pattern
            extract_head=True,           # Get metadata
            query="python web scraping", # Search query
            scoring_method="bm25",       # Ranking method
            score_threshold=0.2,         # Quality filter
            max_urls=10                  # Max URLs
        )
    )

    # 2. Extract just the URLs
    urls_to_crawl = [u["url"] for u in urls[:5]]

    # 3. Crawl them efficiently
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls_to_crawl)

        async for result in results:
            if result.success:
                print(f"Crawled: {result.url}")
                # Process content...'''

    syntax = Syntax(code, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, title="Implementation", border_style="green"))

    # Summary
    console.print("\n[bold green]✨ URL Seeder Benefits:[/bold green]")
    console.print("• Pre-discover URLs before crawling - save time!")
    console.print("• Filter by patterns, dates, or content relevance")
    console.print("• Rank URLs by BM25 or semantic similarity")
    console.print("• Extract metadata without full crawl")
    console.print("• Perfect for large-scale targeted crawling\n")
|
|
|
|
|
async def c4a_script_demo(auto_mode: bool = False) -> None:
    """
    🎭 C4A Script Demo

    Shows the power of our domain-specific language for web automation.

    Runs five demos: basic transpilation to JavaScript, error reporting
    with suggestions, a larger e-commerce automation script, live
    integration with ``CrawlerRunConfig`` (direct ``c4a_script`` vs.
    pre-compiled ``js_code``), and a news-site configuration example.

    Args:
        auto_mode: When True, skip the interactive "Press Enter" pauses
            and sleep briefly between demos instead.
    """
    print_banner(
        "🎭 C4A SCRIPT - AUTOMATION MADE SIMPLE",
        "Domain-specific language for complex web interactions"
    )

    # Explain the feature
    console.print(Panel(
        "[bold]What is C4A Script?[/bold]\n\n"
        "C4A Script is a simple yet powerful language for web automation:\n\n"
        "• [cyan]English-like syntax[/cyan]: IF, CLICK, TYPE, WAIT - intuitive commands\n"
        "• [magenta]Smart transpiler[/magenta]: Converts to optimized JavaScript\n"
        "• [green]Error handling[/green]: Helpful error messages with suggestions\n"
        "• [yellow]Reusable procedures[/yellow]: Build complex workflows easily\n\n"
        "Perfect for automating logins, handling popups, pagination, and more!",
        title="Feature Overview",
        border_style="blue"
    ))

    # Let the overview panel sit on screen briefly.
    await asyncio.sleep(2)

    # Demo 1: Basic transpilation demonstration
    console.print("\n[bold yellow]Demo 1: Understanding C4A Script Transpilation[/bold yellow]\n")

    simple_script = """# Handle cookie banner and scroll
WAIT `body` 2
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
SCROLL DOWN 500
WAIT 1"""

    console.print("[cyan]C4A Script:[/cyan]")
    syntax = Syntax(simple_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, border_style="cyan"))

    # Compile it. c4a_compile is already imported at module level, so no
    # function-local re-import is needed here.
    console.print("\n[cyan]Transpiling to JavaScript...[/cyan]")
    result = c4a_compile(simple_script)

    if result.success:
        console.print("[green]✅ Compilation successful![/green]\n")
        console.print("[cyan]Generated JavaScript:[/cyan]")

        # result.js_code is a list of JS statements; join for display.
        js_display = "\n".join(result.js_code)
        js_syntax = Syntax(js_display, "javascript", theme="monokai", line_numbers=True)
        console.print(Panel(js_syntax, border_style="green"))

    # Pause between demos: wait for the user, or sleep in auto mode.
    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 2...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 2: Error handling showcase
    console.print("\n[bold yellow]Demo 2: Smart Error Detection & Suggestions[/bold yellow]\n")

    # Script with intentional errors (missing backticks, missing THEN).
    error_script = """WAIT body 2
CLICK button.submit
IF (EXISTS .modal) CLICK .close"""

    console.print("[cyan]C4A Script with errors:[/cyan]")
    syntax = Syntax(error_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, border_style="red"))

    console.print("\n[cyan]Compiling...[/cyan]")
    result = c4a_compile(error_script)

    if not result.success:
        console.print("[red]❌ Compilation failed (as expected)[/red]\n")

        # Show the first error with location, message and a caret marker.
        error = result.first_error
        console.print(f"[bold red]Error at line {error.line}, column {error.column}:[/bold red]")
        console.print(f"[yellow]{error.message}[/yellow]")
        console.print(f"\nProblematic code: [red]{error.source_line}[/red]")
        # 16 ≈ width of the "Problematic code: " prefix so the caret lands
        # under the offending column. NOTE(review): may be off by a couple
        # of columns depending on whether error.column is 0- or 1-based.
        console.print(" " * (16 + error.column) + "[red]^[/red]")

        if error.suggestions:
            console.print("\n[green]💡 Suggestions:[/green]")
            for suggestion in error.suggestions:
                console.print(f" • {suggestion.message}")

    # Show the fixed version
    fixed_script = """WAIT `body` 2
CLICK `button.submit`
IF (EXISTS `.modal`) THEN CLICK `.close`"""

    console.print("\n[cyan]Fixed C4A Script:[/cyan]")
    syntax = Syntax(fixed_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, border_style="green"))

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 3...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 3: Real-world example - E-commerce automation
    console.print("\n[bold yellow]Demo 3: Real-World E-commerce Automation[/bold yellow]\n")

    console.print("[cyan]Scenario:[/cyan] Automate product search with smart handling\n")

    ecommerce_script = """# E-commerce Product Search Automation
# Define reusable procedures
PROC handle_popups
  # Close cookie banner if present
  IF (EXISTS `.cookie-notice`) THEN CLICK `.cookie-accept`

  # Close newsletter popup if it appears
  IF (EXISTS `#newsletter-modal`) THEN CLICK `.modal-close`
ENDPROC

PROC search_product
  # Click search box and type query
  CLICK `.search-input`
  TYPE "wireless headphones"
  PRESS Enter

  # Wait for results
  WAIT `.product-grid` 10
ENDPROC

# Main automation flow
SET max_products = 50

# Step 1: Navigate and handle popups
GO https://shop.example.com
WAIT `body` 3
handle_popups

# Step 2: Perform search
search_product

# Step 3: Load more products (infinite scroll)
REPEAT (SCROLL DOWN 1000, `document.querySelectorAll('.product-card').length < 50`)

# Step 4: Apply filters
IF (EXISTS `.filter-price`) THEN CLICK `input[data-filter="under-100"]`
WAIT 2

# Step 5: Extract product count
EVAL `console.log('Found ' + document.querySelectorAll('.product-card').length + ' products')`"""

    syntax = Syntax(ecommerce_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, title="E-commerce Automation Script", border_style="cyan"))

    # Compile and show results
    console.print("\n[cyan]Compiling automation script...[/cyan]")
    result = c4a_compile(ecommerce_script)

    if result.success:
        console.print(f"[green]✅ Successfully compiled to {len(result.js_code)} JavaScript statements![/green]")
        console.print("\n[bold]Script Analysis:[/bold]")
        console.print(f"• Procedures defined: {len(result.metadata.get('procedures', []))}")
        console.print(f"• Variables used: {len(result.metadata.get('variables', []))}")
        console.print(f"• Total commands: {result.metadata.get('total_commands', 0)}")

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 4...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 4: Integration with Crawl4AI - LIVE DEMO
    console.print("\n[bold yellow]Demo 4: Live Integration with Crawl4AI[/bold yellow]\n")

    console.print("[cyan]Let's see C4A Script in action with real web crawling![/cyan]\n")

    # Create a simple C4A script for demo
    live_script = """# Handle common website patterns
WAIT `body` 2
# Close cookie banner if exists
IF (EXISTS `.cookie-banner, .cookie-notice, #cookie-consent`) THEN CLICK `.accept, .agree, button[aria-label*="accept"]`
# Scroll to load content
SCROLL DOWN 500
WAIT 1"""

    console.print("[bold]Our C4A Script:[/bold]")
    syntax = Syntax(live_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, border_style="cyan"))

    # Method 1: Direct C4A Script usage
    console.print("\n[bold cyan]Method 1: Direct C4A Script Integration[/bold cyan]\n")

    try:
        # Import necessary components
        from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

        # Define extraction schema
        schema = {
            "name": "page_content",
            "selector": "body",
            "fields": {
                "title": {"selector": "h1, title", "type": "text"},
                "paragraphs": {"selector": "p", "type": "list", "fields": {"text": {"type": "text"}}},
                "links": {"selector": "a[href]", "type": "list", "fields": {"text": {"type": "text"}, "href": {"type": "attribute", "attribute": "href"}}}
            }
        }

        # Create config with C4A script; transpilation happens at crawl time.
        config = CrawlerRunConfig(
            c4a_script=live_script,
            extraction_strategy=JsonCssExtractionStrategy(schema),
            only_text=True,
            cache_mode=CacheMode.BYPASS
        )

        console.print("[green]✅ Config created with C4A script![/green]")
        console.print("[dim]The C4A script will be automatically transpiled when crawling[/dim]\n")

        # Show the actual code
        code_example1 = f'''# Live code that's actually running:
config = CrawlerRunConfig(
    c4a_script="""{live_script}""",
    extraction_strategy=JsonCssExtractionStrategy(schema),
    only_text=True,
    cache_mode=CacheMode.BYPASS
)

# This would run the crawler:
# async with AsyncWebCrawler() as crawler:
#     result = await crawler.arun("https://example.com", config=config)
#     print(f"Extracted {{len(result.extracted_content)}} items")'''

        syntax = Syntax(code_example1, "python", theme="monokai", line_numbers=True)
        console.print(Panel(syntax, title="Method 1: Direct Integration (Live Code)", border_style="green"))

    except Exception as e:
        # Demo should keep going even if the live config step fails.
        console.print(f"[red]Error in demo: {e}[/red]")

    if not auto_mode:
        console.print("\n[dim]Press Enter to see Method 2...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Method 2: Pre-compilation approach
    console.print("\n[bold cyan]Method 2: Pre-compile and Reuse[/bold cyan]\n")

    # Advanced script with procedures
    advanced_script = """# E-commerce automation with procedures
PROC handle_popups
  IF (EXISTS `.popup-overlay`) THEN CLICK `.popup-close`
  IF (EXISTS `#newsletter-modal`) THEN CLICK `.modal-dismiss`
ENDPROC

PROC load_all_products
  # Keep scrolling until no more products load
  REPEAT (SCROLL DOWN 1000, `document.querySelectorAll('.product').length < window.lastProductCount`)
  EVAL `window.lastProductCount = document.querySelectorAll('.product').length`
ENDPROC

# Main flow
WAIT `.products-container` 5
handle_popups
EVAL `window.lastProductCount = 0`
load_all_products"""

    console.print("[bold]Advanced C4A Script with Procedures:[/bold]")
    syntax = Syntax(advanced_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, border_style="cyan"))

    # Actually compile it
    console.print("\n[cyan]Compiling the script...[/cyan]")
    compilation_result = c4a_compile(advanced_script)

    if compilation_result.success:
        console.print(f"[green]✅ Successfully compiled to {len(compilation_result.js_code)} JavaScript statements![/green]\n")

        # Show first few JS statements
        console.print("[bold]Generated JavaScript (first 5 statements):[/bold]")
        js_preview = "\n".join(compilation_result.js_code[:5])
        if len(compilation_result.js_code) > 5:
            js_preview += f"\n... and {len(compilation_result.js_code) - 5} more statements"

        js_syntax = Syntax(js_preview, "javascript", theme="monokai", line_numbers=True)
        console.print(Panel(js_syntax, border_style="green"))

        # Create actual config with compiled code (built here to prove the
        # compiled statements are accepted by CrawlerRunConfig).
        config_with_js = CrawlerRunConfig(
            js_code=compilation_result.js_code,
            wait_for="css:.products-container",
            cache_mode=CacheMode.BYPASS
        )

        console.print("\n[green]✅ Config created with pre-compiled JavaScript![/green]")

        # Show the actual implementation
        code_example2 = f'''# Live code showing pre-compilation:
# Step 1: Compile once
result = c4a_compile(advanced_script)
if result.success:
    js_code = result.js_code  # {len(compilation_result.js_code)} statements generated

    # Step 2: Use compiled code multiple times
    config = CrawlerRunConfig(
        js_code=js_code,
        wait_for="css:.products-container",
        cache_mode=CacheMode.BYPASS
    )

    # Step 3: Run crawler with compiled code
    # async with AsyncWebCrawler() as crawler:
    #     # Can reuse js_code for multiple URLs
    #     for url in ["shop1.com", "shop2.com"]:
    #         result = await crawler.arun(url, config=config)
else:
    print(f"Compilation error: {{result.first_error.message}}")'''

        syntax = Syntax(code_example2, "python", theme="monokai", line_numbers=True)
        console.print(Panel(syntax, title="Method 2: Pre-compilation (Live Code)", border_style="green"))

    else:
        console.print(f"[red]Compilation failed: {compilation_result.first_error.message}[/red]")

    if not auto_mode:
        console.print("\n[dim]Press Enter to see a real-world example...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 5: Real-world example with actual crawling
    console.print("\n[bold yellow]Demo 5: Real-World Example - News Site Automation[/bold yellow]\n")

    news_script = """# News site content extraction
# Wait for main content
WAIT `article, .article-content, main` 5

# Handle common annoyances
IF (EXISTS `.cookie-notice`) THEN CLICK `button[class*="accept"]`
IF (EXISTS `.newsletter-popup`) THEN CLICK `.close, .dismiss`

# Expand "Read More" sections
IF (EXISTS `.read-more-button`) THEN CLICK `.read-more-button`

# Load comments if available
IF (EXISTS `.load-comments`) THEN CLICK `.load-comments`
WAIT 2"""

    console.print("[bold]News Site Automation Script:[/bold]")
    syntax = Syntax(news_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, border_style="cyan"))

    # Create and show actual working config (constructed to validate it;
    # the actual crawl is left to the usage example below).
    console.print("\n[cyan]Creating crawler configuration...[/cyan]")

    news_config = CrawlerRunConfig(
        c4a_script=news_script,
        wait_for="css:article",
        only_text=True,
        cache_mode=CacheMode.BYPASS
    )

    console.print("[green]✅ Configuration ready for crawling![/green]\n")

    # Show how to actually use it
    usage_example = '''# Complete working example:
async def crawl_news_site():
    """Crawl a news site with C4A automation"""

    async with AsyncWebCrawler(verbose=False) as crawler:
        result = await crawler.arun(
            url="https://example-news.com/article",
            config=CrawlerRunConfig(
                c4a_script=news_script,
                wait_for="css:article",
                only_text=True
            )
        )

        if result.success:
            print(f"✓ Crawled: {result.url}")
            print(f"✓ Content length: {len(result.markdown.raw_markdown)} chars")
            print(f"✓ Links found: {len(result.links.get('internal', []))} internal")

            # The C4A script ensured we:
            # - Handled cookie banners
            # - Expanded collapsed content
            # - Loaded dynamic comments
            # All automatically!

        return result

# Run it:
# result = await crawl_news_site()'''

    syntax = Syntax(usage_example, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, title="Complete Working Example", border_style="green"))

    # Summary
    console.print("\n[bold green]✨ What We Demonstrated:[/bold green]")
    console.print("• C4A Script transpiles to optimized JavaScript automatically")
    console.print("• Direct integration via `c4a_script` parameter - easiest approach")
    console.print("• Pre-compilation via `c4a_compile()` - best for reuse")
    console.print("• Real configs that you can copy and use immediately")
    console.print("• Actual code running, not just examples!\n")
|
|
|
|
|
|
async def interactive_menu() -> None:
    """Interactive menu to select demos.

    Loops until the user picks "0" (Exit). Choice "7" runs the stable
    demos back-to-back in auto mode; every other choice runs one demo
    interactively and returns to the menu afterwards.
    """
    from rich.prompt import Prompt

    # BUG FIX: entry "6" used to be a plain (non-async) lambda taking a
    # positional `auto` argument, so selecting it crashed with a TypeError
    # (unexpected keyword `auto_mode`, then awaiting None). Use a proper
    # async placeholder with the same signature as the real demos.
    async def _llm_context_placeholder(auto_mode=False):
        # Stand-in until the LLM Context Builder demo ships.
        console.print("[yellow]LLM Context demo coming soon![/yellow]")

    # Menu key -> (display name, awaitable demo taking auto_mode).
    # None marks entries with special handling below.
    demos = {
        "1": ("Link Preview & Scoring", link_preview_demo),
        "2": ("Adaptive Crawling", adaptive_crawling_demo),
        "3": ("Virtual Scroll", virtual_scroll_demo),
        "4": ("URL Seeder", url_seeder_demo),
        "5": ("C4A Script", c4a_script_demo),
        "6": ("LLM Context Builder", _llm_context_placeholder),
        "7": ("Run All Demos", None),  # Special case
        "0": ("Exit", None)
    }

    while True:
        # Clear screen for better presentation
        console.clear()

        print_banner(
            "🚀 CRAWL4AI v0.7.0 SHOWCASE",
            "Interactive Demo Menu"
        )

        console.print("\n[bold cyan]Select a demo to run:[/bold cyan]\n")

        for key, (name, _) in demos.items():
            if key == "0":
                # Exit entry rendered dimmed and set apart.
                console.print(f"\n[dim]{key}. {name}[/dim]")
            else:
                console.print(f"{key}. {name}")

        # Prompt.ask re-asks until the answer is one of the menu keys.
        choice = Prompt.ask("\n[bold]Enter your choice[/bold]", choices=list(demos.keys()))

        if choice == "0":
            console.print("\n[yellow]Thanks for exploring Crawl4AI v0.7.0![/yellow]")
            break
        elif choice == "7":
            # Run all demos
            console.clear()
            for key in ["1", "3", "4", "5"]:  # Link Preview, Virtual Scroll, URL Seeder, C4A Script
                name, demo_func = demos[key]
                if demo_func:
                    await demo_func(auto_mode=True)
                    console.print("\n[dim]Press Enter to continue...[/dim]")
                    input()
        else:
            name, demo_func = demos[choice]
            if demo_func:
                console.clear()
                await demo_func(auto_mode=False)
                console.print("\n[dim]Press Enter to return to menu...[/dim]")
                input()
|
|
|
|
|
|
async def main():
    """Entry point: dispatch the showcase based on command-line flags.

    ``--interactive``/``-i`` opens the selection menu, ``--auto`` runs
    every demo without pausing, and no flag walks through each demo with
    a "Press Enter" prompt between them.
    """
    import sys

    argv = sys.argv
    wants_menu = "--interactive" in argv or "-i" in argv
    wants_auto = "--auto" in argv

    if wants_menu:
        await interactive_menu()
        return

    if wants_auto:
        console.print("[yellow]Running in AUTO MODE - skipping user prompts[/yellow]\n")

        # Unattended run: two-second pause between demos, none after the
        # last one. Adaptive crawling is intentionally left out for now.
        unattended = [
            link_preview_demo,
            virtual_scroll_demo,
            url_seeder_demo,
            c4a_script_demo,
        ]
        for position, demo in enumerate(unattended):
            await demo(auto_mode=True)
            if position < len(unattended) - 1:
                await asyncio.sleep(2)
        return

    # Default: every demo in sequence, pausing for Enter between them.
    # Each entry pairs a demo with the prompt shown after it (None = last).
    prompted_sequence = [
        (link_preview_demo, "\n[dim]Press Enter to continue to Virtual Scroll demo...[/dim]"),
        (virtual_scroll_demo, "\n[dim]Press Enter to continue to URL Seeder demo...[/dim]"),
        (url_seeder_demo, "\n[dim]Press Enter to continue to C4A Script demo...[/dim]"),
        (c4a_script_demo, None),
    ]

    try:
        for demo, prompt_text in prompted_sequence:
            await demo(auto_mode=False)
            if prompt_text is not None:
                console.print(prompt_text)
                input()

        # TODO: Add other demos here (e.g. llm_context_demo) once ready.

        console.print("\n[bold green]✨ All demos completed![/bold green]")
        console.print("\nTo explore individual demos, run: [cyan]python crawl4ai_v0_7_0_showcase.py --interactive[/cyan]")

    except KeyboardInterrupt:
        console.print("\n[yellow]Demo interrupted by user[/yellow]")
    except Exception as e:
        console.print(f"\n[red]Error: {str(e)}[/red]")
        import traceback
        traceback.print_exc()
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # --help/-h prints usage and exits without launching any demo.
    help_requested = "--help" in sys.argv or "-h" in sys.argv

    if help_requested:
        usage_lines = (
            "\n[bold]Crawl4AI v0.7.0 Feature Showcase[/bold]\n",
            "Usage: python crawl4ai_v0_7_0_showcase.py [options]\n",
            "Options:",
            " --interactive, -i Interactive menu to select demos",
            " --auto Run all demos without user prompts",
            " --help, -h Show this help message\n",
            "Default: Run all demos with prompts between each\n",
        )
        for usage_line in usage_lines:
            console.print(usage_line)
    else:
        asyncio.run(main())