"""
🚀 Crawl4AI v0.7.0 Feature Showcase
=====================================
This demo showcases the major features introduced in v0.7.0:
1. Link Preview/Peek - Advanced link analysis with 3-layer scoring
2. Adaptive Crawling - Intelligent crawling with confidence tracking
3. Virtual Scroll - Capture content from modern infinite scroll pages
4. C4A Script - Domain-specific language for web automation
5. URL Seeder - Smart URL discovery and filtering
6. LLM Context Builder - 3D context for AI assistants

Let's explore each feature with practical examples!
"""

import asyncio
import json
import time
import re
from typing import List, Dict
from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.panel import Panel
from rich.syntax import Syntax
from rich.layout import Layout
from rich.live import Live
from rich import box

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
from crawl4ai import AsyncUrlSeeder, SeedingConfig
from crawl4ai import LinkPreviewConfig, VirtualScrollConfig
from crawl4ai import c4a_compile, CompilationResult

# Initialize Rich console for beautiful output
console = Console()
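
# Usage (see the __main__ block at the bottom of this file):
#   python crawl4ai_v0_7_0_showcase.py                 # run every demo, pausing between them
#   python crawl4ai_v0_7_0_showcase.py --interactive   # pick individual demos from a menu
#   python crawl4ai_v0_7_0_showcase.py --auto          # run everything without prompts
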
def print_banner(title: str, subtitle: str = ""):
    """Print a beautiful banner for each section"""
    console.print(f"\n[bold cyan]{'=' * 80}[/bold cyan]")
    console.print(f"[bold yellow]{title.center(80)}[/bold yellow]")
    if subtitle:
        console.print(f"[dim white]{subtitle.center(80)}[/dim white]")
    console.print(f"[bold cyan]{'=' * 80}[/bold cyan]\n")


def create_score_bar(score: float, max_score: float = 10.0) -> str:
    """Create a visual progress bar for scores"""
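    # For example, create_score_bar(8.0) returns roughly:
    #   "[green]████████████████░░░░[/] 8.00/10.0"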
    percentage = (score / max_score)
    filled = int(percentage * 20)
    bar = "█" * filled + "░" * (20 - filled)
    return f"[{'green' if score >= 7 else 'yellow' if score >= 4 else 'red'}]{bar}[/] {score:.2f}/{max_score}"


async def link_preview_demo(auto_mode=False):
    """
    🔗 Link Preview/Peek Demo
    Showcases the 3-layer scoring system for intelligent link analysis
    """
    print_banner(
        "🔗 LINK PREVIEW & INTELLIGENT SCORING",
        "Advanced link analysis with intrinsic, contextual, and total scoring"
    )

    # Explain the feature
    console.print(Panel(
        "[bold]What is Link Preview?[/bold]\n\n"
        "Link Preview analyzes links on a page with a sophisticated 3-layer scoring system:\n\n"
        "• [cyan]Intrinsic Score[/cyan]: Quality based on link text, position, and attributes (0-10)\n"
        "• [magenta]Contextual Score[/magenta]: Relevance to your query using semantic analysis (0-1)\n"
        "• [green]Total Score[/green]: Combined score for intelligent prioritization\n\n"
        "This helps you find the most relevant and high-quality links automatically!",
        title="Feature Overview",
        border_style="blue"
    ))

    await asyncio.sleep(2)

    # Demo 1: Basic link analysis with visual scoring
    console.print("\n[bold yellow]Demo 1: Analyzing Python Documentation Links[/bold yellow]\n")

    query = "async await coroutines tutorial"
    console.print(f"[cyan]🔍 Query:[/cyan] [bold]{query}[/bold]")
    console.print("[dim]Looking for links related to asynchronous programming...[/dim]\n")

    config = CrawlerRunConfig(
        link_preview_config=LinkPreviewConfig(
            include_internal=True,
            include_external=False,
            max_links=10,
            concurrency=5,
            query=query,  # Our search context
            verbose=False  # We'll handle the display
        ),
        score_links=True,
        only_text=True
    )

    # Create a progress display
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console
    ) as progress:
        task = progress.add_task("[cyan]Crawling and analyzing links...", total=None)

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://docs.python.org/3/library/asyncio.html", config=config)

        progress.remove_task(task)

    if result.success:
        # Extract links with scores
        links = result.links.get("internal", [])
        scored_links = [l for l in links if l.get("head_data") and l.get("total_score")]

        # Sort by total score
        scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)

        # Create a beautiful table for results
        table = Table(
            title="🎯 Top Scored Links",
            box=box.ROUNDED,
            show_lines=True,
            title_style="bold magenta"
        )

        table.add_column("Rank", style="cyan", width=6)
        table.add_column("Link Text", style="white", width=40)
        table.add_column("Intrinsic Score", width=25)
        table.add_column("Contextual Score", width=25)
        table.add_column("Total Score", style="bold", width=15)

        for i, link in enumerate(scored_links[:5], 1):
            intrinsic = link.get('intrinsic_score', 0)
            contextual = link.get('contextual_score', 0)
            total = link.get('total_score', 0)

            # Get link text and title
            text = link.get('text', '')[:35] + "..." if len(link.get('text', '')) > 35 else link.get('text', '')
            title = link.get('head_data', {}).get('title', 'No title')[:40]

            table.add_row(
                f"#{i}",
                text or title,
                create_score_bar(intrinsic, 10.0),
                create_score_bar(contextual, 1.0),
                f"[bold green]{total:.3f}[/bold green]"
            )

        console.print(table)

        # Show what makes a high-scoring link
        if scored_links:
            best_link = scored_links[0]
            console.print(f"\n[bold green]🏆 Best Match:[/bold green]")
            console.print(f"URL: [link]{best_link['href']}[/link]")
            console.print(f"Title: {best_link.get('head_data', {}).get('title', 'N/A')}")

            desc = best_link.get('head_data', {}).get('meta', {}).get('description', '')
            if desc:
                console.print(f"Description: [dim]{desc[:100]}...[/dim]")

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 2...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 2: Research Assistant Mode
    console.print("\n[bold yellow]Demo 2: Research Assistant - Finding Machine Learning Resources[/bold yellow]\n")

    # First query - will find no results
    query1 = "deep learning neural networks beginners tutorial"
    console.print(f"[cyan]🔍 Query 1:[/cyan] [bold]{query1}[/bold]")
    console.print("[dim]Note: scikit-learn focuses on traditional ML, not deep learning[/dim]\n")

    # Configure for research mode
    research_config = CrawlerRunConfig(
        link_preview_config=LinkPreviewConfig(
            include_internal=True,
            include_external=True,
            query=query1,
            max_links=20,
            score_threshold=0.3,  # Only high-relevance links
            concurrency=10
        ),
        score_links=True
    )

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console
    ) as progress:
        task = progress.add_task("[cyan]Discovering learning resources...", total=None)

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://scikit-learn.org/stable/", config=research_config)

        progress.remove_task(task)

    if result.success:
        all_links = result.links.get("internal", []) + result.links.get("external", [])
        # Filter for links with actual scores
        relevant_links = [l for l in all_links if l.get("total_score") is not None and l.get("total_score") > 0.3]
        relevant_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)

        console.print(f"[bold green]📚 Found {len(relevant_links)} highly relevant resources![/bold green]\n")

        # Group by score ranges
        excellent = [l for l in relevant_links if l.get("total_score", 0) > 0.7]
        good = [l for l in relevant_links if 0.5 <= l.get("total_score", 0) <= 0.7]
        fair = [l for l in relevant_links if 0.3 <= l.get("total_score", 0) < 0.5]

        if excellent:
            console.print("[bold green]⭐⭐⭐ Excellent Matches:[/bold green]")
            for link in excellent[:3]:
                title = link.get('head_data', {}).get('title', link.get('text', 'No title'))
                console.print(f" • {title[:60]}... [dim]({link.get('total_score', 0):.2f})[/dim]")

        if good:
            console.print("\n[yellow]⭐⭐ Good Matches:[/yellow]")
            for link in good[:3]:
                title = link.get('head_data', {}).get('title', link.get('text', 'No title'))
                console.print(f" • {title[:60]}... [dim]({link.get('total_score', 0):.2f})[/dim]")

    # Second query - will find results
    console.print("\n[bold cyan]Let's try a more relevant query for scikit-learn:[/bold cyan]\n")

    query2 = "machine learning classification tutorial examples"
    console.print(f"[cyan]🔍 Query 2:[/cyan] [bold]{query2}[/bold]")
    console.print("[dim]This should find relevant content about traditional ML[/dim]\n")

    research_config2 = CrawlerRunConfig(
        link_preview_config=LinkPreviewConfig(
            include_internal=True,
            include_external=True,
            query=query2,
            max_links=15,
            score_threshold=0.2,  # Slightly lower threshold
            concurrency=10
        ),
        score_links=True
    )

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console
    ) as progress:
        task = progress.add_task("[cyan]Finding ML tutorials...", total=None)

        async with AsyncWebCrawler() as crawler:
            result2 = await crawler.arun("https://scikit-learn.org/stable/", config=research_config2)

        progress.remove_task(task)

    if result2.success:
        all_links2 = result2.links.get("internal", []) + result2.links.get("external", [])
        relevant_links2 = [l for l in all_links2 if l.get("total_score") is not None and l.get("total_score") > 0.2]
        relevant_links2.sort(key=lambda x: x.get("total_score", 0), reverse=True)

        console.print(f"[bold green]📚 Now found {len(relevant_links2)} relevant resources![/bold green]\n")

        if relevant_links2:
            console.print("[bold]Top relevant links for ML tutorials:[/bold]")
            for i, link in enumerate(relevant_links2[:5], 1):
                title = link.get('head_data', {}).get('title', link.get('text', 'No title'))
                score = link.get('total_score', 0)
                console.print(f"{i}. [{score:.3f}] {title[:70]}...")

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 3...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 3: Live scoring visualization
    console.print("\n[bold yellow]Demo 3: Understanding the 3-Layer Scoring System[/bold yellow]\n")

    demo_query = "async programming tutorial"
    console.print(f"[cyan]🔍 Query:[/cyan] [bold]{demo_query}[/bold]")
    console.print("[dim]Let's see how different link types score against this query[/dim]\n")

    # Create a sample link analysis
    sample_links = [
        {
            "text": "Complete Guide to Async Programming",
            "intrinsic": 9.2,
            "contextual": 0.95,
            "factors": ["Strong keywords", "Title position", "Descriptive text"]
        },
        {
            "text": "API Reference",
            "intrinsic": 6.5,
            "contextual": 0.15,
            "factors": ["Common link text", "Navigation menu", "Low relevance"]
        },
        {
            "text": "Click here",
            "intrinsic": 2.1,
            "contextual": 0.05,
            "factors": ["Poor link text", "No context", "Generic anchor"]
        }
    ]

    for link in sample_links:
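        # Demo-only illustration: combine the two layers into one number.
        # The 40/60 weighting below is just for this visualization, not
        # necessarily the exact formula the library uses internally.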
        total = (link["intrinsic"] / 10 * 0.4) + (link["contextual"] * 0.6)

        panel_content = (
            f"[bold]Link Text:[/bold] {link['text']}\n\n"
            f"[cyan]Intrinsic Score:[/cyan] {create_score_bar(link['intrinsic'], 10.0)}\n"
            f"[magenta]Contextual Score:[/magenta] {create_score_bar(link['contextual'], 1.0)}\n"
            f"[green]Total Score:[/green] {total:.3f}\n\n"
            f"[dim]Factors: {', '.join(link['factors'])}[/dim]"
        )

        console.print(Panel(
            panel_content,
            title=f"Link Analysis",
            border_style="blue" if total > 0.7 else "yellow" if total > 0.3 else "red"
        ))
        await asyncio.sleep(1)

    # Summary
    console.print("\n[bold green]✨ Link Preview Benefits:[/bold green]")
    console.print("• Automatically finds the most relevant links for your research")
    console.print("• Saves time by prioritizing high-quality content")
    console.print("• Provides semantic understanding beyond simple keyword matching")
    console.print("• Enables intelligent crawling decisions\n")


async def adaptive_crawling_demo(auto_mode=False):
    """
    🎯 Adaptive Crawling Demo
    Shows intelligent crawling that knows when to stop
    """
    print_banner(
        "🎯 ADAPTIVE CRAWLING",
        "Intelligent crawling that knows when it has enough information"
    )

    # Explain the feature
    console.print(Panel(
        "[bold]What is Adaptive Crawling?[/bold]\n\n"
        "Adaptive Crawling intelligently determines when sufficient information has been gathered:\n\n"
        "• [cyan]Confidence Tracking[/cyan]: Monitors how well we understand the topic (0-100%)\n"
        "• [magenta]Smart Exploration[/magenta]: Follows most promising links based on relevance\n"
        "• [green]Early Stopping[/green]: Stops when confidence threshold is reached\n"
        "• [yellow]Two Strategies[/yellow]: Statistical (fast) vs Embedding (semantic)\n\n"
        "Perfect for research tasks where you need 'just enough' information!",
        title="Feature Overview",
        border_style="blue"
    ))

    await asyncio.sleep(2)

    # Demo 1: Basic adaptive crawling with confidence visualization
    console.print("\n[bold yellow]Demo 1: Statistical Strategy - Fast Topic Understanding[/bold yellow]\n")

    query = "Python async web scraping best practices"
    console.print(f"[cyan]🔍 Research Query:[/cyan] [bold]{query}[/bold]")
    console.print(f"[cyan]🎯 Goal:[/cyan] Gather enough information to understand the topic")
    console.print(f"[cyan]📊 Strategy:[/cyan] Statistical (keyword-based, fast)\n")

    # Configure adaptive crawler
    config = AdaptiveConfig(
        strategy="statistical",
        max_pages=3,  # Limit to 3 pages for demo
        confidence_threshold=0.7,  # Stop at 70% confidence
        top_k_links=2,  # Follow top 2 links per page
        min_gain_threshold=0.05  # Need 5% information gain to continue
    )

    async with AsyncWebCrawler(verbose=False) as crawler:
        adaptive = AdaptiveCrawler(crawler, config)

        # Create progress tracking
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console
        ) as progress:

            # Track crawling progress
            crawl_task = progress.add_task("[cyan]Starting adaptive crawl...", total=None)

            # Start crawling
            start_time = time.time()
            result = await adaptive.digest(
                start_url="https://docs.python.org/3/library/asyncio.html",
                query=query
            )
            elapsed = time.time() - start_time

            progress.remove_task(crawl_task)

        # Display results with visual confidence meter
        console.print(f"\n[bold green]✅ Crawling Complete in {elapsed:.1f} seconds![/bold green]\n")

        # Create confidence visualization
        confidence = adaptive.confidence
        conf_percentage = int(confidence * 100)
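        # Render confidence as a 20-character bar (each block covers roughly 5 percentage points)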
        conf_bar = "█" * (conf_percentage // 5) + "░" * (20 - conf_percentage // 5)

        console.print(f"[bold]Confidence Level:[/bold] [{('green' if confidence >= 0.7 else 'yellow' if confidence >= 0.5 else 'red')}]{conf_bar}[/] {conf_percentage}%")

        # Show crawl statistics
        stats_table = Table(
            title="📊 Crawl Statistics",
            box=box.ROUNDED,
            show_lines=True
        )

        stats_table.add_column("Metric", style="cyan", width=25)
        stats_table.add_column("Value", style="white", width=20)

        stats_table.add_row("Pages Crawled", str(len(result.crawled_urls)))
        stats_table.add_row("Knowledge Base Size", f"{len(adaptive.state.knowledge_base)} documents")
        # Calculate total content from CrawlResult objects
        total_content = 0
        for doc in adaptive.state.knowledge_base:
            if hasattr(doc, 'markdown') and doc.markdown and hasattr(doc.markdown, 'raw_markdown'):
                total_content += len(doc.markdown.raw_markdown)
        stats_table.add_row("Total Content", f"{total_content:,} chars")
        stats_table.add_row("Time per Page", f"{elapsed / len(result.crawled_urls):.2f}s")

        console.print(stats_table)

        # Show top relevant pages
        console.print("\n[bold]🏆 Most Relevant Pages Found:[/bold]")
        relevant_pages = adaptive.get_relevant_content(top_k=3)
        for i, page in enumerate(relevant_pages, 1):
            console.print(f"\n{i}. [bold]{page['url']}[/bold]")
            console.print(f" Relevance: {page['score']:.2%}")

            # Show key information extracted
            content = page['content'] or ""
            if content:
                # Find most relevant sentence
                sentences = [s.strip() for s in content.split('.') if s.strip()]
                if sentences:
                    console.print(f" [dim]Key insight: {sentences[0]}...[/dim]")

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 2...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 2: Early Stopping Demonstration
    console.print("\n[bold yellow]Demo 2: Early Stopping - Stop When We Know Enough[/bold yellow]\n")

    query2 = "Python requests library tutorial"
    console.print(f"[cyan]🔍 Research Query:[/cyan] [bold]{query2}[/bold]")
    console.print(f"[cyan]🎯 Goal:[/cyan] Stop as soon as we reach 60% confidence")
    console.print("[dim]Watch how adaptive crawling stops early when it has enough info[/dim]\n")

    # Configure for early stopping
    early_stop_config = AdaptiveConfig(
        strategy="statistical",
        max_pages=10,  # Allow up to 10, but will stop early
        confidence_threshold=0.6,  # Lower threshold for demo
        top_k_links=2
    )

    async with AsyncWebCrawler(verbose=False) as crawler:
        adaptive_early = AdaptiveCrawler(crawler, early_stop_config)

        # Track progress
        console.print("[cyan]Starting crawl with early stopping enabled...[/cyan]")
        start_time = time.time()

        result = await adaptive_early.digest(
            start_url="https://docs.python-requests.org/en/latest/",
            query=query2
        )

        elapsed = time.time() - start_time

        # Show results
        console.print(f"\n[bold green]✅ Stopped early at {int(adaptive_early.confidence * 100)}% confidence![/bold green]")
        console.print(f"• Crawled only {len(result.crawled_urls)} pages (max was 10)")
        console.print(f"• Saved time: ~{elapsed:.1f}s total")
        console.print(f"• Efficiency: {elapsed / len(result.crawled_urls):.1f}s per page\n")

        # Show why it stopped
        if adaptive_early.confidence >= 0.6:
            console.print("[green]✓ Reached confidence threshold - no need to crawl more![/green]")
        else:
            console.print("[yellow]⚠ Hit max pages limit before reaching threshold[/yellow]")

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 3...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 3: Knowledge Base Export/Import
    console.print("\n[bold yellow]Demo 3: Knowledge Base Export & Import[/bold yellow]\n")

    query3 = "Python decorators tutorial"
    console.print(f"[cyan]🔍 Research Query:[/cyan] [bold]{query3}[/bold]")
    console.print("[dim]Build knowledge base, export it, then import for continued research[/dim]\n")

    # First crawl - build knowledge base
    export_config = AdaptiveConfig(
        strategy="statistical",
        max_pages=2,  # Small for demo
        confidence_threshold=0.5
    )

    async with AsyncWebCrawler(verbose=False) as crawler:
        # Phase 1: Initial research
        console.print("[bold]Phase 1: Initial Research[/bold]")
        adaptive1 = AdaptiveCrawler(crawler, export_config)

        result1 = await adaptive1.digest(
            start_url="https://realpython.com/",
            query=query3
        )

        console.print(f"✓ Built knowledge base with {len(adaptive1.state.knowledge_base)} documents")
        console.print(f"✓ Confidence: {int(adaptive1.confidence * 100)}%\n")

        # Export knowledge base
        console.print("[bold]💾 Exporting Knowledge Base:[/bold]")
        kb_export = adaptive1.export_knowledge_base()

        export_stats = {
            "documents": len(kb_export['documents']),
            "urls": len(kb_export['visited_urls']),
            "size": len(json.dumps(kb_export)),
            "confidence": kb_export['confidence']
        }

        for key, value in export_stats.items():
            console.print(f"• {key.capitalize()}: {value:,}" if isinstance(value, int) else f"• {key.capitalize()}: {value:.2%}")

        # Phase 2: Import and continue
        console.print("\n[bold]Phase 2: Import & Continue Research[/bold]")
        adaptive2 = AdaptiveCrawler(crawler, export_config)

        # Import the knowledge base
        adaptive2.import_knowledge_base(kb_export)
        console.print(f"✓ Imported {len(adaptive2.state.knowledge_base)} documents")
        console.print(f"✓ Starting confidence: {int(adaptive2.confidence * 100)}%")

        # Continue research from a different starting point
        console.print("\n[cyan]Continuing research from a different angle...[/cyan]")
        result2 = await adaptive2.digest(
            start_url="https://docs.python.org/3/glossary.html#term-decorator",
            query=query3
        )

        console.print(f"\n[bold green]✅ Research Complete![/bold green]")
        console.print(f"• Total documents: {len(adaptive2.state.knowledge_base)}")
        console.print(f"• Final confidence: {int(adaptive2.confidence * 100)}%")
        console.print(f"• Knowledge preserved across sessions!")

    # Summary
    console.print("\n[bold green]✨ Adaptive Crawling Benefits:[/bold green]")
    console.print("• Automatically stops when enough information is gathered")
    console.print("• Follows most promising links based on relevance")
    console.print("• Saves time and resources with intelligent exploration")
    console.print("• Export/import knowledge bases for continued research")
    console.print("• Choose strategy based on needs: speed vs semantic understanding\n")


async def virtual_scroll_demo(auto_mode=False):
    """
    📜 Virtual Scroll Demo
    Shows how to capture content from modern infinite scroll pages
    """
    import os
    import http.server
    import socketserver
    import threading
    from pathlib import Path

    print_banner(
        "📜 VIRTUAL SCROLL SUPPORT",
        "Capture all content from pages with DOM recycling"
    )

    # Explain the feature
    console.print(Panel(
        "[bold]What is Virtual Scroll?[/bold]\n\n"
        "Virtual Scroll handles modern web pages that use DOM recycling techniques:\n\n"
        "• [cyan]Twitter/X-like feeds[/cyan]: Content replaced as you scroll\n"
        "• [magenta]Instagram grids[/magenta]: Visual content with virtualization\n"
        "• [green]News feeds[/green]: Mixed content with different behaviors\n"
        "• [yellow]Infinite scroll[/yellow]: Captures everything, not just visible\n\n"
        "Without this, you'd only get the initially visible content!",
        title="Feature Overview",
        border_style="blue"
    ))

    await asyncio.sleep(2)

    # Start test server with HTML examples
    ASSETS_DIR = Path(__file__).parent / "assets"

    class TestServer:
        """Simple HTTP server to serve our test HTML files"""

        def __init__(self, port=8080):
            self.port = port
            self.httpd = None
            self.server_thread = None

        async def start(self):
            """Start the test server"""
            Handler = http.server.SimpleHTTPRequestHandler

            # Save current directory and change to assets directory
            self.original_cwd = os.getcwd()
            os.chdir(ASSETS_DIR)

            # Try to find an available port
            for _ in range(10):
                try:
                    self.httpd = socketserver.TCPServer(("", self.port), Handler)
                    break
                except OSError:
                    self.port += 1

            if self.httpd is None:
                raise RuntimeError("Could not find available port")

            self.server_thread = threading.Thread(target=self.httpd.serve_forever)
            self.server_thread.daemon = True
            self.server_thread.start()

            # Give server time to start
            await asyncio.sleep(0.5)

            console.print(f"[green]Test server started on http://localhost:{self.port}[/green]")
            return self.port

        def stop(self):
            """Stop the test server"""
            if self.httpd:
                self.httpd.shutdown()
            # Restore original directory
            if hasattr(self, 'original_cwd'):
                os.chdir(self.original_cwd)

    server = TestServer()
    port = await server.start()

    try:
        # Demo 1: Twitter-like virtual scroll (content REPLACED)
        console.print("\n[bold yellow]Demo 1: Twitter-like Virtual Scroll - Content Replaced[/bold yellow]\n")
        console.print("[cyan]This simulates Twitter/X where only visible tweets exist in DOM[/cyan]\n")

        url = f"http://localhost:{port}/virtual_scroll_twitter_like.html"

        # First, crawl WITHOUT virtual scroll
        console.print("[red]WITHOUT Virtual Scroll:[/red]")

        config_normal = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
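        # Keep the browser window visible in interactive runs so the scrolling
        # is observable; auto mode stays headless.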
        browser_config = BrowserConfig(
            headless=False if not auto_mode else True,
            viewport={"width": 1280, "height": 800}
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result_normal = await crawler.arun(url=url, config=config_normal)

        # Count tweets
        tweets_normal = len(set(re.findall(r'data-tweet-id="(\d+)"', result_normal.html)))
        console.print(f"• Captured only {tweets_normal} tweets (initial visible)")
        console.print(f"• HTML size: {len(result_normal.html):,} bytes\n")

        # Then, crawl WITH virtual scroll
        console.print("[green]WITH Virtual Scroll:[/green]")

        virtual_config = VirtualScrollConfig(
            container_selector="#timeline",
            scroll_count=50,
            scroll_by="container_height",
            wait_after_scroll=0.2
        )

        config_virtual = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result_virtual = await crawler.arun(url=url, config=config_virtual)

        tweets_virtual = len(set(re.findall(r'data-tweet-id="(\d+)"', result_virtual.html)))
        console.print(f"• Captured {tweets_virtual} tweets (all content)")
        console.print(f"• HTML size: {len(result_virtual.html):,} bytes")
        # Guard against a zero baseline before computing the ratio
        ratio = f"{tweets_virtual / tweets_normal:.1f}x" if tweets_normal > 0 else "N/A"
        console.print(f"• [bold]{ratio} more content![/bold]\n")

        if not auto_mode:
            console.print("\n[dim]Press Enter to continue to Demo 2...[/dim]")
            input()
        else:
            await asyncio.sleep(1)

        # Demo 2: Instagram Grid Example
        console.print("\n[bold yellow]Demo 2: Instagram Grid - Visual Grid Layout[/bold yellow]\n")
        console.print("[cyan]This shows how virtual scroll works with grid layouts[/cyan]\n")

        url2 = f"http://localhost:{port}/virtual_scroll_instagram_grid.html"

        # Configure for grid layout
        grid_config = VirtualScrollConfig(
            container_selector=".feed-container",
            scroll_count=100,  # Many scrolls for 999 posts
            scroll_by="container_height",
            wait_after_scroll=0.1 if auto_mode else 0.3
        )

        config = CrawlerRunConfig(
            virtual_scroll_config=grid_config,
            cache_mode=CacheMode.BYPASS,
            screenshot=True  # Take a screenshot
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url=url2, config=config)

        # Count posts in grid
        posts = re.findall(r'data-post-id="(\d+)"', result.html)
        unique_posts = sorted(set(int(id) for id in posts))

        console.print(f"[green]✅ Results:[/green]")
        console.print(f"• Posts captured: {len(unique_posts)} unique posts")
        if unique_posts:
            console.print(f"• Post IDs range: {min(unique_posts)} to {max(unique_posts)}")
            console.print(f"• Expected: 0 to 998 (999 posts total)")

            if len(unique_posts) >= 900:
                console.print(f"• [bold green]SUCCESS! Captured {len(unique_posts)/999*100:.1f}% of all posts[/bold green]")

        if not auto_mode:
            console.print("\n[dim]Press Enter to continue to Demo 3...[/dim]")
            input()
        else:
            await asyncio.sleep(1)

        # Demo 3: Show the actual code
        console.print("\n[bold yellow]Demo 3: The Code - How It Works[/bold yellow]\n")

        # Show the actual implementation
        code = '''# Example: Crawling Twitter-like feed with virtual scroll
url = "http://localhost:8080/virtual_scroll_twitter_like.html"

# Configure virtual scroll
virtual_config = VirtualScrollConfig(
    container_selector="#timeline",   # The scrollable container
    scroll_count=50,                  # Max number of scrolls
    scroll_by="container_height",     # Scroll by container height
    wait_after_scroll=0.3             # Wait 300ms after each scroll
)

config = CrawlerRunConfig(
    virtual_scroll_config=virtual_config,
    cache_mode=CacheMode.BYPASS
)

# Use headless=False to watch it work!
browser_config = BrowserConfig(
    headless=False,
    viewport={"width": 1280, "height": 800}
)

async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun(url=url, config=config)

# Extract all tweets
tweets = re.findall(r\'data-tweet-id="(\\d+)"\', result.html)
unique_tweets = set(tweets)

print(f"Captured {len(unique_tweets)} unique tweets!")
print(f"Without virtual scroll: only ~10 tweets")
print(f"With virtual scroll: all 500 tweets!")'''

        syntax = Syntax(code, "python", theme="monokai", line_numbers=True)
        console.print(Panel(syntax, title="Implementation", border_style="green"))

        # Summary
        console.print("\n[bold green]✨ Virtual Scroll Benefits:[/bold green]")
        console.print("• Captures ALL content, not just initially visible")
        console.print("• Handles Twitter, Instagram, LinkedIn, and more")
        console.print("• Smart scrolling with configurable parameters")
        console.print("• Essential for modern web scraping")
        console.print("• Works with any virtualized content\n")

    finally:
        # Stop the test server
        server.stop()
        console.print("[dim]Test server stopped[/dim]")


async def url_seeder_demo(auto_mode=False):
    """
    🌱 URL Seeder Demo
    Shows intelligent URL discovery and filtering
    """
    print_banner(
        "🌱 URL SEEDER - INTELLIGENT URL DISCOVERY",
        "Pre-discover and filter URLs before crawling"
    )

    # Explain the feature
    console.print(Panel(
        "[bold]What is URL Seeder?[/bold]\n\n"
        "URL Seeder enables intelligent crawling at scale by pre-discovering URLs:\n\n"
        "• [cyan]Discovery[/cyan]: Find all URLs from sitemaps or by crawling\n"
        "• [magenta]Filtering[/magenta]: Filter by patterns, dates, or content\n"
        "• [green]Ranking[/green]: Score URLs by relevance (BM25 or semantic)\n"
        "• [yellow]Metadata[/yellow]: Extract head data without full crawl\n\n"
        "Perfect for targeted crawling of large websites!",
        title="Feature Overview",
        border_style="blue"
    ))

    await asyncio.sleep(2)

    # Demo 1: Basic URL discovery
    console.print("\n[bold yellow]Demo 1: Discover URLs from Sitemap[/bold yellow]\n")

    target_site = "realpython.com"
    console.print(f"[cyan]🔍 Target:[/cyan] [bold]{target_site}[/bold]")
    console.print("[dim]Let's discover what content is available[/dim]\n")

    async with AsyncUrlSeeder() as seeder:
        # First, see total URLs available
        console.print("[cyan]Discovering ALL URLs from sitemap...[/cyan]")

        all_urls = await seeder.urls(
            target_site,
            SeedingConfig(source="sitemap")
        )

        console.print(f"[green]✅ Found {len(all_urls)} total URLs![/green]\n")

        # Show URL categories
        categories = {}
        for url_info in all_urls[:100]:  # Sample first 100
            url = url_info['url']
            if '/tutorials/' in url:
                categories['tutorials'] = categories.get('tutorials', 0) + 1
            elif '/python-' in url:
                categories['python-topics'] = categories.get('python-topics', 0) + 1
            elif '/courses/' in url:
                categories['courses'] = categories.get('courses', 0) + 1
            else:
                categories['other'] = categories.get('other', 0) + 1

        console.print("[bold]URL Categories (sample of first 100):[/bold]")
        for cat, count in sorted(categories.items(), key=lambda x: x[1], reverse=True):
            console.print(f"• {cat}: {count} URLs")

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 2...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 2: Pattern filtering
    console.print("\n[bold yellow]Demo 2: Filter URLs by Pattern[/bold yellow]\n")

    pattern = "*python-basics*"
    console.print(f"[cyan]🎯 Pattern:[/cyan] [bold]{pattern}[/bold]")
    console.print("[dim]Finding Python basics tutorials[/dim]\n")

    async with AsyncUrlSeeder() as seeder:
        filtered_urls = await seeder.urls(
            target_site,
            SeedingConfig(
                source="sitemap",
                pattern=pattern,
                max_urls=10
            )
        )

        console.print(f"[green]✅ Found {len(filtered_urls)} Python basics URLs:[/green]\n")

        for i, url_info in enumerate(filtered_urls[:5], 1):
            console.print(f"{i}. {url_info['url']}")

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 3...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 3: Smart search with BM25 ranking
    console.print("\n[bold yellow]Demo 3: Smart Search with BM25 Ranking[/bold yellow]\n")

    query = "web scraping beautifulsoup tutorial"
    console.print(f"[cyan]🔍 Query:[/cyan] [bold]{query}[/bold]")
    console.print("[dim]Using BM25 to find most relevant content[/dim]\n")

    async with AsyncUrlSeeder() as seeder:
        # Search with relevance scoring
        results = await seeder.urls(
            target_site,
            SeedingConfig(
                source="sitemap",
                pattern="*beautiful-soup*",  # Find Beautiful Soup pages
                extract_head=True,  # Get metadata
                query=query,
                scoring_method="bm25",
                # No threshold - show all results ranked by BM25
                max_urls=10
            )
        )

        console.print(f"[green]✅ Top {len(results)} most relevant results:[/green]\n")

        # Create a table for results
        table = Table(
            title="🎯 Relevance-Ranked Results",
            box=box.ROUNDED,
            show_lines=True
        )

        table.add_column("Rank", style="cyan", width=6)
        table.add_column("Score", style="yellow", width=8)
        table.add_column("Title", style="white", width=50)
        table.add_column("URL", style="dim", width=40)

        for i, result in enumerate(results[:5], 1):
            score = result.get('relevance_score', 0)
            title = result.get('head_data', {}).get('title', 'No title')[:50]
            url = result['url'].split('/')[-2]  # Just the slug

            table.add_row(
                f"#{i}",
                f"{score:.3f}",
                title,
                f".../{url}/"
            )

        console.print(table)

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 4...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 4: Complete pipeline - Discover → Filter → Crawl
    console.print("\n[bold yellow]Demo 4: Complete Pipeline - Discover → Filter → Crawl[/bold yellow]\n")

    console.print("[cyan]Let's build a complete crawling pipeline:[/cyan]")
    console.print("1. Discover URLs about Python decorators")
    console.print("2. Filter and rank by relevance")
    console.print("3. Crawl top results\n")

    async with AsyncUrlSeeder() as seeder:
        # Step 1: Discover and filter
        console.print("[bold]Step 1: Discovering decorator tutorials...[/bold]")

        decorator_urls = await seeder.urls(
            target_site,
            SeedingConfig(
                source="sitemap",
                pattern="*decorator*",
                extract_head=True,
                query="python decorators tutorial examples",
                scoring_method="bm25",
                max_urls=5
            )
        )

        console.print(f"Found {len(decorator_urls)} relevant URLs\n")

        # Step 2: Show what we'll crawl
        console.print("[bold]Step 2: URLs to crawl (ranked by relevance):[/bold]")
        urls_to_crawl = []
        for i, url_info in enumerate(decorator_urls[:3], 1):
            urls_to_crawl.append(url_info['url'])
            title = url_info.get('head_data', {}).get('title', 'No title')
            console.print(f"{i}. {title[:60]}...")
            console.print(f" [dim]{url_info['url']}[/dim]")

        # Step 3: Crawl them
        console.print("\n[bold]Step 3: Crawling selected URLs...[/bold]")

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                only_text=True,
                cache_mode=CacheMode.BYPASS
            )

            # Crawl just the first URL for demo
            if urls_to_crawl:
                console.print(f"\n[dim]Crawling first URL: {urls_to_crawl[0]}[/dim]")
                result = await crawler.arun(urls_to_crawl[0], config=config)

                if result.success:
                    console.print(f"\n[green]✅ Successfully crawled the page![/green]")
                    console.print("\n[bold]Sample content:[/bold]")
                    content = result.markdown.raw_markdown[:300].replace('\n', ' ')
                    console.print(f"[dim]{content}...[/dim]")
                else:
                    console.print(f"[red]Failed to crawl: {result.error_message}[/red]")

    # Show code example
    console.print("\n[bold yellow]Code Example:[/bold yellow]\n")

    code = '''# Complete URL Seeder pipeline
async with AsyncUrlSeeder() as seeder:
    # 1. Discover and filter URLs
    urls = await seeder.urls(
        "example.com",
        SeedingConfig(
            source="sitemap",              # or "crawl"
            pattern="*tutorial*",          # URL pattern
            extract_head=True,             # Get metadata
            query="python web scraping",   # Search query
            scoring_method="bm25",         # Ranking method
            score_threshold=0.2,           # Quality filter
            max_urls=10                    # Max URLs
        )
    )

    # 2. Extract just the URLs
    urls_to_crawl = [u["url"] for u in urls[:5]]

    # 3. Crawl them efficiently
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls_to_crawl)

        async for result in results:
            if result.success:
                print(f"Crawled: {result.url}")
                # Process content...'''

    syntax = Syntax(code, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, title="Implementation", border_style="green"))

    # Summary
    console.print("\n[bold green]✨ URL Seeder Benefits:[/bold green]")
    console.print("• Pre-discover URLs before crawling - save time!")
    console.print("• Filter by patterns, dates, or content relevance")
    console.print("• Rank URLs by BM25 or semantic similarity")
    console.print("• Extract metadata without full crawl")
    console.print("• Perfect for large-scale targeted crawling\n")


async def c4a_script_demo(auto_mode=False):
    """
    🎭 C4A Script Demo
    Shows the power of our domain-specific language for web automation
    """
    print_banner(
        "🎭 C4A SCRIPT - AUTOMATION MADE SIMPLE",
        "Domain-specific language for complex web interactions"
    )

    # Explain the feature
    console.print(Panel(
        "[bold]What is C4A Script?[/bold]\n\n"
        "C4A Script is a simple yet powerful language for web automation:\n\n"
        "• [cyan]English-like syntax[/cyan]: IF, CLICK, TYPE, WAIT - intuitive commands\n"
        "• [magenta]Smart transpiler[/magenta]: Converts to optimized JavaScript\n"
        "• [green]Error handling[/green]: Helpful error messages with suggestions\n"
        "• [yellow]Reusable procedures[/yellow]: Build complex workflows easily\n\n"
        "Perfect for automating logins, handling popups, pagination, and more!",
        title="Feature Overview",
        border_style="blue"
    ))

    await asyncio.sleep(2)

    # Demo 1: Basic transpilation demonstration
    console.print("\n[bold yellow]Demo 1: Understanding C4A Script Transpilation[/bold yellow]\n")

    simple_script = """# Handle cookie banner and scroll
WAIT `body` 2
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
SCROLL DOWN 500
WAIT 1"""

    console.print("[cyan]C4A Script:[/cyan]")
    syntax = Syntax(simple_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, border_style="cyan"))

    # Compile it
    from crawl4ai import c4a_compile

    console.print("\n[cyan]Transpiling to JavaScript...[/cyan]")
    result = c4a_compile(simple_script)

    if result.success:
        console.print("[green]✅ Compilation successful![/green]\n")
        console.print("[cyan]Generated JavaScript:[/cyan]")

        js_display = "\n".join(result.js_code)
        js_syntax = Syntax(js_display, "javascript", theme="monokai", line_numbers=True)
        console.print(Panel(js_syntax, border_style="green"))

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 2...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 2: Error handling showcase
    console.print("\n[bold yellow]Demo 2: Smart Error Detection & Suggestions[/bold yellow]\n")

    # Script with intentional errors
    error_script = """WAIT body 2
CLICK button.submit
IF (EXISTS .modal) CLICK .close"""

    console.print("[cyan]C4A Script with errors:[/cyan]")
    syntax = Syntax(error_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, border_style="red"))

    console.print("\n[cyan]Compiling...[/cyan]")
    result = c4a_compile(error_script)

    if not result.success:
        console.print("[red]❌ Compilation failed (as expected)[/red]\n")

        # Show the first error
        error = result.first_error
        console.print(f"[bold red]Error at line {error.line}, column {error.column}:[/bold red]")
        console.print(f"[yellow]{error.message}[/yellow]")
        console.print(f"\nProblematic code: [red]{error.source_line}[/red]")
        console.print(" " * (16 + error.column) + "[red]^[/red]")

        if error.suggestions:
            console.print("\n[green]💡 Suggestions:[/green]")
            for suggestion in error.suggestions:
                console.print(f" • {suggestion.message}")

    # Show the fixed version
    fixed_script = """WAIT `body` 2
CLICK `button.submit`
IF (EXISTS `.modal`) THEN CLICK `.close`"""
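
    # The fix: selectors are quoted with backticks and the IF gets an explicit THEN,
    # addressing the two problems the compiler flagged above.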

    console.print("\n[cyan]Fixed C4A Script:[/cyan]")
    syntax = Syntax(fixed_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, border_style="green"))

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 3...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 3: Real-world example - E-commerce automation
    console.print("\n[bold yellow]Demo 3: Real-World E-commerce Automation[/bold yellow]\n")

    console.print("[cyan]Scenario:[/cyan] Automate product search with smart handling\n")

    ecommerce_script = """# E-commerce Product Search Automation
# Define reusable procedures
PROC handle_popups
  # Close cookie banner if present
  IF (EXISTS `.cookie-notice`) THEN CLICK `.cookie-accept`

  # Close newsletter popup if it appears
  IF (EXISTS `#newsletter-modal`) THEN CLICK `.modal-close`
ENDPROC

PROC search_product
  # Click search box and type query
  CLICK `.search-input`
  TYPE "wireless headphones"
  PRESS Enter

  # Wait for results
  WAIT `.product-grid` 10
ENDPROC

# Main automation flow
SET max_products = 50

# Step 1: Navigate and handle popups
GO https://shop.example.com
WAIT `body` 3
handle_popups

# Step 2: Perform search
search_product

# Step 3: Load more products (infinite scroll)
REPEAT (SCROLL DOWN 1000, `document.querySelectorAll('.product-card').length < 50`)

# Step 4: Apply filters
IF (EXISTS `.filter-price`) THEN CLICK `input[data-filter="under-100"]`
WAIT 2

# Step 5: Extract product count
EVAL `console.log('Found ' + document.querySelectorAll('.product-card').length + ' products')`"""

    syntax = Syntax(ecommerce_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, title="E-commerce Automation Script", border_style="cyan"))

    # Compile and show results
    console.print("\n[cyan]Compiling automation script...[/cyan]")
    result = c4a_compile(ecommerce_script)

    if result.success:
        console.print(f"[green]✅ Successfully compiled to {len(result.js_code)} JavaScript statements![/green]")
        console.print("\n[bold]Script Analysis:[/bold]")
        console.print(f"• Procedures defined: {len(result.metadata.get('procedures', []))}")
        console.print(f"• Variables used: {len(result.metadata.get('variables', []))}")
        console.print(f"• Total commands: {result.metadata.get('total_commands', 0)}")

    if not auto_mode:
        console.print("\n[dim]Press Enter to continue to Demo 4...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 4: Integration with Crawl4AI - LIVE DEMO
    console.print("\n[bold yellow]Demo 4: Live Integration with Crawl4AI[/bold yellow]\n")

    console.print("[cyan]Let's see C4A Script in action with real web crawling![/cyan]\n")

    # Create a simple C4A script for demo
    live_script = """# Handle common website patterns
WAIT `body` 2
# Close cookie banner if exists
IF (EXISTS `.cookie-banner, .cookie-notice, #cookie-consent`) THEN CLICK `.accept, .agree, button[aria-label*="accept"]`
# Scroll to load content
SCROLL DOWN 500
WAIT 1"""

    console.print("[bold]Our C4A Script:[/bold]")
    syntax = Syntax(live_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, border_style="cyan"))

    # Method 1: Direct C4A Script usage
    console.print("\n[bold cyan]Method 1: Direct C4A Script Integration[/bold cyan]\n")

    try:
        # Import necessary components
        from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

        # Define extraction schema
        schema = {
            "name": "page_content",
            "selector": "body",
            "fields": {
                "title": {"selector": "h1, title", "type": "text"},
                "paragraphs": {"selector": "p", "type": "list", "fields": {"text": {"type": "text"}}},
                "links": {"selector": "a[href]", "type": "list", "fields": {"text": {"type": "text"}, "href": {"type": "attribute", "attribute": "href"}}}
            }
        }

        # Create config with C4A script
        config = CrawlerRunConfig(
            c4a_script=live_script,
            extraction_strategy=JsonCssExtractionStrategy(schema),
            only_text=True,
            cache_mode=CacheMode.BYPASS
        )

        console.print("[green]✅ Config created with C4A script![/green]")
        console.print(f"[dim]The C4A script will be automatically transpiled when crawling[/dim]\n")

        # Show the actual code
        code_example1 = f'''# Live code that's actually running:
config = CrawlerRunConfig(
    c4a_script="""{live_script}""",
    extraction_strategy=JsonCssExtractionStrategy(schema),
    only_text=True,
    cache_mode=CacheMode.BYPASS
)

# This would run the crawler:
# async with AsyncWebCrawler() as crawler:
#     result = await crawler.arun("https://example.com", config=config)
#     print(f"Extracted {{len(result.extracted_content)}} items")'''

        syntax = Syntax(code_example1, "python", theme="monokai", line_numbers=True)
        console.print(Panel(syntax, title="Method 1: Direct Integration (Live Code)", border_style="green"))

    except Exception as e:
        console.print(f"[red]Error in demo: {e}[/red]")

    if not auto_mode:
        console.print("\n[dim]Press Enter to see Method 2...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Method 2: Pre-compilation approach
    console.print("\n[bold cyan]Method 2: Pre-compile and Reuse[/bold cyan]\n")

    # Advanced script with procedures
    advanced_script = """# E-commerce automation with procedures
PROC handle_popups
  IF (EXISTS `.popup-overlay`) THEN CLICK `.popup-close`
  IF (EXISTS `#newsletter-modal`) THEN CLICK `.modal-dismiss`
ENDPROC

PROC load_all_products
  # Keep scrolling until no more products load
  REPEAT (SCROLL DOWN 1000, `document.querySelectorAll('.product').length < window.lastProductCount`)
  EVAL `window.lastProductCount = document.querySelectorAll('.product').length`
ENDPROC

# Main flow
WAIT `.products-container` 5
handle_popups
EVAL `window.lastProductCount = 0`
load_all_products"""

    console.print("[bold]Advanced C4A Script with Procedures:[/bold]")
    syntax = Syntax(advanced_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, border_style="cyan"))

    # Actually compile it
    console.print("\n[cyan]Compiling the script...[/cyan]")
    compilation_result = c4a_compile(advanced_script)

    if compilation_result.success:
        console.print(f"[green]✅ Successfully compiled to {len(compilation_result.js_code)} JavaScript statements![/green]\n")

        # Show first few JS statements
        console.print("[bold]Generated JavaScript (first 5 statements):[/bold]")
        js_preview = "\n".join(compilation_result.js_code[:5])
        if len(compilation_result.js_code) > 5:
            js_preview += f"\n... and {len(compilation_result.js_code) - 5} more statements"

        js_syntax = Syntax(js_preview, "javascript", theme="monokai", line_numbers=True)
        console.print(Panel(js_syntax, border_style="green"))

        # Create actual config with compiled code
        config_with_js = CrawlerRunConfig(
            js_code=compilation_result.js_code,
            wait_for="css:.products-container",
            cache_mode=CacheMode.BYPASS
        )

        console.print("\n[green]✅ Config created with pre-compiled JavaScript![/green]")

        # Show the actual implementation
        code_example2 = f'''# Live code showing pre-compilation:
# Step 1: Compile once
result = c4a_compile(advanced_script)
if result.success:
    js_code = result.js_code  # {len(compilation_result.js_code)} statements generated

    # Step 2: Use compiled code multiple times
    config = CrawlerRunConfig(
        js_code=js_code,
        wait_for="css:.products-container",
        cache_mode=CacheMode.BYPASS
    )

    # Step 3: Run crawler with compiled code
    # async with AsyncWebCrawler() as crawler:
    #     # Can reuse js_code for multiple URLs
    #     for url in ["shop1.com", "shop2.com"]:
    #         result = await crawler.arun(url, config=config)
else:
    print(f"Compilation error: {{result.first_error.message}}")'''

        syntax = Syntax(code_example2, "python", theme="monokai", line_numbers=True)
        console.print(Panel(syntax, title="Method 2: Pre-compilation (Live Code)", border_style="green"))

    else:
        console.print(f"[red]Compilation failed: {compilation_result.first_error.message}[/red]")

    if not auto_mode:
        console.print("\n[dim]Press Enter to see a real-world example...[/dim]")
        input()
    else:
        await asyncio.sleep(1)

    # Demo 5: Real-world example with actual crawling
    console.print("\n[bold yellow]Demo 5: Real-World Example - News Site Automation[/bold yellow]\n")

    news_script = """# News site content extraction
# Wait for main content
WAIT `article, .article-content, main` 5

# Handle common annoyances
IF (EXISTS `.cookie-notice`) THEN CLICK `button[class*="accept"]`
IF (EXISTS `.newsletter-popup`) THEN CLICK `.close, .dismiss`

# Expand "Read More" sections
IF (EXISTS `.read-more-button`) THEN CLICK `.read-more-button`

# Load comments if available
IF (EXISTS `.load-comments`) THEN CLICK `.load-comments`
WAIT 2"""

    console.print("[bold]News Site Automation Script:[/bold]")
    syntax = Syntax(news_script, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, border_style="cyan"))

    # Create and show actual working config
    console.print("\n[cyan]Creating crawler configuration...[/cyan]")

    news_config = CrawlerRunConfig(
        c4a_script=news_script,
        wait_for="css:article",
        only_text=True,
        cache_mode=CacheMode.BYPASS
    )

    console.print("[green]✅ Configuration ready for crawling![/green]\n")

    # Show how to actually use it
    usage_example = '''# Complete working example:
async def crawl_news_site():
    """Crawl a news site with C4A automation"""

    async with AsyncWebCrawler(verbose=False) as crawler:
        result = await crawler.arun(
            url="https://example-news.com/article",
            config=CrawlerRunConfig(
                c4a_script=news_script,
                wait_for="css:article",
                only_text=True
            )
        )

        if result.success:
            print(f"✓ Crawled: {result.url}")
            print(f"✓ Content length: {len(result.markdown.raw_markdown)} chars")
            print(f"✓ Links found: {len(result.links.get('internal', []))} internal")

            # The C4A script ensured we:
            # - Handled cookie banners
            # - Expanded collapsed content
            # - Loaded dynamic comments
            # All automatically!

        return result

# Run it:
# result = await crawl_news_site()'''

    syntax = Syntax(usage_example, "python", theme="monokai", line_numbers=True)
    console.print(Panel(syntax, title="Complete Working Example", border_style="green"))

    # Summary
    console.print("\n[bold green]✨ What We Demonstrated:[/bold green]")
    console.print("• C4A Script transpiles to optimized JavaScript automatically")
    console.print("• Direct integration via `c4a_script` parameter - easiest approach")
    console.print("• Pre-compilation via `c4a_compile()` - best for reuse")
    console.print("• Real configs that you can copy and use immediately")
    console.print("• Actual code running, not just examples!\n")


async def interactive_menu():
    """Interactive menu to select demos"""
    from rich.prompt import Prompt

    async def llm_context_placeholder(auto_mode=False):
        # Placeholder coroutine so the menu entry is awaitable like the other demos
        console.print("[yellow]LLM Context demo coming soon![/yellow]")

    demos = {
        "1": ("Link Preview & Scoring", link_preview_demo),
        "2": ("Adaptive Crawling", adaptive_crawling_demo),
        "3": ("Virtual Scroll", virtual_scroll_demo),
        "4": ("URL Seeder", url_seeder_demo),
        "5": ("C4A Script", c4a_script_demo),
        "6": ("LLM Context Builder", llm_context_placeholder),
        "7": ("Run All Demos", None),  # Special case
        "0": ("Exit", None)
    }

    while True:
        # Clear screen for better presentation
        console.clear()

        print_banner(
            "🚀 CRAWL4AI v0.7.0 SHOWCASE",
            "Interactive Demo Menu"
        )

        console.print("\n[bold cyan]Select a demo to run:[/bold cyan]\n")

        for key, (name, _) in demos.items():
            if key == "0":
                console.print(f"\n[dim]{key}. {name}[/dim]")
            else:
                console.print(f"{key}. {name}")

        choice = Prompt.ask("\n[bold]Enter your choice[/bold]", choices=list(demos.keys()))

        if choice == "0":
            console.print("\n[yellow]Thanks for exploring Crawl4AI v0.7.0![/yellow]")
            break
        elif choice == "7":
            # Run all demos
            console.clear()
            for key in ["1", "3", "4", "5"]:  # Link Preview, Virtual Scroll, URL Seeder, C4A Script
                name, demo_func = demos[key]
                if demo_func:
                    await demo_func(auto_mode=True)
                    console.print("\n[dim]Press Enter to continue...[/dim]")
                    input()
        else:
            name, demo_func = demos[choice]
            if demo_func:
                console.clear()
                await demo_func(auto_mode=False)
                console.print("\n[dim]Press Enter to return to menu...[/dim]")
                input()


async def main():
    """Run all feature demonstrations"""
    import sys

    # Check command line arguments
    interactive_mode = "--interactive" in sys.argv or "-i" in sys.argv
    auto_mode = "--auto" in sys.argv

    if interactive_mode:
        await interactive_menu()
    elif auto_mode:
        console.print("[yellow]Running in AUTO MODE - skipping user prompts[/yellow]\n")

        # Run demos automatically
        await link_preview_demo(auto_mode=True)
        await asyncio.sleep(2)
        # await adaptive_crawling_demo(auto_mode=True)  # Skip for now
        await virtual_scroll_demo(auto_mode=True)
        await asyncio.sleep(2)
        await url_seeder_demo(auto_mode=True)
        await asyncio.sleep(2)
        await c4a_script_demo(auto_mode=True)
    else:
        # Default: run all demos with prompts
        try:
            # 1. Link Preview Demo
            await link_preview_demo(auto_mode=False)

            console.print("\n[dim]Press Enter to continue to Virtual Scroll demo...[/dim]")
            input()

            # 2. Virtual Scroll Demo
            await virtual_scroll_demo(auto_mode=False)

            console.print("\n[dim]Press Enter to continue to URL Seeder demo...[/dim]")
            input()

            # 3. URL Seeder Demo
            await url_seeder_demo(auto_mode=False)

            console.print("\n[dim]Press Enter to continue to C4A Script demo...[/dim]")
            input()

            # 4. C4A Script Demo
            await c4a_script_demo(auto_mode=False)

            # TODO: Add other demos here
            # await llm_context_demo()

            console.print("\n[bold green]✨ All demos completed![/bold green]")
            console.print("\nTo explore individual demos, run: [cyan]python crawl4ai_v0_7_0_showcase.py --interactive[/cyan]")

        except KeyboardInterrupt:
            console.print("\n[yellow]Demo interrupted by user[/yellow]")
        except Exception as e:
            console.print(f"\n[red]Error: {str(e)}[/red]")
            import traceback
            traceback.print_exc()


if __name__ == "__main__":
    import sys

    # Show usage if --help is provided
    if "--help" in sys.argv or "-h" in sys.argv:
        console.print("\n[bold]Crawl4AI v0.7.0 Feature Showcase[/bold]\n")
        console.print("Usage: python crawl4ai_v0_7_0_showcase.py [options]\n")
        console.print("Options:")
        console.print("  --interactive, -i   Interactive menu to select demos")
        console.print("  --auto              Run all demos without user prompts")
        console.print("  --help, -h          Show this help message\n")
        console.print("Default: Run all demos with prompts between each\n")
    else:
        asyncio.run(main())