docs(linkdin, url_seeder): update and reorganize LinkedIn data discovery and URL seeder documentation
This commit introduces significant updates to the LinkedIn data discovery documentation by adding two new Jupyter notebooks that provide detailed insights into data discovery processes. The previous workshop notebook has been removed to streamline the content and avoid redundancy. Additionally, the URL seeder documentation has been expanded with a new tutorial and several enhancements to existing scripts, improving usability and clarity. The changes include: - Added `Crawl4ai_Linkedin_Data_Discovery_Part_1.ipynb` and `Crawl4ai_Linkedin_Data_Discovery_Part_2.ipynb` for comprehensive LinkedIn data discovery. - Removed the previous workshop notebook to eliminate outdated content. - Updated the graph visualization settings (drag/zoom/mouse-wheel options) to reflect new data visualization requirements. - Introduced `Crawl4AI_URL_Seeder_Tutorial.ipynb` and `tutorial_url_seeder.md` (with `convert_tutorial_to_colab.py`) to facilitate easier access to URL seeding techniques. - Enhanced existing Python scripts and markdown files in the URL seeder section for better documentation and examples. These changes aim to improve the overall documentation quality and user experience for developers working with LinkedIn data and URL seeding techniques.
This commit is contained in:
1323
docs/apps/linkdin/Crawl4ai_Linkedin_Data_Discovery_Part_1.ipynb
Normal file
1323
docs/apps/linkdin/Crawl4ai_Linkedin_Data_Discovery_Part_1.ipynb
Normal file
File diff suppressed because one or more lines are too long
5859
docs/apps/linkdin/Crawl4ai_Linkedin_Data_Discovery_Part_2.ipynb
Normal file
5859
docs/apps/linkdin/Crawl4ai_Linkedin_Data_Discovery_Part_2.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
@@ -447,10 +447,7 @@
|
||||
dragNodes: true,
|
||||
dragView: true,
|
||||
zoomView: true,
|
||||
mouseWheel: {
|
||||
speed: 0.15, // Reduced from default 1.0
|
||||
smooth: true // Enable smooth zooming
|
||||
}
|
||||
zoomSpeed: 0.15 // Reduced from default 1.0
|
||||
},
|
||||
nodes: {
|
||||
font: {
|
||||
|
||||
1171
docs/examples/url_seeder/Crawl4AI_URL_Seeder_Tutorial.ipynb
Normal file
1171
docs/examples/url_seeder/Crawl4AI_URL_Seeder_Tutorial.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
@@ -23,6 +23,8 @@ Requirements:
|
||||
Usage:
|
||||
- Run normally: python bbc_sport_research_assistant.py
|
||||
- Run test mode: python bbc_sport_research_assistant.py test
|
||||
|
||||
Note: AsyncUrlSeeder now uses context manager for automatic cleanup.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
@@ -269,44 +271,43 @@ async def discover_urls(domain: str, query: str, config: ResearchConfig) -> List
|
||||
|
||||
console.print(f"\n[cyan]🔍 Discovering URLs from {domain}...[/cyan]")
|
||||
|
||||
# Initialize URL seeder
|
||||
seeder = AsyncUrlSeeder(logger=AsyncLogger(verbose=config.verbose))
|
||||
|
||||
# Configure seeding
|
||||
seeding_config = SeedingConfig(
|
||||
source="sitemap+cc", # Use both sitemap and Common Crawl
|
||||
extract_head=config.extract_head_metadata,
|
||||
query=query,
|
||||
scoring_method=config.scoring_method,
|
||||
score_threshold=config.score_threshold,
|
||||
max_urls=config.max_urls_discovery,
|
||||
live_check=config.live_check,
|
||||
force=config.force_refresh
|
||||
)
|
||||
|
||||
try:
|
||||
# Discover URLs
|
||||
urls = await seeder.urls(domain, seeding_config)
|
||||
|
||||
# Sort by relevance score (descending)
|
||||
sorted_urls = sorted(
|
||||
urls,
|
||||
key=lambda x: x.get('relevance_score', 0),
|
||||
reverse=True
|
||||
# Initialize URL seeder with context manager
|
||||
async with AsyncUrlSeeder(logger=AsyncLogger(verbose=config.verbose)) as seeder:
|
||||
# Configure seeding
|
||||
seeding_config = SeedingConfig(
|
||||
source="sitemap+cc", # Use both sitemap and Common Crawl
|
||||
extract_head=config.extract_head_metadata,
|
||||
query=query,
|
||||
scoring_method=config.scoring_method,
|
||||
score_threshold=config.score_threshold,
|
||||
max_urls=config.max_urls_discovery,
|
||||
live_check=config.live_check,
|
||||
force=config.force_refresh
|
||||
)
|
||||
|
||||
# Take top K
|
||||
top_urls = sorted_urls[:config.top_k_urls]
|
||||
|
||||
console.print(f"[green]✅ Discovered {len(urls)} URLs, selected top {len(top_urls)}[/green]")
|
||||
|
||||
# Cache the result
|
||||
save_to_cache(cache_key, top_urls)
|
||||
return top_urls
|
||||
|
||||
except Exception as e:
|
||||
console.print(f"[red]❌ URL discovery failed: {e}[/red]")
|
||||
return []
|
||||
try:
|
||||
# Discover URLs
|
||||
urls = await seeder.urls(domain, seeding_config)
|
||||
|
||||
# Sort by relevance score (descending)
|
||||
sorted_urls = sorted(
|
||||
urls,
|
||||
key=lambda x: x.get('relevance_score', 0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
# Take top K
|
||||
top_urls = sorted_urls[:config.top_k_urls]
|
||||
|
||||
console.print(f"[green]✅ Discovered {len(urls)} URLs, selected top {len(top_urls)}[/green]")
|
||||
|
||||
# Cache the result
|
||||
save_to_cache(cache_key, top_urls)
|
||||
return top_urls
|
||||
|
||||
except Exception as e:
|
||||
console.print(f"[red]❌ URL discovery failed: {e}[/red]")
|
||||
return []
|
||||
|
||||
|
||||
async def crawl_selected_urls(urls: List[str], query: str, config: ResearchConfig) -> List[Dict]:
|
||||
|
||||
155
docs/examples/url_seeder/convert_tutorial_to_colab.py
Normal file
155
docs/examples/url_seeder/convert_tutorial_to_colab.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert Crawl4AI URL Seeder tutorial markdown to Colab notebook format
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _make_cell(cell_type, content):
    """Build a single nbformat-4 cell dict for *cell_type* ('markdown' or 'code')."""
    if cell_type == 'code':
        return {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": content.split('\n')
        }
    return {
        "cell_type": "markdown",
        "metadata": {},
        "source": content.split('\n')
    }


def parse_markdown_to_cells(markdown_content):
    """Parse annotated markdown content and convert it to notebook cells.

    The markdown uses marker lines of the form ``# cell N type:markdown`` or
    ``# cell N type:code`` to delimit cells.  Everything before the first
    marker becomes a leading markdown cell.  Cells that strip to empty text
    are dropped.

    Args:
        markdown_content: Full text of the annotated tutorial markdown.

    Returns:
        list[dict]: Cells in Jupyter nbformat-4 structure, in document order.
    """
    cells = []
    lines = markdown_content.split('\n')

    # Extract the header content before the first cell marker
    header_lines = []
    i = 0
    while i < len(lines) and not lines[i].startswith('# cell'):
        header_lines.append(lines[i])
        i += 1

    # Add header as markdown cell if it has any content
    header_content = '\n'.join(header_lines).strip()
    if header_content:
        cells.append(_make_cell('markdown', header_content))

    # Process cells marked with "# cell X type:Y"
    # Compile once instead of re-matching the pattern on every line.
    cell_marker = re.compile(r'^# cell (\d+) type:(markdown|code)$')
    current_cell_content = []
    current_cell_type = None

    def _flush():
        """Append the accumulated cell to `cells` if it is non-empty."""
        if current_cell_type:
            content = '\n'.join(current_cell_content).strip()
            if content:
                cells.append(_make_cell(current_cell_type, content))

    while i < len(lines):
        line = lines[i]
        cell_match = cell_marker.match(line)

        if cell_match:
            # Close the previous cell before starting a new one
            _flush()
            current_cell_type = cell_match.group(2)
            current_cell_content = []
        else:
            # Accumulate line into the current cell
            current_cell_content.append(line)

        i += 1

    # Don't lose the final cell after the last marker
    _flush()

    return cells
|
||||
|
||||
|
||||
def create_colab_notebook(cells):
    """Wrap *cells* in a Colab-compatible nbformat-4 notebook structure.

    Args:
        cells: List of cell dicts produced by ``parse_markdown_to_cells``.

    Returns:
        dict: A complete notebook document ready to be serialized as JSON.
    """
    # Colab-specific display settings for the generated notebook
    colab_metadata = {
        "name": "Crawl4AI_URL_Seeder_Tutorial.ipynb",
        "provenance": [],
        "collapsed_sections": [],
        "toc_visible": True
    }
    kernel_spec = {
        "name": "python3",
        "display_name": "Python 3"
    }
    return {
        "nbformat": 4,
        "nbformat_minor": 0,
        "metadata": {
            "colab": colab_metadata,
            "kernelspec": kernel_spec,
            "language_info": {"name": "python"}
        },
        "cells": cells
    }
|
||||
|
||||
|
||||
def main(md_path="tutorial_url_seeder.md",
         output_path="Crawl4AI_URL_Seeder_Tutorial.ipynb"):
    """Convert the annotated tutorial markdown into a Colab notebook.

    Paths are now parameters (with the original values as defaults) so the
    converter can be reused for other tutorials; calling ``main()`` with no
    arguments behaves exactly as before.

    Args:
        md_path: Input markdown file annotated with ``# cell`` markers.
        output_path: Destination ``.ipynb`` file.
    """
    md_path = Path(md_path)

    if not md_path.exists():
        print(f"Error: {md_path} not found!")
        return

    print(f"Reading {md_path}...")
    markdown_content = md_path.read_text(encoding='utf-8')

    # Parse markdown into notebook cells
    print("Parsing markdown content...")
    cells = parse_markdown_to_cells(markdown_content)
    print(f"Created {len(cells)} cells")

    # Assemble the Colab notebook structure
    print("Creating Colab notebook...")
    notebook = create_colab_notebook(cells)

    # Save notebook as pretty-printed JSON; ensure_ascii=False keeps
    # emoji/unicode in the tutorial text readable in the .ipynb file.
    output_path = Path(output_path)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(notebook, f, indent=2, ensure_ascii=False)

    print(f"✅ Successfully created {output_path}")
    print(f" - Total cells: {len(cells)}")
    print(f" - Markdown cells: {sum(1 for c in cells if c['cell_type'] == 'markdown')}")
    print(f" - Code cells: {sum(1 for c in cells if c['cell_type'] == 'code')}")


if __name__ == "__main__":
    main()
|
||||
991
docs/examples/url_seeder/tutorial_url_seeder.md
Normal file
991
docs/examples/url_seeder/tutorial_url_seeder.md
Normal file
@@ -0,0 +1,991 @@
|
||||
# 🔬 Building an AI Research Assistant with Crawl4AI: Smart URL Discovery
|
||||
|
||||
## Welcome to the Research Pipeline Workshop!
|
||||
|
||||
In this tutorial, we'll build an **AI-powered research assistant** that intelligently discovers, filters, and analyzes web content. Instead of blindly crawling hundreds of pages, we'll use Crawl4AI's URL Seeder to:
|
||||
|
||||
- 🔍 **Discover all available URLs** without crawling them first
|
||||
- 🎯 **Score and rank** them by relevance using AI
|
||||
- 🕷️ **Crawl only the most relevant** content
|
||||
- 🤖 **Generate research insights** with proper citations
|
||||
|
||||
By the end, you'll have a complete research pipeline that can analyze any topic across multiple websites efficiently.
|
||||
|
||||
## What You'll Build
|
||||
|
||||
A **smart research assistant** that:
|
||||
1. Takes any research query (e.g., "Premier League transfer news")
|
||||
2. Discovers relevant articles from news sites
|
||||
3. Ranks them by relevance using BM25 scoring
|
||||
4. Crawls only the top-ranked articles
|
||||
5. Synthesizes findings into a comprehensive report
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Python 3.8+ environment
|
||||
- Basic understanding of async Python
|
||||
- API keys for LLM (Gemini or OpenAI recommended)
|
||||
|
||||
## Pipeline Overview
|
||||
|
||||
```
|
||||
User Query → Query Enhancement → URL Discovery → Relevance Scoring → Smart Crawling → AI Synthesis → Research Report
|
||||
```
|
||||
|
||||
Each step builds on the previous one, creating an efficient research system that saves time and resources.
|
||||
|
||||
Let's begin! 🚀
|
||||
|
||||
---
|
||||
|
||||
# cell 1 type:markdown
|
||||
## Step 0: Environment Setup and Dependencies
|
||||
|
||||
First, we'll set up our environment with all necessary libraries. We need Crawl4AI for intelligent web crawling, LiteLLM for AI integration, and Rich for beautiful terminal output. This foundation ensures our research assistant has all the tools it needs.
|
||||
|
||||
# cell 2 type:code
|
||||
# Install required packages
|
||||
!pip install -q crawl4ai litellm rich
|
||||
|
||||
# cell 3 type:code
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
from dataclasses import dataclass, asdict
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Rich for beautiful console output
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.progress import Progress, SpinnerColumn, TextColumn
|
||||
|
||||
# Crawl4AI imports for intelligent crawling
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
AsyncUrlSeeder,
|
||||
SeedingConfig,
|
||||
AsyncLogger
|
||||
)
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
# LiteLLM for AI capabilities
|
||||
import litellm
|
||||
|
||||
# Initialize Rich console for pretty output
|
||||
console = Console()
|
||||
|
||||
print("✅ Environment ready! All dependencies loaded successfully.")
|
||||
|
||||
# cell 4 type:markdown
|
||||
## Step 1: Configuration and Data Classes
|
||||
|
||||
Here we define our research pipeline configuration. These dataclasses act as our control center, allowing us to fine-tune every aspect of the research process. Think of them as the settings panel for your research assistant - from discovery limits to AI model choices.
|
||||
|
||||
# cell 5 type:code
|
||||
@dataclass
|
||||
class ResearchConfig:
|
||||
"""Configuration for the research pipeline
|
||||
|
||||
This class controls every aspect of our research assistant:
|
||||
- How many URLs to discover and crawl
|
||||
- Which scoring methods to use
|
||||
- Whether to use AI enhancement
|
||||
- Output preferences
|
||||
"""
|
||||
# Core settings
|
||||
domain: str = "www.bbc.com/sport"
|
||||
max_urls_discovery: int = 100 # Cast a wide net initially
|
||||
max_urls_to_crawl: int = 10 # But only crawl the best
|
||||
top_k_urls: int = 10 # Focus on top results
|
||||
|
||||
# Scoring and filtering
|
||||
score_threshold: float = 0.3 # Minimum relevance score
|
||||
scoring_method: str = "bm25" # BM25 is great for relevance
|
||||
|
||||
# AI and processing
|
||||
use_llm_enhancement: bool = True # Enhance queries with AI
|
||||
llm_model: str = "gemini/gemini-1.5-flash" # Fast and capable
|
||||
|
||||
# URL discovery options
|
||||
extract_head_metadata: bool = True # Get titles, descriptions
|
||||
live_check: bool = False # Verify URLs are accessible
|
||||
force_refresh: bool = False # Bypass cache
|
||||
|
||||
# Crawler settings
|
||||
max_concurrent_crawls: int = 5 # Parallel crawling
|
||||
timeout: int = 30000 # 30 second timeout
|
||||
headless: bool = True # No browser window
|
||||
|
||||
# Output settings
|
||||
output_dir: Path = Path("research_results")
|
||||
verbose: bool = True
|
||||
|
||||
@dataclass
|
||||
class ResearchQuery:
|
||||
"""Container for research query and metadata"""
|
||||
original_query: str
|
||||
enhanced_query: Optional[str] = None
|
||||
search_patterns: List[str] = None
|
||||
timestamp: str = None
|
||||
|
||||
@dataclass
|
||||
class ResearchResult:
|
||||
"""Container for research results"""
|
||||
query: ResearchQuery
|
||||
discovered_urls: List[Dict]
|
||||
crawled_content: List[Dict]
|
||||
synthesis: str
|
||||
citations: List[Dict]
|
||||
metadata: Dict
|
||||
|
||||
# Create default configuration
|
||||
config = ResearchConfig()
|
||||
console.print(Panel(
|
||||
f"[bold cyan]Research Configuration[/bold cyan]\n\n"
|
||||
f"🌐 Domain: {config.domain}\n"
|
||||
f"🔍 Max Discovery: {config.max_urls_discovery} URLs\n"
|
||||
f"🕷️ Max Crawl: {config.max_urls_to_crawl} pages\n"
|
||||
f"🤖 AI Model: {config.llm_model}",
|
||||
title="⚙️ Settings"
|
||||
))
|
||||
|
||||
# cell 6 type:markdown
|
||||
## Step 2: Query Enhancement with AI
|
||||
|
||||
Not all search queries are created equal. Here we use AI to transform simple queries into comprehensive search strategies. The LLM analyzes your query, extracts key concepts, and generates related terms - turning "football news" into a rich set of search patterns.
|
||||
|
||||
# cell 7 type:code
|
||||
async def enhance_query_with_llm(query: str, config: ResearchConfig) -> ResearchQuery:
|
||||
"""
|
||||
Transform simple queries into comprehensive search strategies
|
||||
|
||||
Why enhance queries?
|
||||
- Users often use simple terms ("football news")
|
||||
- But relevant content might use varied terminology
|
||||
- AI helps capture all relevant variations
|
||||
"""
|
||||
console.print(f"\n[cyan]🤖 Enhancing query: '{query}'...[/cyan]")
|
||||
|
||||
try:
|
||||
# Ask AI to analyze and expand the query
|
||||
response = await litellm.acompletion(
|
||||
model=config.llm_model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": f"""Given this research query: "{query}"
|
||||
|
||||
Extract:
|
||||
1. Key terms and concepts (as a list)
|
||||
2. Related search terms
|
||||
3. A more specific/enhanced version of the query
|
||||
|
||||
Return as JSON:
|
||||
{{
|
||||
"key_terms": ["term1", "term2"],
|
||||
"related_terms": ["related1", "related2"],
|
||||
"enhanced_query": "enhanced version of query"
|
||||
}}"""
|
||||
}],
|
||||
temperature=0.3, # Low temperature for consistency
|
||||
response_format={"type": "json_object"}
|
||||
)
|
||||
|
||||
data = json.loads(response.choices[0].message.content)
|
||||
|
||||
# Create search patterns from extracted terms
|
||||
# These patterns help the URL seeder find relevant pages
|
||||
all_terms = data["key_terms"] + data["related_terms"]
|
||||
patterns = [f"*{term.lower()}*" for term in all_terms]
|
||||
|
||||
result = ResearchQuery(
|
||||
original_query=query,
|
||||
enhanced_query=data["enhanced_query"],
|
||||
search_patterns=patterns[:10], # Limit to 10 patterns
|
||||
timestamp=datetime.now().isoformat()
|
||||
)
|
||||
|
||||
# Show the enhancement
|
||||
console.print(Panel(
|
||||
f"[green]✅ Enhanced Query:[/green] {result.enhanced_query}\n"
|
||||
f"[dim]Key terms: {', '.join(data['key_terms'])}[/dim]",
|
||||
title="🔍 Query Enhancement"
|
||||
))
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
console.print(f"[yellow]⚠️ Enhancement failed, using original query: {e}[/yellow]")
|
||||
# Fallback to simple tokenization
|
||||
words = query.lower().split()
|
||||
patterns = [f"*{word}*" for word in words if len(word) > 2]
|
||||
|
||||
return ResearchQuery(
|
||||
original_query=query,
|
||||
enhanced_query=query,
|
||||
search_patterns=patterns,
|
||||
timestamp=datetime.now().isoformat()
|
||||
)
|
||||
|
||||
# Example usage
|
||||
test_query = "Premier League transfer news"
|
||||
enhanced = await enhance_query_with_llm(test_query, config)
|
||||
|
||||
# cell 8 type:markdown
|
||||
## Step 3: Smart URL Discovery with AsyncUrlSeeder
|
||||
|
||||
This is where the magic begins! Instead of crawling pages to find links, AsyncUrlSeeder discovers URLs from sitemaps and Common Crawl data. It's like having a map of the entire website before you start exploring. We'll discover hundreds of URLs in seconds, complete with metadata.
|
||||
|
||||
# cell 9 type:code
|
||||
async def discover_urls(
|
||||
domain: str,
|
||||
query: ResearchQuery,
|
||||
config: ResearchConfig
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Discover and rank URLs without crawling them
|
||||
|
||||
The URL Seeder is incredibly powerful because it:
|
||||
1. Gets URLs from sitemaps (official site maps)
|
||||
2. Gets URLs from Common Crawl (web-scale data)
|
||||
3. Extracts metadata without full page loads
|
||||
4. Scores relevance using BM25 algorithm
|
||||
|
||||
This means we know which pages are worth crawling
|
||||
BEFORE we spend time crawling them!
|
||||
"""
|
||||
console.print(f"\n[cyan]🔍 Discovering URLs from {domain}...[/cyan]")
|
||||
|
||||
# Use context manager for automatic cleanup
|
||||
async with AsyncUrlSeeder(logger=AsyncLogger(verbose=config.verbose)) as seeder:
|
||||
# Configure the discovery process
|
||||
seeding_config = SeedingConfig(
|
||||
# Data sources
|
||||
source="sitemap+cc", # Use both sitemap AND Common Crawl
|
||||
|
||||
# Metadata extraction
|
||||
extract_head=config.extract_head_metadata, # Get titles, descriptions
|
||||
|
||||
# Relevance scoring
|
||||
query=query.enhanced_query or query.original_query,
|
||||
scoring_method=config.scoring_method, # BM25 scoring
|
||||
score_threshold=config.score_threshold, # Minimum score
|
||||
|
||||
# Limits and performance
|
||||
max_urls=config.max_urls_discovery,
|
||||
live_check=config.live_check, # Verify URLs work
|
||||
force=config.force_refresh, # Bypass cache if needed
|
||||
|
||||
# Performance tuning
|
||||
concurrency=20, # Parallel workers
|
||||
)
|
||||
|
||||
try:
|
||||
# Discover URLs - this is FAST!
|
||||
urls = await seeder.urls(domain, seeding_config)
|
||||
|
||||
# Results are already sorted by relevance
|
||||
# thanks to BM25 scoring
|
||||
top_urls = urls[:config.top_k_urls]
|
||||
|
||||
# Show discovery results
|
||||
console.print(f"[green]✅ Discovered {len(urls)} URLs, selected top {len(top_urls)}[/green]")
|
||||
|
||||
# Display a sample of what we found
|
||||
if top_urls:
|
||||
table = Table(title="🎯 Top Discovered URLs")
|
||||
table.add_column("Score", style="cyan")
|
||||
table.add_column("Title", style="green")
|
||||
table.add_column("URL", style="dim")
|
||||
|
||||
for url in top_urls[:5]:
|
||||
score = f"{url.get('relevance_score', 0):.3f}"
|
||||
title = "N/A"
|
||||
if url.get('head_data') and url['head_data'].get('title'):
|
||||
title = url['head_data']['title'][:50] + "..."
|
||||
url_str = url['url'][:60] + "..."
|
||||
|
||||
table.add_row(score, title, url_str)
|
||||
|
||||
console.print(table)
|
||||
|
||||
return top_urls
|
||||
|
||||
except Exception as e:
|
||||
console.print(f"[red]❌ URL discovery failed: {e}[/red]")
|
||||
return []
|
||||
|
||||
# Example discovery
|
||||
discovered = await discover_urls(config.domain, enhanced, config)
|
||||
|
||||
# cell 10 type:markdown
|
||||
## Step 4: Intelligent Content Crawling
|
||||
|
||||
Now we crawl only the most relevant URLs. This is where our smart filtering pays off - instead of crawling hundreds of pages, we focus on the top 10-20 most relevant ones. We use content filtering to extract only the meaningful text, removing ads and navigation.
|
||||
|
||||
# cell 11 type:code
|
||||
async def crawl_selected_urls(
|
||||
urls: List[Dict],
|
||||
query: ResearchQuery,
|
||||
config: ResearchConfig
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Crawl only the most relevant URLs with smart content filtering
|
||||
|
||||
Key optimizations:
|
||||
1. We already know these URLs are relevant (from scoring)
|
||||
2. We crawl them in parallel for speed
|
||||
3. We extract only meaningful content (no ads/nav)
|
||||
4. We generate clean markdown for analysis
|
||||
"""
|
||||
# Extract URLs from discovery results
|
||||
url_list = [u['url'] for u in urls if 'url' in u][:config.max_urls_to_crawl]
|
||||
|
||||
if not url_list:
|
||||
console.print("[red]❌ No URLs to crawl[/red]")
|
||||
return []
|
||||
|
||||
console.print(f"\n[cyan]🕷️ Crawling {len(url_list)} URLs...[/cyan]")
|
||||
|
||||
# Configure intelligent content extraction
|
||||
# This removes ads, navigation, and other noise
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(
|
||||
threshold=0.48, # Content relevance threshold
|
||||
threshold_type="dynamic", # Adapts to page structure
|
||||
min_word_threshold=10 # Ignore tiny text blocks
|
||||
),
|
||||
)
|
||||
|
||||
# Configure the crawler
|
||||
crawler_config = CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
exclude_external_links=True, # Focus on content, not links
|
||||
excluded_tags=['nav', 'header', 'footer', 'aside'], # Skip UI elements
|
||||
)
|
||||
|
||||
# Create crawler with browser config
|
||||
async with AsyncWebCrawler(
|
||||
config=BrowserConfig(
|
||||
headless=config.headless,
|
||||
verbose=config.verbose
|
||||
)
|
||||
) as crawler:
|
||||
# Crawl URLs in parallel for speed
|
||||
# arun_many handles concurrency automatically
|
||||
results = await crawler.arun_many(
|
||||
url_list,
|
||||
config=crawler_config,
|
||||
max_concurrent=config.max_concurrent_crawls
|
||||
)
|
||||
|
||||
# Process successful results
|
||||
crawled_content = []
|
||||
for url, result in zip(url_list, results):
|
||||
if result.success:
|
||||
# Extract the content we need
|
||||
content_data = {
|
||||
'url': url,
|
||||
'title': result.metadata.get('title', 'No title'),
|
||||
'markdown': result.markdown.fit_markdown or result.markdown.raw_markdown,
|
||||
'metadata': result.metadata
|
||||
}
|
||||
crawled_content.append(content_data)
|
||||
console.print(f" [green]✓[/green] Crawled: {url[:60]}...")
|
||||
else:
|
||||
console.print(f" [red]✗[/red] Failed: {url[:50]}... - {result.error}")
|
||||
|
||||
console.print(f"[green]✅ Successfully crawled {len(crawled_content)} pages[/green]")
|
||||
return crawled_content
|
||||
|
||||
# Example crawling
|
||||
crawled = await crawl_selected_urls(discovered[:5], enhanced, config)
|
||||
|
||||
# cell 12 type:markdown
|
||||
## Step 5: AI-Powered Research Synthesis
|
||||
|
||||
This is where we transform raw content into insights. The AI analyzes all crawled articles, identifies key themes, and generates a comprehensive synthesis with proper citations. It's like having a research assistant read everything and write you a summary.
|
||||
|
||||
# cell 13 type:code
|
||||
async def generate_research_synthesis(
|
||||
query: ResearchQuery,
|
||||
crawled_content: List[Dict],
|
||||
config: ResearchConfig
|
||||
) -> Tuple[str, List[Dict]]:
|
||||
"""
|
||||
Use AI to synthesize findings from multiple sources
|
||||
|
||||
The synthesis process:
|
||||
1. Sends all content to the LLM
|
||||
2. Asks for key findings and analysis
|
||||
3. Ensures proper citation of sources
|
||||
4. Generates actionable insights
|
||||
"""
|
||||
if not crawled_content:
|
||||
return "No content available for synthesis.", []
|
||||
|
||||
console.print("\n[cyan]🤖 Generating research synthesis...[/cyan]")
|
||||
|
||||
# Prepare content for the AI
|
||||
# We include source info for proper citations
|
||||
content_sections = []
|
||||
for i, content in enumerate(crawled_content, 1):
|
||||
section = f"""
|
||||
SOURCE {i}:
|
||||
Title: {content['title']}
|
||||
URL: {content['url']}
|
||||
Content Preview:
|
||||
{content['markdown'][:1500]}...
|
||||
"""
|
||||
content_sections.append(section)
|
||||
|
||||
combined_content = "\n---\n".join(content_sections)
|
||||
|
||||
try:
|
||||
# Generate comprehensive synthesis
|
||||
response = await litellm.acompletion(
|
||||
model=config.llm_model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": f"""Research Query: "{query.original_query}"
|
||||
|
||||
Based on the following sources, provide a comprehensive research synthesis.
|
||||
|
||||
{combined_content}
|
||||
|
||||
Please provide:
|
||||
1. An executive summary (2-3 sentences)
|
||||
2. Key findings (3-5 bullet points)
|
||||
3. Detailed analysis (2-3 paragraphs)
|
||||
4. Future implications or trends
|
||||
|
||||
Format your response with clear sections and cite sources using [Source N] notation.
|
||||
Keep the total response under 800 words."""
|
||||
}],
|
||||
temperature=0.7 # Some creativity for synthesis
|
||||
)
|
||||
|
||||
synthesis = response.choices[0].message.content
|
||||
|
||||
# Extract citations from the synthesis
|
||||
citations = []
|
||||
for i, content in enumerate(crawled_content, 1):
|
||||
# Check if this source was cited
|
||||
if f"[Source {i}]" in synthesis or f"Source {i}" in synthesis:
|
||||
citations.append({
|
||||
'source_id': i,
|
||||
'title': content['title'],
|
||||
'url': content['url']
|
||||
})
|
||||
|
||||
return synthesis, citations
|
||||
|
||||
except Exception as e:
|
||||
console.print(f"[red]❌ Synthesis generation failed: {e}[/red]")
|
||||
# Fallback to simple summary
|
||||
summary = f"Research on '{query.original_query}' found {len(crawled_content)} relevant articles:\n\n"
|
||||
for content in crawled_content[:3]:
|
||||
summary += f"- {content['title']}\n {content['url']}\n\n"
|
||||
return summary, []
|
||||
|
||||
# Example synthesis
|
||||
synthesis, citations = await generate_research_synthesis(enhanced, crawled, config)
|
||||
console.print(Panel(synthesis[:500] + "...", title="📝 Research Synthesis Preview"))
|
||||
|
||||
# cell 14 type:markdown
|
||||
## Step 6: Complete Research Pipeline
|
||||
|
||||
Now let's put it all together! This orchestrator function manages the entire research pipeline from query to final report. It coordinates all the components we've built, handling errors gracefully and providing progress updates.
|
||||
|
||||
# cell 15 type:code
|
||||
async def research_pipeline(
|
||||
query: str,
|
||||
config: ResearchConfig = None
|
||||
) -> ResearchResult:
|
||||
"""
|
||||
Main research pipeline orchestrator
|
||||
|
||||
This brings together all components:
|
||||
1. Query enhancement (AI-powered)
|
||||
2. URL discovery (AsyncUrlSeeder)
|
||||
3. Smart crawling (AsyncWebCrawler)
|
||||
4. AI synthesis (LiteLLM)
|
||||
|
||||
Returns a complete research result
|
||||
"""
|
||||
if config is None:
|
||||
config = ResearchConfig()
|
||||
|
||||
start_time = datetime.now()
|
||||
|
||||
# Display pipeline header
|
||||
console.print(Panel(
|
||||
f"[bold cyan]Research Pipeline[/bold cyan]\n\n"
|
||||
f"[dim]Query:[/dim] {query}\n"
|
||||
f"[dim]Domain:[/dim] {config.domain}",
|
||||
title="🚀 Starting Research",
|
||||
border_style="cyan"
|
||||
))
|
||||
|
||||
# Step 1: Enhance query
|
||||
console.print(f"\n[bold cyan]📝 Step 1: Query Processing[/bold cyan]")
|
||||
if config.use_llm_enhancement:
|
||||
research_query = await enhance_query_with_llm(query, config)
|
||||
else:
|
||||
# Simple fallback without AI
|
||||
research_query = ResearchQuery(
|
||||
original_query=query,
|
||||
enhanced_query=query,
|
||||
search_patterns=[f"*{word}*" for word in query.lower().split()],
|
||||
timestamp=datetime.now().isoformat()
|
||||
)
|
||||
|
||||
# Step 2: Discover URLs
|
||||
console.print(f"\n[bold cyan]🔍 Step 2: URL Discovery[/bold cyan]")
|
||||
discovered_urls = await discover_urls(
|
||||
domain=config.domain,
|
||||
query=research_query,
|
||||
config=config
|
||||
)
|
||||
|
||||
if not discovered_urls:
|
||||
# No URLs found - return empty result
|
||||
return ResearchResult(
|
||||
query=research_query,
|
||||
discovered_urls=[],
|
||||
crawled_content=[],
|
||||
synthesis="No relevant URLs found for the given query.",
|
||||
citations=[],
|
||||
metadata={'duration': str(datetime.now() - start_time)}
|
||||
)
|
||||
|
||||
# Step 3: Crawl selected URLs
|
||||
console.print(f"\n[bold cyan]🕷️ Step 3: Content Crawling[/bold cyan]")
|
||||
crawled_content = await crawl_selected_urls(
|
||||
urls=discovered_urls,
|
||||
query=research_query,
|
||||
config=config
|
||||
)
|
||||
|
||||
# Step 4: Generate synthesis
|
||||
console.print(f"\n[bold cyan]🤖 Step 4: Synthesis Generation[/bold cyan]")
|
||||
synthesis, citations = await generate_research_synthesis(
|
||||
query=research_query,
|
||||
crawled_content=crawled_content,
|
||||
config=config
|
||||
)
|
||||
|
||||
# Create final result
|
||||
result = ResearchResult(
|
||||
query=research_query,
|
||||
discovered_urls=discovered_urls,
|
||||
crawled_content=crawled_content,
|
||||
synthesis=synthesis,
|
||||
citations=citations,
|
||||
metadata={
|
||||
'duration': str(datetime.now() - start_time),
|
||||
'domain': config.domain,
|
||||
'timestamp': datetime.now().isoformat(),
|
||||
'total_discovered': len(discovered_urls),
|
||||
'total_crawled': len(crawled_content),
|
||||
'total_cited': len(citations)
|
||||
}
|
||||
)
|
||||
|
||||
# Display summary
|
||||
duration = datetime.now() - start_time
|
||||
console.print(Panel(
|
||||
f"[bold green]✅ Research completed in {duration}[/bold green]\n\n"
|
||||
f"📊 Discovered: {len(discovered_urls)} URLs\n"
|
||||
f"🕷️ Crawled: {len(crawled_content)} pages\n"
|
||||
f"📚 Citations: {len(citations)} sources",
|
||||
title="🎉 Pipeline Complete",
|
||||
border_style="green"
|
||||
))
|
||||
|
||||
return result
|
||||
|
||||
# Example: Run complete pipeline
|
||||
result = await research_pipeline("Champions League latest results", config)
|
||||
|
||||
# cell 16 type:markdown
|
||||
## Step 7: Beautiful Output Formatting
|
||||
|
||||
A good research report needs clear presentation. Here we format our results into a professional report with executive summary, key findings, and proper citations. This makes the research actionable and easy to share.
|
||||
|
||||
# cell 17 type:code
|
||||
def format_research_output(result: ResearchResult) -> None:
    """
    Render a research result as a formatted console report.

    Sections: header, query information, run statistics, the AI
    synthesis, cited sources, and a table of the top 5 discovered URLs.

    Args:
        result: Completed ResearchResult produced by the research pipeline.

    Returns:
        None. All output goes to the module-level rich ``console``.
    """
    def _clip(text: str, limit: int) -> str:
        # Fix: append "..." only when the text was actually truncated.
        # The previous code appended the ellipsis unconditionally, which
        # mislabeled short titles/URLs as truncated.
        return text if len(text) <= limit else text[:limit] + "..."

    # Header
    console.print("\n" + "=" * 60)
    console.print("[bold cyan]🔬 RESEARCH REPORT[/bold cyan]")
    console.print("=" * 60)

    # Query information (show the enhanced form only if it differs)
    console.print(f"\n[bold]Query:[/bold] {result.query.original_query}")
    if result.query.enhanced_query != result.query.original_query:
        console.print(f"[dim]Enhanced: {result.query.enhanced_query}[/dim]")

    # Statistics table: two borderless columns (label, value)
    stats_table = Table(show_header=False, box=None)
    stats_table.add_column(style="cyan")
    stats_table.add_column()

    stats_table.add_row("📊 URLs Discovered", str(result.metadata['total_discovered']))
    stats_table.add_row("🕷️ Pages Crawled", str(result.metadata['total_crawled']))
    stats_table.add_row("📚 Sources Cited", str(result.metadata['total_cited']))
    stats_table.add_row("⏱️ Processing Time", result.metadata['duration'])

    console.print("\n[bold]Statistics:[/bold]")
    console.print(stats_table)

    # Synthesis
    console.print("\n[bold]📝 SYNTHESIS[/bold]")
    console.print("-" * 60)
    console.print(result.synthesis)

    # Citations (section is skipped entirely when nothing was cited)
    if result.citations:
        console.print("\n[bold]📚 SOURCES[/bold]")
        console.print("-" * 60)
        for citation in result.citations:
            console.print(f"\n[{citation['source_id']}] [cyan]{citation['title']}[/cyan]")
            console.print(f" [dim]{citation['url']}[/dim]")

    # Top discovered URLs
    console.print("\n[bold]🔍 TOP DISCOVERED URLS[/bold]")
    console.print("-" * 60)

    urls_table = Table()
    urls_table.add_column("Score", style="cyan")
    urls_table.add_column("Title")
    urls_table.add_column("URL", style="dim")

    for url_data in result.discovered_urls[:5]:
        score = f"{url_data.get('relevance_score', 0):.3f}"
        # Title may be absent when head extraction was skipped or failed.
        title = "N/A"
        if url_data.get('head_data') and url_data['head_data'].get('title'):
            title = _clip(url_data['head_data']['title'], 40)
        url = _clip(url_data['url'], 50)

        urls_table.add_row(score, title, url)

    console.print(urls_table)
|
||||
|
||||
# Display the formatted report
|
||||
format_research_output(result)
|
||||
|
||||
# cell 18 type:markdown
|
||||
## Step 8: Save Research Results
|
||||
|
||||
Finally, let's save our research for future reference. We'll create both JSON (for data analysis) and Markdown (for reading) formats. This ensures your research is preserved and shareable.
|
||||
|
||||
# cell 19 type:code
|
||||
async def save_research_results(
    result: ResearchResult,
    config: ResearchConfig
) -> Tuple[Path, Path]:
    """
    Save research results in multiple formats.

    Why save in multiple formats?
    - JSON: Perfect for further analysis or automation
    - Markdown: Human-readable, great for sharing

    Args:
        result: Completed research result to persist.
        config: Pipeline configuration; ``config.output_dir`` is created
            if it does not already exist.

    Returns:
        Tuple of (json_path, md_path) pointing at the written files.
    """
    # Create output directory
    config.output_dir.mkdir(parents=True, exist_ok=True)

    # Generate filename based on query and timestamp so runs never collide
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    query_slug = result.query.original_query[:30].replace(" ", "_").replace("/", "_")
    base_filename = f"{timestamp}_{query_slug}"

    # Save JSON. encoding='utf-8' is required: the report contains emoji
    # and arbitrary web text, which would raise UnicodeEncodeError under a
    # cp125x platform-default encoding. default=str covers datetime/Path
    # values inside the dataclass.
    json_path = config.output_dir / f"{base_filename}.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(asdict(result), f, indent=2, default=str)

    # Create markdown report
    md_content = [
        f"# Research Report: {result.query.original_query}",
        f"\n**Generated on:** {result.metadata.get('timestamp', 'N/A')}",
        f"\n**Domain:** {result.metadata.get('domain', 'N/A')}",
        f"\n**Processing time:** {result.metadata.get('duration', 'N/A')}",
        "\n---\n",
        "## Query Information",
        f"- **Original Query:** {result.query.original_query}",
        f"- **Enhanced Query:** {result.query.enhanced_query or 'N/A'}",
        "\n## Statistics",
        f"- **URLs Discovered:** {result.metadata['total_discovered']}",
        f"- **Pages Crawled:** {result.metadata['total_crawled']}",
        f"- **Sources Cited:** {result.metadata['total_cited']}",
        "\n## Research Synthesis\n",
        result.synthesis,
        "\n## Sources\n"
    ]

    # Add citations
    for citation in result.citations:
        md_content.extend([
            f"### [{citation['source_id']}] {citation['title']}",
            f"- **URL:** [{citation['url']}]({citation['url']})",
            ""
        ])

    # Add discovered URLs table
    md_content.extend([
        "\n## Discovered URLs (Top 10)\n",
        "| Score | Title | URL |",
        "|-------|-------|-----|"
    ])

    for url_data in result.discovered_urls[:10]:
        score = url_data.get('relevance_score', 0)
        title = 'N/A'
        if url_data.get('head_data') and url_data['head_data'].get('title'):
            title = url_data['head_data']['title']
            # Fix: ellipsize only when actually truncated (old code always
            # appended '...'), and escape '|' so a title cannot break the
            # markdown table layout.
            if len(title) > 50:
                title = title[:50] + '...'
            title = title.replace('|', '\\|')
        url = url_data['url']
        if len(url) > 60:
            url = url[:60] + '...'
        md_content.append(f"| {score:.3f} | {title} | {url} |")

    # Save markdown (utf-8 for the same reason as the JSON above)
    md_path = config.output_dir / f"{base_filename}.md"
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(md_content))

    console.print(f"\n[green]💾 Results saved:[/green]")
    console.print(f" JSON: {json_path}")
    console.print(f" Markdown: {md_path}")

    return json_path, md_path
|
||||
|
||||
# Save our results
|
||||
json_path, md_path = await save_research_results(result, config)
|
||||
|
||||
# cell 20 type:markdown
|
||||
## 🎯 Putting It All Together: Interactive Research Assistant
|
||||
|
||||
Now let's create an interactive version where you can research any topic! This brings together everything we've learned into a user-friendly tool.
|
||||
|
||||
# cell 21 type:code
|
||||
async def interactive_research_assistant():
    """
    Console front-end for the research pipeline.

    Guides the user through choosing a query and a crawl budget, runs
    the full pipeline, shows the formatted report, and optionally saves
    the results to disk.
    """
    # Welcome banner
    console.print(Panel.fit(
        "[bold cyan]🔬 AI Research Assistant[/bold cyan]\n\n"
        "Powered by Crawl4AI's intelligent URL discovery\n"
        "[dim]• Discover without crawling\n"
        "• Score by relevance\n"
        "• Crawl only what matters\n"
        "• Generate AI insights[/dim]",
        title="Welcome",
        border_style="cyan"
    ))

    # Canned queries the user can pick by number
    examples = [
        "Premier League transfer news and rumors",
        "Champions League match results and analysis",
        "Tennis grand slam tournament updates",
        "Formula 1 race results and standings",
        "NBA playoff predictions and analysis"
    ]

    console.print("\n[bold]📋 Example queries:[/bold]")
    for number, sample in enumerate(examples, 1):
        console.print(f" {number}. {sample}")

    console.print("\n[bold]Enter a number (1-5) or type your own query:[/bold]")
    raw_choice = input("🔍 > ").strip()

    # A digit in range picks an example; anything else is treated as a
    # free-form query, defaulting to the first example on empty input.
    if raw_choice.isdigit() and 1 <= int(raw_choice) <= len(examples):
        query = examples[int(raw_choice) - 1]
    else:
        query = raw_choice or examples[0]

    console.print(f"\n[cyan]Selected query: {query}[/cyan]")

    # Crawl-budget presets; any unrecognized input falls through to Standard.
    console.print("\n[bold]Choose configuration:[/bold]")
    console.print(" 1. Quick (5 URLs, fast)")
    console.print(" 2. Standard (10 URLs, balanced)")
    console.print(" 3. Comprehensive (20 URLs, thorough)")

    config_choice = input("⚙️ > ").strip()

    presets = {
        "1": {"max_urls_to_crawl": 5, "top_k_urls": 5},
        "3": {"max_urls_to_crawl": 20, "top_k_urls": 20},
    }
    config = ResearchConfig(**presets.get(config_choice, {}))  # Standard otherwise

    # Run research and display the formatted report
    result = await research_pipeline(query, config)
    format_research_output(result)

    # Offer to persist the results
    if input("\n💾 Save results? (y/n): ").strip().lower() == 'y':
        await save_research_results(result, config)
|
||||
|
||||
# Run the interactive assistant
|
||||
await interactive_research_assistant()
|
||||
|
||||
# cell 22 type:markdown
|
||||
## 🚀 Advanced Tips and Best Practices
|
||||
|
||||
### 1. Domain-Specific Research
|
||||
|
||||
Customize the pipeline for specific domains:
|
||||
|
||||
# cell 23 type:code
|
||||
# Research across multiple sports sites
|
||||
async def multi_domain_research(query: str):
    """Research across multiple sports websites"""
    # Fixed roster of sports sites to sweep, one pipeline run each.
    sites = (
        "www.bbc.com/sport",
        "www.espn.com",
        "www.skysports.com",
    )

    collected = []

    for site in sites:
        per_site_config = ResearchConfig(
            domain=site,
            max_urls_to_crawl=5  # 5 per domain
        )

        console.print(f"\n[cyan]Researching {site}...[/cyan]")
        collected.append(await research_pipeline(query, per_site_config))

    # Hand back one ResearchResult per site, in roster order.
    console.print("\n[bold green]✅ Multi-domain research complete![/bold green]")
    return collected
|
||||
|
||||
# Example usage
|
||||
# results = await multi_domain_research("World Cup 2024")
|
||||
|
||||
# cell 24 type:markdown
|
||||
### 2. Performance Optimization
|
||||
|
||||
Tips for faster research:
|
||||
|
||||
# cell 25 type:code
|
||||
# Optimized configuration for speed
# NOTE(review): trades discovery breadth and AI query enhancement for
# latency. Assumes ResearchConfig accepts all of these fields — confirm
# against the ResearchConfig definition earlier in the notebook.
speed_config = ResearchConfig(
    # Reduce discovery scope
    max_urls_discovery=50,  # Don't discover too many

    # Skip live checking (trust the sitemap)
    live_check=False,

    # Increase parallelism
    max_concurrent_crawls=10,

    # Skip AI enhancement for simple queries
    use_llm_enhancement=False,

    # Use faster model
    llm_model="gemini/gemini-1.5-flash"
)
|
||||
|
||||
# Summarize the speed trade-offs made by speed_config above in a rich panel.
console.print(Panel(
    "[green]⚡ Speed Optimizations:[/green]\n\n"
    "• Reduced discovery scope\n"
    "• Disabled live URL checking\n"
    "• Increased parallelism\n"
    "• Using faster AI model",
    title="Performance Tips"
))
|
||||
|
||||
# cell 26 type:markdown
|
||||
### 3. Caching Strategy
|
||||
|
||||
The URL Seeder automatically caches results for efficiency:
|
||||
|
||||
# cell 27 type:code
|
||||
# Cache demonstration
# Purely informational cell: prints how the URL seeder's cache behaves
# (fresh fetch, cached reuse, forced refresh). No network calls here.
console.print("[bold]🗄️ Understanding Caching:[/bold]\n")

console.print("1. [cyan]First run:[/cyan] Fetches fresh data")
console.print(" - Discovers URLs from sitemap/Common Crawl")
console.print(" - Extracts metadata")
console.print(" - Caches results for 7 days")

console.print("\n2. [cyan]Subsequent runs:[/cyan] Uses cache (instant!)")
console.print(" - No network requests needed")
console.print(" - Same query returns cached results")

console.print("\n3. [cyan]Force refresh:[/cyan] Bypass cache when needed")
console.print(" - Set `force_refresh=True` in config")
console.print(" - Useful for breaking news or updates")

# Example with cache control
# NOTE(review): presumably ResearchConfig forwards force_refresh to
# SeedingConfig — verify against the pipeline implementation.
cache_config = ResearchConfig(
    force_refresh=True  # Always get fresh data
)
|
||||
|
||||
# cell 28 type:markdown
|
||||
## 🎓 Summary & Next Steps
|
||||
|
||||
### What You've Learned
|
||||
|
||||
You've built a complete AI research assistant that:
|
||||
|
||||
✅ **Discovers URLs intelligently** - No blind crawling
|
||||
✅ **Scores by relevance** - Focus on what matters
|
||||
✅ **Crawls efficiently** - Parallel processing
|
||||
✅ **Generates insights** - AI-powered synthesis
|
||||
✅ **Saves results** - JSON and Markdown formats
|
||||
|
||||
### Key Advantages
|
||||
|
||||
1. **Efficiency**: Discover 1000s of URLs in seconds, crawl only the best
|
||||
2. **Intelligence**: BM25 scoring ranks results so only the most relevant pages are crawled
|
||||
3. **Scalability**: Works across multiple domains
|
||||
4. **Flexibility**: Configurable for any use case
|
||||
|
||||
### Next Steps
|
||||
|
||||
1. **Customize for your domain**: Adapt the pipeline for your specific needs
|
||||
2. **Add persistence**: Store results in a database
|
||||
3. **Build an API**: Turn this into a web service
|
||||
4. **Schedule updates**: Monitor topics over time
|
||||
5. **Enhance with more AI**: Add summarization, sentiment analysis, etc.
|
||||
|
||||
### Resources
|
||||
|
||||
- 🐙 **GitHub**: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- 📚 **Documentation**: [crawl4ai.com/docs](https://crawl4ai.com/docs)
|
||||
- 💬 **Discord**: [Join our community](https://discord.gg/crawl4ai)
|
||||
|
||||
Thank you for learning with Crawl4AI! 🙏
|
||||
|
||||
Happy researching! 🚀🔬
|
||||
@@ -7,6 +7,9 @@ This demo shows:
|
||||
3. Live URL validation and metadata extraction
|
||||
4. BM25 relevance scoring for intelligent filtering
|
||||
5. Integration with AsyncWebCrawler for the complete pipeline
|
||||
6. Multi-domain discovery across multiple sites
|
||||
|
||||
Note: The AsyncUrlSeeder now supports context manager protocol for automatic cleanup.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
@@ -217,44 +220,43 @@ async def section_6_multi_domain(seed: AsyncUrlSeeder):
|
||||
|
||||
|
||||
async def main():
|
||||
seed = AsyncUrlSeeder()
|
||||
|
||||
# Interactive menu
|
||||
sections = {
|
||||
"1": ("Basic URL Discovery", section_1_basic_exploration),
|
||||
"2": ("Cache Management Demo", section_2_cache_demo),
|
||||
"3": ("Live Check & Metadata Extraction", section_3_live_head),
|
||||
"4": ("BM25 Relevance Scoring", section_4_bm25_scoring),
|
||||
"5": ("Complete Pipeline (Discover → Filter → Crawl)", section_5_keyword_filter_to_agent),
|
||||
"6": ("Multi-Domain Discovery", section_6_multi_domain),
|
||||
"7": ("Run All Demos", None)
|
||||
}
|
||||
|
||||
console.print("\n[bold]Available Demos:[/bold]")
|
||||
for key, (title, _) in sections.items():
|
||||
console.print(f" {key}. {title}")
|
||||
|
||||
choice = Prompt.ask("\n[cyan]Which demo would you like to run?[/cyan]",
|
||||
choices=list(sections.keys()),
|
||||
default="7")
|
||||
|
||||
console.print()
|
||||
|
||||
if choice == "7":
|
||||
# Run all demos
|
||||
for key, (title, func) in sections.items():
|
||||
if key != "7" and func:
|
||||
await func(seed)
|
||||
if key != "6": # Don't pause after the last demo
|
||||
if not Confirm.ask("\n[yellow]Continue to next demo?[/yellow]", default=True):
|
||||
break
|
||||
console.print()
|
||||
else:
|
||||
# Run selected demo
|
||||
_, func = sections[choice]
|
||||
await func(seed)
|
||||
|
||||
console.rule("[bold green]Demo Complete ✔︎")
|
||||
async with AsyncUrlSeeder() as seed:
|
||||
# Interactive menu
|
||||
sections = {
|
||||
"1": ("Basic URL Discovery", section_1_basic_exploration),
|
||||
"2": ("Cache Management Demo", section_2_cache_demo),
|
||||
"3": ("Live Check & Metadata Extraction", section_3_live_head),
|
||||
"4": ("BM25 Relevance Scoring", section_4_bm25_scoring),
|
||||
"5": ("Complete Pipeline (Discover → Filter → Crawl)", section_5_keyword_filter_to_agent),
|
||||
"6": ("Multi-Domain Discovery", section_6_multi_domain),
|
||||
"7": ("Run All Demos", None)
|
||||
}
|
||||
|
||||
console.print("\n[bold]Available Demos:[/bold]")
|
||||
for key, (title, _) in sections.items():
|
||||
console.print(f" {key}. {title}")
|
||||
|
||||
choice = Prompt.ask("\n[cyan]Which demo would you like to run?[/cyan]",
|
||||
choices=list(sections.keys()),
|
||||
default="7")
|
||||
|
||||
console.print()
|
||||
|
||||
if choice == "7":
|
||||
# Run all demos
|
||||
for key, (title, func) in sections.items():
|
||||
if key != "7" and func:
|
||||
await func(seed)
|
||||
if key != "6": # Don't pause after the last demo
|
||||
if not Confirm.ask("\n[yellow]Continue to next demo?[/yellow]", default=True):
|
||||
break
|
||||
console.print()
|
||||
else:
|
||||
# Run selected demo
|
||||
_, func = sections[choice]
|
||||
await func(seed)
|
||||
|
||||
console.rule("[bold green]Demo Complete ✔︎")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
"""
|
||||
🚀 URL Seeder + AsyncWebCrawler = Magic!
|
||||
Quick demo showing discovery → filter → crawl pipeline
|
||||
|
||||
Note: Uses context manager for automatic cleanup of resources.
|
||||
"""
|
||||
import asyncio, os
|
||||
from crawl4ai import AsyncUrlSeeder, AsyncWebCrawler, SeedingConfig, CrawlerRunConfig, AsyncLogger, DefaultMarkdownGenerator
|
||||
@@ -11,29 +13,26 @@ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
# 🔍 Example 1: Discover ALL → Filter → Crawl
|
||||
async def discover_and_crawl():
|
||||
"""Find Python module tutorials & extract them all!"""
|
||||
seeder = AsyncUrlSeeder(
|
||||
logger=AsyncLogger() # Log everything
|
||||
)
|
||||
|
||||
# Step 1: See how many URLs exist (spoiler: A LOT!)
|
||||
print("📊 Let's see what RealPython has...")
|
||||
all_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(source="sitemap"))
|
||||
print(f"😱 Found {len(all_urls)} total URLs!")
|
||||
|
||||
# Step 2: Filter for Python modules (perfect size ~13)
|
||||
print("\n🎯 Filtering for 'python-modules' tutorials...")
|
||||
module_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*python-modules*",
|
||||
live_check=True # Make sure they're alive!
|
||||
))
|
||||
|
||||
print(f"✨ Found {len(module_urls)} module tutorials")
|
||||
for url in module_urls[:3]: # Show first 3
|
||||
status = "✅" if url["status"] == "valid" else "❌"
|
||||
print(f"{status} {url['url']}")
|
||||
async with AsyncUrlSeeder(logger=AsyncLogger()) as seeder:
|
||||
# Step 1: See how many URLs exist (spoiler: A LOT!)
|
||||
print("📊 Let's see what RealPython has...")
|
||||
all_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(source="sitemap"))
|
||||
print(f"😱 Found {len(all_urls)} total URLs!")
|
||||
|
||||
# Step 2: Filter for Python modules (perfect size ~13)
|
||||
print("\n🎯 Filtering for 'python-modules' tutorials...")
|
||||
module_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*python-modules*",
|
||||
live_check=True # Make sure they're alive!
|
||||
))
|
||||
|
||||
print(f"✨ Found {len(module_urls)} module tutorials")
|
||||
for url in module_urls[:3]: # Show first 3
|
||||
status = "✅" if url["status"] == "valid" else "❌"
|
||||
print(f"{status} {url['url']}")
|
||||
|
||||
# Step 3: Crawl them all with pruning (keep it lean!)
|
||||
print("\n🕷️ Crawling all module tutorials...")
|
||||
@@ -70,53 +69,51 @@ async def discover_and_crawl():
|
||||
# 🔍 Example 2: Beautiful Soup articles with metadata peek
|
||||
async def explore_beautifulsoup():
|
||||
"""Discover BeautifulSoup content & peek at metadata"""
|
||||
seeder = AsyncUrlSeeder(logger=AsyncLogger() )
|
||||
|
||||
print("🍲 Looking for Beautiful Soup articles...")
|
||||
soup_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*beautiful-soup*",
|
||||
extract_head=True # Get the metadata!
|
||||
))
|
||||
|
||||
print(f"\n📚 Found {len(soup_urls)} Beautiful Soup articles:\n")
|
||||
|
||||
# Show what we discovered
|
||||
for i, url in enumerate(soup_urls, 1):
|
||||
meta = url["head_data"]["meta"]
|
||||
async with AsyncUrlSeeder(logger=AsyncLogger()) as seeder:
|
||||
print("🍲 Looking for Beautiful Soup articles...")
|
||||
soup_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*beautiful-soup*",
|
||||
extract_head=True # Get the metadata!
|
||||
))
|
||||
|
||||
print(f"{i}. {url['head_data']['title']}")
|
||||
print(f" 📝 {meta.get('description', 'No description')[:60]}...")
|
||||
print(f" 👤 By: {meta.get('author', 'Unknown')}")
|
||||
print(f" 🔗 {url['url']}\n")
|
||||
print(f"\n📚 Found {len(soup_urls)} Beautiful Soup articles:\n")
|
||||
|
||||
# Show what we discovered
|
||||
for i, url in enumerate(soup_urls, 1):
|
||||
meta = url["head_data"]["meta"]
|
||||
|
||||
print(f"{i}. {url['head_data']['title']}")
|
||||
print(f" 📝 {meta.get('description', 'No description')[:60]}...")
|
||||
print(f" 👤 By: {meta.get('author', 'Unknown')}")
|
||||
print(f" 🔗 {url['url']}\n")
|
||||
|
||||
# 🔍 Example 3: Smart search with BM25 relevance scoring
|
||||
async def smart_search_with_bm25():
|
||||
"""Use AI-powered relevance scoring to find the best content"""
|
||||
seeder = AsyncUrlSeeder(logger=AsyncLogger() )
|
||||
|
||||
print("🧠 Smart search: 'web scraping tutorial quiz'")
|
||||
|
||||
# Search with BM25 scoring - AI finds the best matches!
|
||||
results = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*beautiful-soup*",
|
||||
extract_head=True,
|
||||
query="web scraping tutorial quiz", # Our search
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2 # Quality filter
|
||||
))
|
||||
|
||||
print(f"\n🎯 Top {len(results)} most relevant results:\n")
|
||||
|
||||
# Show ranked results with relevance scores
|
||||
for i, result in enumerate(results[:3], 1):
|
||||
print(f"{i}. [{result['relevance_score']:.2f}] {result['head_data']['title']}")
|
||||
print(f" 🔗 {result['url'][:60]}...")
|
||||
|
||||
print("\n✨ BM25 automatically ranked by relevance!")
|
||||
async with AsyncUrlSeeder(logger=AsyncLogger()) as seeder:
|
||||
print("🧠 Smart search: 'web scraping tutorial quiz'")
|
||||
|
||||
# Search with BM25 scoring - AI finds the best matches!
|
||||
results = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*beautiful-soup*",
|
||||
extract_head=True,
|
||||
query="web scraping tutorial quiz", # Our search
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2 # Quality filter
|
||||
))
|
||||
|
||||
print(f"\n🎯 Top {len(results)} most relevant results:\n")
|
||||
|
||||
# Show ranked results with relevance scores
|
||||
for i, result in enumerate(results[:3], 1):
|
||||
print(f"{i}. [{result['relevance_score']:.2f}] {result['head_data']['title']}")
|
||||
print(f" 🔗 {result['url'][:60]}...")
|
||||
|
||||
print("\n✨ BM25 automatically ranked by relevance!")
|
||||
|
||||
# 🎬 Run the show!
|
||||
async def main():
|
||||
|
||||
@@ -173,12 +173,19 @@ Creating a URL seeder is simple:
|
||||
```python
|
||||
from crawl4ai import AsyncUrlSeeder
|
||||
|
||||
# Create a seeder instance
|
||||
# Method 1: Manual cleanup
|
||||
seeder = AsyncUrlSeeder()
|
||||
try:
|
||||
config = SeedingConfig(source="sitemap")
|
||||
urls = await seeder.urls("example.com", config)
|
||||
finally:
|
||||
await seeder.close()
|
||||
|
||||
# Discover URLs from a domain
|
||||
config = SeedingConfig(source="sitemap")
|
||||
urls = await seeder.urls("example.com", config)
|
||||
# Method 2: Context manager (recommended)
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
config = SeedingConfig(source="sitemap")
|
||||
urls = await seeder.urls("example.com", config)
|
||||
# Automatically cleaned up on exit
|
||||
```
|
||||
|
||||
The seeder can discover URLs from two powerful sources:
|
||||
@@ -193,6 +200,23 @@ urls = await seeder.urls("example.com", config)
|
||||
|
||||
Sitemaps are XML files that websites create specifically to list all their URLs. It's like getting a menu at a restaurant - everything is listed upfront.
|
||||
|
||||
**Sitemap Index Support**: For large websites like TechCrunch that use sitemap indexes (a sitemap of sitemaps), the seeder automatically detects and processes all sub-sitemaps in parallel:
|
||||
|
||||
```xml
|
||||
<!-- Example sitemap index -->
|
||||
<sitemapindex>
|
||||
<sitemap>
|
||||
<loc>https://techcrunch.com/sitemap-1.xml</loc>
|
||||
</sitemap>
|
||||
<sitemap>
|
||||
<loc>https://techcrunch.com/sitemap-2.xml</loc>
|
||||
</sitemap>
|
||||
<!-- ... more sitemaps ... -->
|
||||
</sitemapindex>
|
||||
```
|
||||
|
||||
The seeder handles this transparently - you'll get all URLs from all sub-sitemaps automatically!
|
||||
|
||||
#### 2. Common Crawl (Most Comprehensive)
|
||||
|
||||
```python
|
||||
@@ -349,6 +373,35 @@ The head extraction gives you a treasure trove of information:
|
||||
|
||||
This metadata is gold for filtering! You can find exactly what you need without crawling a single page.
|
||||
|
||||
### Smart URL-Based Filtering (No Head Extraction)
|
||||
|
||||
When `extract_head=False` but you still provide a query, the seeder uses intelligent URL-based scoring:
|
||||
|
||||
```python
|
||||
# Fast filtering based on URL structure alone
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=False, # Don't fetch page metadata
|
||||
query="python tutorial async",
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.3
|
||||
)
|
||||
|
||||
urls = await seeder.urls("example.com", config)
|
||||
|
||||
# URLs are scored based on:
|
||||
# 1. Domain parts matching (e.g., 'python' in python.example.com)
|
||||
# 2. Path segments (e.g., '/tutorials/python-async/')
|
||||
# 3. Query parameters (e.g., '?topic=python')
|
||||
# 4. Fuzzy matching using character n-grams
|
||||
|
||||
# Example URL scoring:
|
||||
# https://example.com/tutorials/python/async-guide.html - High score
|
||||
# https://example.com/blog/javascript-tips.html - Low score
|
||||
```
|
||||
|
||||
This approach is much faster than head extraction while still providing intelligent filtering!
|
||||
|
||||
### Understanding Results
|
||||
|
||||
Each URL in the results has this structure:
|
||||
@@ -710,7 +763,16 @@ from crawl4ai import AsyncUrlSeeder, AsyncWebCrawler, SeedingConfig, CrawlerRunC
|
||||
|
||||
class ResearchAssistant:
|
||||
def __init__(self):
|
||||
self.seeder = None
|
||||
|
||||
async def __aenter__(self):
|
||||
self.seeder = AsyncUrlSeeder()
|
||||
await self.seeder.__aenter__()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
if self.seeder:
|
||||
await self.seeder.__aexit__(exc_type, exc_val, exc_tb)
|
||||
|
||||
async def research_topic(self, topic, domains, max_articles=20):
|
||||
"""Research a topic across multiple domains."""
|
||||
@@ -812,18 +874,17 @@ class ResearchAssistant:
|
||||
|
||||
# Use the research assistant
|
||||
async def main():
|
||||
assistant = ResearchAssistant()
|
||||
|
||||
# Research Python async programming across multiple sources
|
||||
topic = "python asyncio best practices performance optimization"
|
||||
domains = [
|
||||
"realpython.com",
|
||||
"python.org",
|
||||
"stackoverflow.com",
|
||||
"medium.com"
|
||||
]
|
||||
|
||||
summary = await assistant.research_topic(topic, domains, max_articles=15)
|
||||
async with ResearchAssistant() as assistant:
|
||||
# Research Python async programming across multiple sources
|
||||
topic = "python asyncio best practices performance optimization"
|
||||
domains = [
|
||||
"realpython.com",
|
||||
"python.org",
|
||||
"stackoverflow.com",
|
||||
"medium.com"
|
||||
]
|
||||
|
||||
summary = await assistant.research_topic(topic, domains, max_articles=15)
|
||||
|
||||
# Display results
|
||||
print("\n" + "="*60)
|
||||
@@ -878,6 +939,24 @@ async with AsyncWebCrawler() as crawler:
|
||||
process_immediately(result) # Don't wait for all
|
||||
```
|
||||
|
||||
4. **Memory protection for large domains**
|
||||
|
||||
The seeder uses bounded queues to prevent memory issues when processing domains with millions of URLs:
|
||||
|
||||
```python
|
||||
# Safe for domains with 1M+ URLs
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap",
|
||||
concurrency=50, # Queue size adapts to concurrency
|
||||
max_urls=100000 # Process in batches if needed
|
||||
)
|
||||
|
||||
# The seeder automatically manages memory by:
|
||||
# - Using bounded queues (prevents RAM spikes)
|
||||
# - Applying backpressure when queue is full
|
||||
# - Processing URLs as they're discovered
|
||||
```
|
||||
|
||||
## Best Practices & Tips
|
||||
|
||||
### Cache Management
|
||||
@@ -975,6 +1054,8 @@ config = SeedingConfig(
|
||||
| Missing metadata | Ensure `extract_head=True` |
|
||||
| Low relevance scores | Refine query, lower `score_threshold` |
|
||||
| Rate limit errors | Reduce `hits_per_sec` and `concurrency` |
|
||||
| Memory issues with large sites | Use `max_urls` to limit results, reduce `concurrency` |
|
||||
| Connection not closed | Use context manager or call `await seeder.close()` |
|
||||
|
||||
### Performance Benchmarks
|
||||
|
||||
@@ -997,4 +1078,12 @@ URL seeding transforms web crawling from a blind expedition into a surgical stri
|
||||
|
||||
Whether you're building a research tool, monitoring competitors, or creating a content aggregator, URL seeding gives you the intelligence to crawl smarter, not harder.
|
||||
|
||||
### Key Features Summary
|
||||
|
||||
1. **Parallel Sitemap Index Processing**: Automatically detects and processes sitemap indexes in parallel
|
||||
2. **Memory Protection**: Bounded queues prevent RAM issues with large domains (1M+ URLs)
|
||||
3. **Context Manager Support**: Automatic cleanup with `async with` statement
|
||||
4. **URL-Based Scoring**: Smart filtering even without head extraction
|
||||
5. **Dual Caching**: Separate caches for URL lists and metadata
|
||||
|
||||
Now go forth and seed intelligently! 🌱🚀
|
||||
Reference in New Issue
Block a user