This commit introduces significant updates to the LinkedIn data discovery documentation by adding two new Jupyter notebooks that provide detailed insights into data discovery processes. The previous workshop notebook has been removed to streamline the content and avoid redundancy. Additionally, the URL seeder documentation has been expanded with a new tutorial and several enhancements to existing scripts, improving usability and clarity. (The file names originally listed in the bullets below were lost when this text was extracted.) The changes include:
- Added two new Jupyter notebooks for comprehensive LinkedIn data discovery.
- Removed the previous workshop notebook to eliminate outdated content.
- Updated one file (name not preserved) to reflect new data visualization requirements.
- Introduced two new files (names not preserved), including the new tutorial, to facilitate easier access to URL seeding techniques.
- Enhanced existing Python scripts and markdown files in the URL seeder section for better documentation and examples.
These changes aim to improve the overall documentation quality and user experience for developers working with LinkedIn data and URL seeding techniques.
264 lines
9.9 KiB
Python
264 lines
9.9 KiB
Python
"""
|
|
URL Seeder Demo - Interactive showcase of Crawl4AI's URL discovery capabilities
|
|
|
|
This demo shows:
|
|
1. Basic URL discovery from sitemaps and Common Crawl
|
|
2. Cache management and forced refresh
|
|
3. Live URL validation and metadata extraction
|
|
4. BM25 relevance scoring for intelligent filtering
|
|
5. Integration with AsyncWebCrawler for the complete pipeline
|
|
6. Multi-domain discovery across multiple sites
|
|
|
|
Note: AsyncUrlSeeder now supports the async context manager protocol for automatic cleanup.
|
|
"""
|
|
|
|
import asyncio
|
|
import time
|
|
from datetime import datetime
|
|
from rich.console import Console
|
|
from rich.table import Table
|
|
from rich.panel import Panel
|
|
from rich.progress import Progress, SpinnerColumn, BarColumn, TimeElapsedColumn
|
|
from rich.prompt import Prompt, Confirm
|
|
from crawl4ai import (
|
|
AsyncWebCrawler,
|
|
CrawlerRunConfig,
|
|
AsyncUrlSeeder,
|
|
SeedingConfig
|
|
)
|
|
|
|
# Shared rich console used for all output in this demo.
console = Console()

# Print the banner as soon as the module is loaded.
console.rule("[bold green]🌐 Crawl4AI URL Seeder: Interactive Demo")

# Domain passed to the seeder by the single-domain demos below.
DOMAIN = "crawl4ai.com"
|
|
|
|
# Utils
|
|
|
|
def print_head_info(head_data):
    """Print the extracted ``<head>`` metadata of one page as a rich table.

    Args:
        head_data: Mapping with the optional keys read here: ``title``,
            ``charset``, ``meta`` (name -> value) and ``link``
            (rel -> iterable of dicts that may carry ``href``).
            A falsy value prints a notice instead of a table.
    """
    # Guard first so no table is built when there is nothing to show.
    if not head_data:
        console.print("[yellow]No head data found.")
        return

    def as_cell(value):
        # Table cells must be renderables; coerce other types to text so an
        # unexpected value (e.g. a number or a list) cannot abort the demo.
        # None is kept as an empty cell.
        return "" if value is None else str(value)

    table = Table(title="<head> Metadata", expand=True)
    table.add_column("Key", style="cyan", no_wrap=True)
    table.add_column("Value", style="magenta")

    for field in ("title", "charset"):
        if head_data.get(field):
            table.add_row(field, as_cell(head_data[field]))

    # `or {}` also covers keys that are present but explicitly None.
    for k, v in (head_data.get("meta") or {}).items():
        table.add_row(f"meta:{k}", as_cell(v))

    for rel, items in (head_data.get("link") or {}).items():
        for item in items:
            table.add_row(f"link:{rel}", as_cell(item.get("href", "")))

    console.print(table)
|
|
|
|
|
|
async def section_1_basic_exploration(seed: AsyncUrlSeeder):
    """Demo 1: seed URLs for ``DOMAIN`` from Common Crawl + sitemap and time it.

    Args:
        seed: Seeder whose ``urls`` coroutine performs the discovery.
    """
    console.rule("[bold cyan]1. Basic Seeding")
    cfg = SeedingConfig(source="cc+sitemap", pattern="*", verbose=True)

    # perf_counter is monotonic, so the measured interval cannot go negative
    # if the system clock is adjusted while the request is in flight.
    start_time = time.perf_counter()
    with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p:
        p.add_task(description="Fetching from Common Crawl + Sitemap...", total=None)
        urls = await seed.urls(DOMAIN, cfg)
        elapsed = time.perf_counter() - start_time

    console.print(f"[green]✓ Fetched {len(urls)} URLs in {elapsed:.2f} seconds")
    # A zero-length interval would otherwise raise ZeroDivisionError.
    rate = len(urls) / elapsed if elapsed > 0 else 0.0
    console.print(f"[dim] Speed: {rate:.0f} URLs/second[/dim]\n")

    console.print("[bold]Sample URLs:[/bold]")
    for u in urls[:5]:
        console.print(f" • {u['url']}")
|
|
|
|
|
|
async def section_2_cache_demo(seed: AsyncUrlSeeder):
    """Demo 2: issue a seeding request with ``force=True``.

    The result of the request is discarded; only the announcement is printed.

    Args:
        seed: Seeder whose ``urls`` coroutine is awaited.
    """
    console.rule("[bold cyan]2. Caching Demonstration")
    console.print("[yellow]Using `force=True` to bypass cache and fetch fresh data.[/yellow]")

    refresh_options = {
        "source": "cc",
        "pattern": "*crawl4ai.com/core/*",
        "verbose": False,
        "force": True,
    }
    await seed.urls(DOMAIN, SeedingConfig(**refresh_options))
|
|
|
|
async def section_3_live_head(seed: AsyncUrlSeeder):
    """Demo 3: seed up to 10 URLs with head extraction and show one result.

    Prints how many results have status ``"valid"`` and, if any do, the head
    metadata of the first one.

    Args:
        seed: Seeder whose ``urls`` coroutine performs the discovery.
    """
    console.rule("[bold cyan]3. Live Check + Head Extraction")

    head_cfg = SeedingConfig(
        extract_head=True,
        concurrency=10,
        hits_per_sec=5,
        pattern="*crawl4ai.com/*",
        max_urls=10,
        verbose=False,
    )
    discovered = await seed.urls(DOMAIN, head_cfg)

    # Keep only the entries the seeder marked as valid.
    reachable = []
    for entry in discovered:
        if entry["status"] == "valid":
            reachable.append(entry)

    console.print(f"[green]Valid: {len(reachable)} / {len(discovered)}")
    if not reachable:
        return
    print_head_info(reachable[0]["head_data"])
|
|
|
|
|
|
async def section_4_bm25_scoring(seed: AsyncUrlSeeder):
    """Demo 4: rank sitemap URLs against a query using BM25 and list the top 5.

    Args:
        seed: Seeder whose ``urls`` coroutine performs discovery and scoring.
    """
    console.rule("[bold cyan]4. BM25 Relevance Scoring")
    console.print("[yellow]Using AI-powered relevance scoring to find the most relevant content[/yellow]")

    query = "markdown generation extraction strategies"
    # Named once so the config and the summary message cannot drift apart.
    score_threshold = 0.3
    max_title_len = 60
    cfg = SeedingConfig(
        source="sitemap",
        extract_head=True,
        query=query,
        scoring_method="bm25",
        score_threshold=score_threshold,  # Only URLs scoring above the threshold
        max_urls=20,
        verbose=False,
    )

    with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p:
        p.add_task(description=f"Searching for: '{query}'", total=None)
        urls = await seed.urls(DOMAIN, cfg)

    console.print(f"[green]Found {len(urls)} relevant URLs (score > {score_threshold})")

    # Show top results with scores
    table = Table(title="Top 5 Most Relevant Pages", expand=True)
    table.add_column("Score", style="cyan", width=8)
    table.add_column("Title", style="magenta")
    table.add_column("URL", style="blue", overflow="fold")

    for url in urls[:5]:
        score = f"{url['relevance_score']:.2f}"
        # `or` covers both a missing title and an explicit None, which would
        # otherwise fail on slicing.
        title = (url.get("head_data") or {}).get("title") or "No title"
        # Only mark the title as truncated when it actually was.
        if len(title) > max_title_len:
            title = title[:max_title_len] + "..."
        table.add_row(score, title, url["url"])

    console.print(table)
|
|
|
|
async def section_5_keyword_filter_to_agent(seed: AsyncUrlSeeder):
    """Demo 5: discover URLs, keep those matching keywords, then crawl them.

    The keyword match is a case-insensitive substring test against the string
    form of each result's ``head_data``. Matching URLs are crawled with
    ``arun_many`` in streaming mode and a snippet of the first successful
    result with markdown is printed.

    Args:
        seed: Seeder whose ``urls`` coroutine performs the discovery.
    """
    console.rule("[bold cyan]5. Complete Pipeline: Discover → Filter → Crawl")
    cfg = SeedingConfig(
        extract_head=True,
        concurrency=20,
        hits_per_sec=10,
        max_urls=10,
        pattern="*crawl4ai.com/*",
        force=True,
    )
    urls = await seed.urls(DOMAIN, cfg)

    keywords = ["deep crawling", "markdown", "llm"]
    selected = [u for u in urls if any(k in str(u["head_data"]).lower() for k in keywords)]

    console.print(f"[cyan]Selected {len(selected)} URLs with relevant keywords:")
    for u in selected[:10]:
        console.print("•", u["url"])

    # Extract just the URLs from the selected results
    urls_to_crawl = [u["url"] for u in selected]
    if not urls_to_crawl:
        # Nothing matched: do not start a crawler just to crawl zero pages.
        console.print("[yellow]No URLs matched the keywords; skipping the crawl step.")
        return

    console.print("\n[yellow]Passing above URLs to arun_many() LLM agent for crawling...")
    async with AsyncWebCrawler(verbose=True) as crawler:
        crawl_run_config = CrawlerRunConfig(
            # Example crawl settings for these URLs:
            only_text=True,  # Just get text content
            screenshot=False,
            pdf=False,
            word_count_threshold=50,  # Only process pages with at least 50 words
            stream=True,
            verbose=False,  # Keep logs clean for arun_many in this demo
        )

        # We'll stream results for large lists, but collect them here for demonstration
        crawled_results_stream = await crawler.arun_many(urls_to_crawl, config=crawl_run_config)
        final_crawled_data = []
        async for result in crawled_results_stream:
            final_crawled_data.append(result)
            if len(final_crawled_data) % 5 == 0:
                print(f" Processed {len(final_crawled_data)}/{len(urls_to_crawl)} URLs...")

        # Count only results flagged as successful; failed results are also
        # collected above and must not be reported as crawled.
        successful = [r for r in final_crawled_data if r.success]
        print(f"\n Successfully crawled {len(successful)} URLs.")
        if not successful:
            print(" No successful crawls found.")
            return

        print("\n Example of a crawled result's URL and Markdown (first successful one):")
        for result in successful:
            if result.markdown is not None and result.markdown.raw_markdown:
                print(f" URL: {result.url}")
                print(f" Markdown snippet: {result.markdown.raw_markdown[:200]}...")
                break
        else:
            # The loop finished without finding a result that has markdown.
            print(" No successful crawls with markdown found.")
|
|
|
|
|
|
async def section_6_multi_domain(seed: AsyncUrlSeeder):
    """Demo 6: run one BM25-scored discovery across several domains.

    Prints the total number of URLs found, then the count and first result
    for each domain returned by ``many_urls``.

    Args:
        seed: Seeder whose ``many_urls`` coroutine performs the discovery.
    """
    console.rule("[bold cyan]6. Multi-Domain Discovery")
    console.print("[yellow]Discovering Python tutorials across multiple educational sites[/yellow]\n")

    domains = ["docs.python.org", "realpython.com", "docs.crawl4ai.com"]
    cfg = SeedingConfig(
        source="sitemap",
        extract_head=True,
        query="python tutorial guide",
        scoring_method="bm25",
        score_threshold=0.2,
        max_urls=5,  # Per domain
    )

    # perf_counter is monotonic, so it is the right clock for an interval.
    start_time = time.perf_counter()
    with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p:
        # The task id is not needed: the spinner runs until the block exits.
        p.add_task(description="Discovering across domains...", total=None)
        results = await seed.many_urls(domains, cfg)
        elapsed = time.perf_counter() - start_time

    total_urls = sum(len(urls) for urls in results.values())
    console.print(f"[green]✓ Found {total_urls} relevant URLs across {len(domains)} domains in {elapsed:.2f}s\n")

    # Show results per domain
    for domain, urls in results.items():
        console.print(f"[bold]{domain}:[/bold] {len(urls)} relevant pages")
        if urls:
            top = urls[0]
            # `or` covers a missing title as well as an explicit None, which
            # would otherwise be printed literally as "None".
            title = (top.get("head_data") or {}).get("title") or "No title"
            console.print(f" Top result: [{top['relevance_score']:.2f}] {title}")
|
|
|
|
|
|
async def main():
    """Show the demo menu, run the chosen demo (or all of them), then sign off.

    Choice "7" runs demos 1-6 in order, asking for confirmation between demos
    (but not after demo 6). Any other choice runs that single demo.
    """
    async with AsyncUrlSeeder() as seed:
        # Menu key -> (label, coroutine function); "7" has no coroutine of its own.
        sections = {
            "1": ("Basic URL Discovery", section_1_basic_exploration),
            "2": ("Cache Management Demo", section_2_cache_demo),
            "3": ("Live Check & Metadata Extraction", section_3_live_head),
            "4": ("BM25 Relevance Scoring", section_4_bm25_scoring),
            "5": ("Complete Pipeline (Discover → Filter → Crawl)", section_5_keyword_filter_to_agent),
            "6": ("Multi-Domain Discovery", section_6_multi_domain),
            "7": ("Run All Demos", None),
        }

        console.print("\n[bold]Available Demos:[/bold]")
        for key, entry in sections.items():
            label = entry[0]
            console.print(f" {key}. {label}")

        choice = Prompt.ask(
            "\n[cyan]Which demo would you like to run?[/cyan]",
            choices=list(sections),
            default="7",
        )

        console.print()

        if choice != "7":
            # Run selected demo
            selected_demo = sections[choice][1]
            await selected_demo(seed)
        else:
            # Run all demos
            for key, (_, demo) in sections.items():
                if key == "7" or not demo:
                    continue
                await demo(seed)
                if key == "6":
                    # Don't pause after the last demo
                    continue
                keep_going = Confirm.ask("\n[yellow]Continue to next demo?[/yellow]", default=True)
                if not keep_going:
                    break
                console.print()

        console.rule("[bold green]Demo Complete ✔︎")
|
|
|
|
|
|
# Start the interactive demo only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|