docs(linkedin, url_seeder): update and reorganize LinkedIn data discovery and URL seeder documentation
This commit introduces significant updates to the LinkedIn data discovery documentation by adding two new Jupyter notebooks that provide detailed insights into data discovery processes. The previous workshop notebook has been removed to streamline the content and avoid redundancy. Additionally, the URL seeder documentation has been expanded with a new tutorial and several enhancements to existing scripts, improving usability and clarity. The changes include: - Added two new Jupyter notebooks (file names lost in this rendering) for comprehensive LinkedIn data discovery. - Removed the previous workshop notebook to eliminate outdated content. - Updated a supporting file (name lost in this rendering) to reflect new data visualization requirements. - Introduced two new URL seeder resources, including the new tutorial (file names lost in this rendering), to facilitate easier access to URL seeding techniques. - Enhanced existing Python scripts and markdown files in the URL seeder section for better documentation and examples. These changes aim to improve the overall documentation quality and user experience for developers working with LinkedIn data and URL seeding techniques.
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
"""
|
||||
🚀 URL Seeder + AsyncWebCrawler = Magic!
|
||||
Quick demo showing discovery → filter → crawl pipeline
|
||||
|
||||
Note: Uses context manager for automatic cleanup of resources.
|
||||
"""
|
||||
import asyncio, os
|
||||
from crawl4ai import AsyncUrlSeeder, AsyncWebCrawler, SeedingConfig, CrawlerRunConfig, AsyncLogger, DefaultMarkdownGenerator
|
||||
@@ -11,29 +13,26 @@ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
# 🔍 Example 1: Discover ALL → Filter → Crawl
|
||||
async def discover_and_crawl():
|
||||
"""Find Python module tutorials & extract them all!"""
|
||||
seeder = AsyncUrlSeeder(
|
||||
logger=AsyncLogger() # Log everything
|
||||
)
|
||||
|
||||
# Step 1: See how many URLs exist (spoiler: A LOT!)
|
||||
print("📊 Let's see what RealPython has...")
|
||||
all_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(source="sitemap"))
|
||||
print(f"😱 Found {len(all_urls)} total URLs!")
|
||||
|
||||
# Step 2: Filter for Python modules (perfect size ~13)
|
||||
print("\n🎯 Filtering for 'python-modules' tutorials...")
|
||||
module_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*python-modules*",
|
||||
live_check=True # Make sure they're alive!
|
||||
))
|
||||
|
||||
print(f"✨ Found {len(module_urls)} module tutorials")
|
||||
for url in module_urls[:3]: # Show first 3
|
||||
status = "✅" if url["status"] == "valid" else "❌"
|
||||
print(f"{status} {url['url']}")
|
||||
async with AsyncUrlSeeder(logger=AsyncLogger()) as seeder:
|
||||
# Step 1: See how many URLs exist (spoiler: A LOT!)
|
||||
print("📊 Let's see what RealPython has...")
|
||||
all_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(source="sitemap"))
|
||||
print(f"😱 Found {len(all_urls)} total URLs!")
|
||||
|
||||
# Step 2: Filter for Python modules (perfect size ~13)
|
||||
print("\n🎯 Filtering for 'python-modules' tutorials...")
|
||||
module_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*python-modules*",
|
||||
live_check=True # Make sure they're alive!
|
||||
))
|
||||
|
||||
print(f"✨ Found {len(module_urls)} module tutorials")
|
||||
for url in module_urls[:3]: # Show first 3
|
||||
status = "✅" if url["status"] == "valid" else "❌"
|
||||
print(f"{status} {url['url']}")
|
||||
|
||||
# Step 3: Crawl them all with pruning (keep it lean!)
|
||||
print("\n🕷️ Crawling all module tutorials...")
|
||||
@@ -70,53 +69,51 @@ async def discover_and_crawl():
|
||||
# 🔍 Example 2: Beautiful Soup articles with metadata peek
|
||||
async def explore_beautifulsoup():
|
||||
"""Discover BeautifulSoup content & peek at metadata"""
|
||||
seeder = AsyncUrlSeeder(logger=AsyncLogger() )
|
||||
|
||||
print("🍲 Looking for Beautiful Soup articles...")
|
||||
soup_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*beautiful-soup*",
|
||||
extract_head=True # Get the metadata!
|
||||
))
|
||||
|
||||
print(f"\n📚 Found {len(soup_urls)} Beautiful Soup articles:\n")
|
||||
|
||||
# Show what we discovered
|
||||
for i, url in enumerate(soup_urls, 1):
|
||||
meta = url["head_data"]["meta"]
|
||||
async with AsyncUrlSeeder(logger=AsyncLogger()) as seeder:
|
||||
print("🍲 Looking for Beautiful Soup articles...")
|
||||
soup_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*beautiful-soup*",
|
||||
extract_head=True # Get the metadata!
|
||||
))
|
||||
|
||||
print(f"{i}. {url['head_data']['title']}")
|
||||
print(f" 📝 {meta.get('description', 'No description')[:60]}...")
|
||||
print(f" 👤 By: {meta.get('author', 'Unknown')}")
|
||||
print(f" 🔗 {url['url']}\n")
|
||||
print(f"\n📚 Found {len(soup_urls)} Beautiful Soup articles:\n")
|
||||
|
||||
# Show what we discovered
|
||||
for i, url in enumerate(soup_urls, 1):
|
||||
meta = url["head_data"]["meta"]
|
||||
|
||||
print(f"{i}. {url['head_data']['title']}")
|
||||
print(f" 📝 {meta.get('description', 'No description')[:60]}...")
|
||||
print(f" 👤 By: {meta.get('author', 'Unknown')}")
|
||||
print(f" 🔗 {url['url']}\n")
|
||||
|
||||
# 🔍 Example 3: Smart search with BM25 relevance scoring
|
||||
async def smart_search_with_bm25():
|
||||
"""Use AI-powered relevance scoring to find the best content"""
|
||||
seeder = AsyncUrlSeeder(logger=AsyncLogger() )
|
||||
|
||||
print("🧠 Smart search: 'web scraping tutorial quiz'")
|
||||
|
||||
# Search with BM25 scoring - AI finds the best matches!
|
||||
results = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*beautiful-soup*",
|
||||
extract_head=True,
|
||||
query="web scraping tutorial quiz", # Our search
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2 # Quality filter
|
||||
))
|
||||
|
||||
print(f"\n🎯 Top {len(results)} most relevant results:\n")
|
||||
|
||||
# Show ranked results with relevance scores
|
||||
for i, result in enumerate(results[:3], 1):
|
||||
print(f"{i}. [{result['relevance_score']:.2f}] {result['head_data']['title']}")
|
||||
print(f" 🔗 {result['url'][:60]}...")
|
||||
|
||||
print("\n✨ BM25 automatically ranked by relevance!")
|
||||
async with AsyncUrlSeeder(logger=AsyncLogger()) as seeder:
|
||||
print("🧠 Smart search: 'web scraping tutorial quiz'")
|
||||
|
||||
# Search with BM25 scoring - AI finds the best matches!
|
||||
results = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*beautiful-soup*",
|
||||
extract_head=True,
|
||||
query="web scraping tutorial quiz", # Our search
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2 # Quality filter
|
||||
))
|
||||
|
||||
print(f"\n🎯 Top {len(results)} most relevant results:\n")
|
||||
|
||||
# Show ranked results with relevance scores
|
||||
for i, result in enumerate(results[:3], 1):
|
||||
print(f"{i}. [{result['relevance_score']:.2f}] {result['head_data']['title']}")
|
||||
print(f" 🔗 {result['url'][:60]}...")
|
||||
|
||||
print("\n✨ BM25 automatically ranked by relevance!")
|
||||
|
||||
# 🎬 Run the show!
|
||||
async def main():
|
||||
|
||||
Reference in New Issue
Block a user