Files
crawl4ai/docs/examples/url_seeder/url_seeder_quick_demo.py
UncleCode c6fc5c0518 docs(linkedin, url_seeder): update and reorganize LinkedIn data discovery and URL seeder documentation
This commit introduces significant updates to the LinkedIn data discovery documentation by adding two new Jupyter notebooks that provide detailed insights into data discovery processes. The previous workshop notebook has been removed to streamline the content and avoid redundancy. Additionally, the URL seeder documentation has been expanded with a new tutorial and several enhancements to existing scripts, improving usability and clarity.

The changes include:
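- Added two new Jupyter notebooks for comprehensive LinkedIn data discovery.
- Removed the previous workshop notebook to eliminate outdated content.
- Updated a supporting file (name lost in extraction) to reflect new data visualization requirements.
- Introduced two new URL seeder files (names lost in extraction), including a new tutorial, to facilitate easier access to URL seeding techniques.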
- Enhanced existing Python scripts and markdown files in the URL seeder section for better documentation and examples.

These changes aim to improve the overall documentation quality and user experience for developers working with LinkedIn data and URL seeding techniques.
2025-06-05 15:06:25 +08:00

128 lines
5.5 KiB
Python

"""
🚀 URL Seeder + AsyncWebCrawler = Magic!
Quick demo showing discovery → filter → crawl pipeline
Note: Uses context manager for automatic cleanup of resources.
"""
import asyncio, os
from crawl4ai import AsyncUrlSeeder, AsyncWebCrawler, SeedingConfig, CrawlerRunConfig, AsyncLogger, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter
# Absolute directory holding this script; crawled tutorials are saved next to it.
CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0]
# 🔍 Example 1: Discover ALL → Filter → Crawl
async def discover_and_crawl():
    """Discover RealPython URLs, filter for module tutorials, and crawl a few.

    Pipeline:
      1. List every URL in the realpython.com sitemap and print the count.
      2. Query again with the ``*python-modules*`` pattern and
         ``live_check=True``, then preview the first three hits with a
         status marker.
      3. Crawl the first five matches with a pruning content filter and
         write each successful result's ``fit_markdown`` to ``<slug>.md``
         inside ``CURRENT_DIR``.

    Returns:
        None. Progress is printed to stdout and files are written to disk.
    """
    async with AsyncUrlSeeder(logger=AsyncLogger()) as seeder:
        # Step 1: See how many URLs exist (spoiler: A LOT!)
        print("📊 Let's see what RealPython has...")
        all_urls = await seeder.urls(
            "realpython.com",
            SeedingConfig(source="sitemap"),
        )
        print(f"😱 Found {len(all_urls)} total URLs!")

        # Step 2: Filter for Python modules (perfect size ~13)
        print("\n🎯 Filtering for 'python-modules' tutorials...")
        module_urls = await seeder.urls(
            "realpython.com",
            SeedingConfig(
                source="sitemap",
                pattern="*python-modules*",
                live_check=True,  # Make sure they're alive!
            ),
        )
        print(f"✨ Found {len(module_urls)} module tutorials")
        for url in module_urls[:3]:  # Show first 3
            # Both branches used to be empty strings, which made the check a
            # no-op; use distinct markers so the status is actually visible.
            status = "✅" if url["status"] == "valid" else "❌"
            print(f"{status} {url['url']}")

        # Step 3: Crawl them all with pruning (keep it lean!)
        print("\n🕷️ Crawling all module tutorials...")
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=PruningContentFilter(  # Smart filtering!
                        threshold=0.48,  # Remove fluff
                        threshold_type="fixed",
                    ),
                ),
                only_text=True,
                stream=True,  # results are consumed with ``async for`` below
            )
            # Extract just the URLs from the seeder results
            urls_to_crawl = [u["url"] for u in module_urls[:5]]
            results = await crawler.arun_many(urls_to_crawl, config=config)

            # Process & save
            saved = 0
            async for result in results:
                if not result.success:
                    continue
                # Name the file after the last path segment. Stripping the
                # trailing slash first makes this work for both ".../slug/"
                # and ".../slug"; indexing [-2] only handled the former.
                slug = result.url.rstrip("/").rsplit("/", 1)[-1] or "index"
                name = os.path.join(CURRENT_DIR, slug + ".md")
                # Explicit UTF-8: scraped markdown routinely contains
                # non-ASCII text, and the platform default may not encode it.
                with open(name, "w", encoding="utf-8") as f:
                    f.write(result.markdown.fit_markdown)
                saved += 1
                print(f"💾 Saved: {name}")
            print(f"\n🎉 Successfully saved {saved} tutorials!")
# 🔍 Example 2: Beautiful Soup articles with metadata peek
async def explore_beautifulsoup():
    """Discover Beautiful Soup articles and print a peek at their metadata.

    Queries the realpython.com sitemap for URLs matching
    ``*beautiful-soup*`` with ``extract_head=True`` and prints, for each
    result, its title, the first 60 characters of its description, its
    author, and its URL.

    Missing ``head_data``, ``meta`` or ``title`` entries fall back to
    placeholder text so one incomplete result does not abort the listing.

    Returns:
        None. Everything is printed to stdout.
    """
    async with AsyncUrlSeeder(logger=AsyncLogger()) as seeder:
        print("🍲 Looking for Beautiful Soup articles...")
        soup_urls = await seeder.urls(
            "realpython.com",
            SeedingConfig(
                source="sitemap",
                pattern="*beautiful-soup*",
                extract_head=True,  # Get the metadata!
            ),
        )
        print(f"\n📚 Found {len(soup_urls)} Beautiful Soup articles:\n")

        # Show what we discovered
        for i, url in enumerate(soup_urls, 1):
            # Tolerant lookups: head extraction may yield nothing for a URL,
            # and direct indexing would raise KeyError in that case.
            head = url.get("head_data") or {}
            meta = head.get("meta") or {}
            title = head.get("title") or "Untitled"
            description = meta.get("description") or "No description"
            author = meta.get("author") or "Unknown"
            print(f"{i}. {title}")
            print(f" 📝 {description[:60]}...")
            print(f" 👤 By: {author}")
            print(f" 🔗 {url['url']}\n")
# 🔍 Example 3: Smart search with BM25 relevance scoring
async def smart_search_with_bm25():
    """Rank Beautiful Soup URLs against a text query and show the top three.

    Queries the realpython.com sitemap for ``*beautiful-soup*`` URLs with
    head extraction enabled, scores them against the query
    ``"web scraping tutorial quiz"`` using ``scoring_method="bm25"`` with
    ``score_threshold=0.2``, and prints up to three results with their
    relevance score, title and (truncated) URL.

    Returns:
        None. Everything is printed to stdout.
    """
    async with AsyncUrlSeeder(logger=AsyncLogger()) as seeder:
        print("🧠 Smart search: 'web scraping tutorial quiz'")
        # Search with BM25 scoring: results are scored against the query text
        results = await seeder.urls(
            "realpython.com",
            SeedingConfig(
                source="sitemap",
                pattern="*beautiful-soup*",
                extract_head=True,
                query="web scraping tutorial quiz",  # Our search
                scoring_method="bm25",
                score_threshold=0.2,  # Quality filter
            ),
        )

        # Slice first so the "Top N" heading matches the number of rows
        # actually printed (it used to report len(results) but show only 3).
        top = results[:3]
        print(f"\n🎯 Top {len(top)} most relevant results:\n")

        # Show ranked results with relevance scores
        for i, result in enumerate(top, 1):
            # Tolerate a missing head_data/title instead of raising KeyError.
            title = (result.get("head_data") or {}).get("title") or "Untitled"
            print(f"{i}. [{result['relevance_score']:.2f}] {title}")
            print(f" 🔗 {result['url'][:60]}...")
        print("\n✨ BM25 automatically ranked by relevance!")
# 🎬 Run the show!
async def main():
    """Run the three demos in order, printing a 60-character rule between them."""
    rule = "=" * 60
    print(rule)
    await discover_and_crawl()
    # The remaining demos are each preceded by a blank-padded separator.
    for demo in (explore_beautifulsoup, smart_search_with_bm25):
        print("\n" + rule + "\n")
        await demo()
if __name__ == "__main__":
    # Start the event loop only when executed as a script, not on import.
    asyncio.run(main())