docs(linkedin, url_seeder): update and reorganize LinkedIn data discovery and URL seeder documentation

This commit updates the LinkedIn data discovery documentation by adding two new Jupyter notebooks that walk through the data discovery process in detail. The previous workshop notebook has been removed to streamline the content and avoid redundancy. Additionally, the URL seeder documentation has been expanded with a new tutorial and several enhancements to existing scripts, improving usability and clarity.

The changes include:
- Added two new Jupyter notebooks for comprehensive LinkedIn data discovery.
- Removed the previous workshop notebook to eliminate outdated content.
- Updated [file name missing from this message] to reflect new data visualization requirements.
- Introduced new URL seeder tutorial files [names missing from this message] to facilitate easier access to URL seeding techniques.
- Enhanced existing Python scripts and markdown files in the URL seeder section for better documentation and examples.

These changes aim to improve the overall documentation quality and user experience for developers working with LinkedIn data and URL seeding techniques.
2025-06-05 15:06:25 +08:00
parent b5c2732f88
commit c6fc5c0518
11 changed files with 9744 additions and 1464 deletions
--- a/docs/examples/url_seeder/bbc_sport_research_assistant.py
+++ b/docs/examples/url_seeder/bbc_sport_research_assistant.py
@@ -23,6 +23,8 @@ Requirements:
 Usage:
 - Run normally: python bbc_sport_research_assistant.py
 - Run test mode: python bbc_sport_research_assistant.py test
+
+Note: AsyncUrlSeeder now uses context manager for automatic cleanup.
 """

 import asyncio
@@ -269,44 +271,43 @@ async def discover_urls(domain: str, query: str, config: ResearchConfig) -> List
    
    console.print(f"\n[cyan]🔍 Discovering URLs from {domain}...[/cyan]")
    
-    # Initialize URL seeder
-    seeder = AsyncUrlSeeder(logger=AsyncLogger(verbose=config.verbose))
-    
-    # Configure seeding
-    seeding_config = SeedingConfig(
-        source="sitemap+cc",  # Use both sitemap and Common Crawl
-        extract_head=config.extract_head_metadata,
-        query=query,
-        scoring_method=config.scoring_method,
-        score_threshold=config.score_threshold,
-        max_urls=config.max_urls_discovery,
-        live_check=config.live_check,
-        force=config.force_refresh
-    )
-    
-    try:
-        # Discover URLs
-        urls = await seeder.urls(domain, seeding_config)
-        
-        # Sort by relevance score (descending)
-        sorted_urls = sorted(
-            urls, 
-            key=lambda x: x.get('relevance_score', 0), 
-            reverse=True
+    # Initialize URL seeder with context manager
+    async with AsyncUrlSeeder(logger=AsyncLogger(verbose=config.verbose)) as seeder:
+        # Configure seeding
+        seeding_config = SeedingConfig(
+            source="sitemap+cc",  # Use both sitemap and Common Crawl
+            extract_head=config.extract_head_metadata,
+            query=query,
+            scoring_method=config.scoring_method,
+            score_threshold=config.score_threshold,
+            max_urls=config.max_urls_discovery,
+            live_check=config.live_check,
+            force=config.force_refresh
        )
        
-        # Take top K
-        top_urls = sorted_urls[:config.top_k_urls]
-        
-        console.print(f"[green]✅ Discovered {len(urls)} URLs, selected top {len(top_urls)}[/green]")
-        
-        # Cache the result
-        save_to_cache(cache_key, top_urls)
-        return top_urls
-        
-    except Exception as e:
-        console.print(f"[red]❌ URL discovery failed: {e}[/red]")
-        return []
+        try:
+            # Discover URLs
+            urls = await seeder.urls(domain, seeding_config)
+            
+            # Sort by relevance score (descending)
+            sorted_urls = sorted(
+                urls, 
+                key=lambda x: x.get('relevance_score', 0), 
+                reverse=True
+            )
+            
+            # Take top K
+            top_urls = sorted_urls[:config.top_k_urls]
+            
+            console.print(f"[green]✅ Discovered {len(urls)} URLs, selected top {len(top_urls)}[/green]")
+            
+            # Cache the result
+            save_to_cache(cache_key, top_urls)
+            return top_urls
+            
+        except Exception as e:
+            console.print(f"[red]❌ URL discovery failed: {e}[/red]")
+            return []


 async def crawl_selected_urls(urls: List[str], query: str, config: ResearchConfig) -> List[Dict]: