docs(linkdin, url_seeder): update and reorganize LinkedIn data discovery and URL seeder documentation

This commit significantly updates the LinkedIn data discovery documentation by adding two new Jupyter notebooks that walk through the data discovery process in detail. The previous workshop notebook has been removed to streamline the content and avoid redundancy. In addition, the URL seeder documentation has been expanded with a new tutorial and several enhancements to existing scripts, improving usability and clarity.

The changes include:
- Added two new Jupyter notebooks (file names lost from this message) for comprehensive LinkedIn data discovery.
- Removed the previous workshop notebook (file name lost) to eliminate outdated content.
- Updated a file (name lost from this message) to reflect new data visualization requirements.
- Introduced two new files (names lost from this message), including the new tutorial, to facilitate easier access to URL seeding techniques.
- Enhanced existing Python scripts and markdown files in the URL seeder section for better documentation and examples.

These changes aim to improve the overall documentation quality and the experience of developers working with LinkedIn data and URL seeding techniques.
2025-06-05 15:06:25 +08:00
parent b5c2732f88
commit c6fc5c0518
11 changed files with 9744 additions and 1464 deletions
--- a/docs/examples/url_seeder/url_seeder_demo.py
+++ b/docs/examples/url_seeder/url_seeder_demo.py
@@ -7,6 +7,9 @@ This demo shows:
 3. Live URL validation and metadata extraction
 4. BM25 relevance scoring for intelligent filtering
 5. Integration with AsyncWebCrawler for the complete pipeline
+6. Multi-domain discovery across multiple sites
+
+Note: AsyncUrlSeeder now supports the async context manager protocol (`async with`) for automatic cleanup.
 """

 import asyncio
@@ -217,44 +220,43 @@ async def section_6_multi_domain(seed: AsyncUrlSeeder):


 async def main():
-    seed = AsyncUrlSeeder()
-    
-    # Interactive menu
-    sections = {
-        "1": ("Basic URL Discovery", section_1_basic_exploration),
-        "2": ("Cache Management Demo", section_2_cache_demo),
-        "3": ("Live Check & Metadata Extraction", section_3_live_head),
-        "4": ("BM25 Relevance Scoring", section_4_bm25_scoring),
-        "5": ("Complete Pipeline (Discover → Filter → Crawl)", section_5_keyword_filter_to_agent),
-        "6": ("Multi-Domain Discovery", section_6_multi_domain),
-        "7": ("Run All Demos", None)
-    }
-    
-    console.print("\n[bold]Available Demos:[/bold]")
-    for key, (title, _) in sections.items():
-        console.print(f"  {key}. {title}")
-    
-    choice = Prompt.ask("\n[cyan]Which demo would you like to run?[/cyan]", 
-                       choices=list(sections.keys()), 
-                       default="7")
-    
-    console.print()
-    
-    if choice == "7":
-        # Run all demos
-        for key, (title, func) in sections.items():
-            if key != "7" and func:
-                await func(seed)
-                if key != "6":  # Don't pause after the last demo
-                    if not Confirm.ask("\n[yellow]Continue to next demo?[/yellow]", default=True):
-                        break
-                    console.print()
-    else:
-        # Run selected demo
-        _, func = sections[choice]
-        await func(seed)
-    
-    console.rule("[bold green]Demo Complete ✔︎")
+    async with AsyncUrlSeeder() as seed:
+        # Interactive menu
+        sections = {
+            "1": ("Basic URL Discovery", section_1_basic_exploration),
+            "2": ("Cache Management Demo", section_2_cache_demo),
+            "3": ("Live Check & Metadata Extraction", section_3_live_head),
+            "4": ("BM25 Relevance Scoring", section_4_bm25_scoring),
+            "5": ("Complete Pipeline (Discover → Filter → Crawl)", section_5_keyword_filter_to_agent),
+            "6": ("Multi-Domain Discovery", section_6_multi_domain),
+            "7": ("Run All Demos", None)
+        }
+        
+        console.print("\n[bold]Available Demos:[/bold]")
+        for key, (title, _) in sections.items():
+            console.print(f"  {key}. {title}")
+        
+        choice = Prompt.ask("\n[cyan]Which demo would you like to run?[/cyan]", 
+                           choices=list(sections.keys()), 
+                           default="7")
+        
+        console.print()
+        
+        if choice == "7":
+            # Run all demos
+            for key, (title, func) in sections.items():
+                if key != "7" and func:
+                    await func(seed)
+                    if key != "6":  # Don't pause after the last demo
+                        if not Confirm.ask("\n[yellow]Continue to next demo?[/yellow]", default=True):
+                            break
+                        console.print()
+        else:
+            # Run selected demo
+            _, func = sections[choice]
+            await func(seed)
+        
+        console.rule("[bold green]Demo Complete ✔︎")


 if __name__ == "__main__":