docs(linkedin, url_seeder): update and reorganize LinkedIn data discovery and URL seeder documentation

This commit introduces significant updates to the LinkedIn data discovery documentation by adding two new Jupyter notebooks that provide detailed insights into data discovery processes. The previous workshop notebook has been removed to streamline the content and avoid redundancy. Additionally, the URL seeder documentation has been expanded with a new tutorial and several enhancements to existing scripts, improving usability and clarity.

The changes include:
- Added two new Jupyter notebooks for comprehensive LinkedIn data discovery.
- Removed the previous workshop notebook to eliminate outdated content.
- Updated an existing file (name not preserved in this text) to reflect new data visualization requirements.
- Introduced two new files (names not preserved in this text), including the new URL seeder tutorial, to facilitate easier access to URL seeding techniques.
- Enhanced existing Python scripts and markdown files in the URL seeder section for better documentation and examples.

These changes aim to improve the overall documentation quality and user experience for developers working with LinkedIn data and URL seeding techniques.
This commit is contained in:
UncleCode
2025-06-05 15:06:25 +08:00
parent b5c2732f88
commit c6fc5c0518
11 changed files with 9744 additions and 1464 deletions

View File

@@ -23,6 +23,8 @@ Requirements:
Usage:
- Run normally: python bbc_sport_research_assistant.py
- Run test mode: python bbc_sport_research_assistant.py test
Note: AsyncUrlSeeder now uses context manager for automatic cleanup.
"""
import asyncio
@@ -269,44 +271,43 @@ async def discover_urls(domain: str, query: str, config: ResearchConfig) -> List
console.print(f"\n[cyan]🔍 Discovering URLs from {domain}...[/cyan]")
# Initialize URL seeder
seeder = AsyncUrlSeeder(logger=AsyncLogger(verbose=config.verbose))
# Configure seeding
seeding_config = SeedingConfig(
source="sitemap+cc", # Use both sitemap and Common Crawl
extract_head=config.extract_head_metadata,
query=query,
scoring_method=config.scoring_method,
score_threshold=config.score_threshold,
max_urls=config.max_urls_discovery,
live_check=config.live_check,
force=config.force_refresh
)
try:
# Discover URLs
urls = await seeder.urls(domain, seeding_config)
# Sort by relevance score (descending)
sorted_urls = sorted(
urls,
key=lambda x: x.get('relevance_score', 0),
reverse=True
# Initialize URL seeder with context manager
async with AsyncUrlSeeder(logger=AsyncLogger(verbose=config.verbose)) as seeder:
# Configure seeding
seeding_config = SeedingConfig(
source="sitemap+cc", # Use both sitemap and Common Crawl
extract_head=config.extract_head_metadata,
query=query,
scoring_method=config.scoring_method,
score_threshold=config.score_threshold,
max_urls=config.max_urls_discovery,
live_check=config.live_check,
force=config.force_refresh
)
# Take top K
top_urls = sorted_urls[:config.top_k_urls]
console.print(f"[green]✅ Discovered {len(urls)} URLs, selected top {len(top_urls)}[/green]")
# Cache the result
save_to_cache(cache_key, top_urls)
return top_urls
except Exception as e:
console.print(f"[red]❌ URL discovery failed: {e}[/red]")
return []
try:
# Discover URLs
urls = await seeder.urls(domain, seeding_config)
# Sort by relevance score (descending)
sorted_urls = sorted(
urls,
key=lambda x: x.get('relevance_score', 0),
reverse=True
)
# Take top K
top_urls = sorted_urls[:config.top_k_urls]
console.print(f"[green]✅ Discovered {len(urls)} URLs, selected top {len(top_urls)}[/green]")
# Cache the result
save_to_cache(cache_key, top_urls)
return top_urls
except Exception as e:
console.print(f"[red]❌ URL discovery failed: {e}[/red]")
return []
async def crawl_selected_urls(urls: List[str], query: str, config: ResearchConfig) -> List[Dict]: