#!/usr/bin/env python3 """ Demo: How users will call the Seed endpoint This shows practical examples of how developers would use the seed endpoint in their applications to discover URLs for crawling. """ import asyncio from typing import Any, Dict import aiohttp # Configuration API_BASE_URL = "http://localhost:11235" API_TOKEN = None # Set if your API requires authentication class SeedEndpointDemo: def __init__(self, base_url: str = API_BASE_URL, token: str = None): self.base_url = base_url self.headers = {"Content-Type": "application/json"} if token: self.headers["Authorization"] = f"Bearer {token}" async def call_seed_endpoint( self, url: str, max_urls: int = 20, filter_type: str = "all", **kwargs ) -> Dict[str, Any]: """Make a call to the seed endpoint""" # The seed endpoint expects 'url' and config with other parameters config = { "max_urls": max_urls, "filter_type": filter_type, **kwargs, } payload = { "url": url, "config": config, } async with aiohttp.ClientSession() as session: async with session.post( f"{self.base_url}/seed", headers=self.headers, json=payload ) as response: if response.status == 200: result = await response.json() # Extract the nested seeded_urls from the response seed_data = result.get('seed_url', {}) if isinstance(seed_data, dict): return seed_data else: return {'seeded_urls': seed_data or [], 'count': len(seed_data or [])} else: error_text = await response.text() raise Exception(f"API Error {response.status}: {error_text}") async def demo_news_site_seeding(self): """Demo: Seed URLs from a news website""" print("๐Ÿ—ž๏ธ Demo: Seeding URLs from a News Website") print("=" * 50) try: result = await self.call_seed_endpoint( url="https://techcrunch.com", max_urls=15, source="sitemap", # Try sitemap first live_check=True, ) urls_found = len(result.get('seeded_urls', [])) print(f"โœ… Found {urls_found} URLs") if 'message' in result: print(f"โ„น๏ธ Server message: {result['message']}") processing_time = result.get('processing_time', 'N/A') print(f"๐Ÿ“Š Seed completed in: {processing_time} seconds") # Show first 5 URLs as example seeded_urls = result.get("seeded_urls", []) for i, url in enumerate(seeded_urls[:5]): print(f" {i + 1}. {url}") if len(seeded_urls) > 5: print(f" ... and {len(seeded_urls) - 5} more URLs") elif len(seeded_urls) == 0: print(" ๐Ÿ’ก Note: No URLs found. This could be because:") print(" - The website doesn't have an accessible sitemap") print(" - The seeding configuration needs adjustment") print(" - Try different source options like 'cc' (Common Crawl)") except Exception as e: print(f"โŒ Error: {e}") print(" ๐Ÿ’ก This might be a connectivity issue or server problem") async def demo_ecommerce_seeding(self): """Demo: Seed product URLs from an e-commerce site""" print("\n๐Ÿ›’ Demo: Seeding Product URLs from E-commerce") print("=" * 50) print("๐Ÿ’ก Note: This demonstrates configuration for e-commerce sites") try: result = await self.call_seed_endpoint( url="https://example-shop.com", max_urls=25, source="sitemap+cc", pattern="*/product/*", # Focus on product pages live_check=False, ) urls_found = len(result.get('seeded_urls', [])) print(f"โœ… Found {urls_found} product URLs") if 'message' in result: print(f"โ„น๏ธ Server message: {result['message']}") # Show examples if any found seeded_urls = result.get("seeded_urls", []) if seeded_urls: print("๐Ÿ“ฆ Product URLs discovered:") for i, url in enumerate(seeded_urls[:3]): print(f" {i + 1}. {url}") else: print("๐Ÿ’ก For real e-commerce seeding, you would:") print(" โ€ข Use actual e-commerce site URLs") print(" โ€ข Set patterns like '*/product/*' or '*/item/*'") print(" โ€ข Enable live_check to verify product page availability") print(" โ€ข Use appropriate max_urls based on catalog size") except Exception as e: print(f"โŒ Error: {e}") print(" This is expected for the example URL") async def demo_documentation_seeding(self): """Demo: Seed documentation pages""" print("\n๐Ÿ“š Demo: Seeding Documentation Pages") print("=" * 50) try: result = await self.call_seed_endpoint( url="https://docs.python.org", max_urls=30, source="sitemap", pattern="*/library/*", # Focus on library documentation live_check=False, ) urls_found = len(result.get('seeded_urls', [])) print(f"โœ… Found {urls_found} documentation URLs") if 'message' in result: print(f"โ„น๏ธ Server message: {result['message']}") # Analyze URL structure if URLs found seeded_urls = result.get("seeded_urls", []) if seeded_urls: sections = {"library": 0, "tutorial": 0, "reference": 0, "other": 0} for url in seeded_urls: if "/library/" in url: sections["library"] += 1 elif "/tutorial/" in url: sections["tutorial"] += 1 elif "/reference/" in url: sections["reference"] += 1 else: sections["other"] += 1 print("๐Ÿ“Š URL distribution:") for section, count in sections.items(): if count > 0: print(f" {section.title()}: {count} URLs") # Show examples print("\n๐Ÿ“– Example URLs:") for i, url in enumerate(seeded_urls[:3]): print(f" {i + 1}. {url}") else: print("๐Ÿ’ก For documentation seeding, you would typically:") print(" โ€ข Use sites with comprehensive sitemaps like docs.python.org") print(" โ€ข Set patterns to focus on specific sections ('/library/', '/tutorial/')") print(" โ€ข Consider using 'cc' source for broader coverage") except Exception as e: print(f"โŒ Error: {e}") async def demo_seeding_sources(self): """Demo: Different seeding sources available""" print("\n๏ฟฝ Demo: Understanding Seeding Sources") print("=" * 50) print("๐Ÿ“– Available seeding sources:") print(" โ€ข 'sitemap': Discovers URLs from website's sitemap.xml") print(" โ€ข 'cc': Uses Common Crawl database for URL discovery") print(" โ€ข 'sitemap+cc': Combines both sources (default)") print() test_url = "https://docs.python.org" sources = ["sitemap", "cc", "sitemap+cc"] for source in sources: print(f"๐Ÿงช Testing source: '{source}'") try: result = await self.call_seed_endpoint( url=test_url, max_urls=5, source=source, live_check=False, # Faster for demo ) urls_found = len(result.get('seeded_urls', [])) print(f" โœ… {source}: Found {urls_found} URLs") if urls_found > 0: # Show first URL as example first_url = result.get('seeded_urls', [])[0] print(f" Example: {first_url}") elif 'message' in result: print(f" Info: {result['message']}") except Exception as e: print(f" โŒ {source}: Error - {e}") print() # Space between tests async def demo_working_example(self): """Demo: A realistic working example""" print("\nโœจ Demo: Working Example with Live Seeding") print("=" * 50) print("๐ŸŽฏ Testing with a site that likely has good sitemap support...") try: # Use a site that's more likely to have a working sitemap result = await self.call_seed_endpoint( url="https://github.com", max_urls=10, source="sitemap", pattern="*/blog/*", # Focus on blog posts live_check=False, ) urls_found = len(result.get('seeded_urls', [])) print(f"โœ… Found {urls_found} URLs from GitHub") if urls_found > 0: print("๐ŸŽ‰ Success! Here are some discovered URLs:") for i, url in enumerate(result.get('seeded_urls', [])[:3]): print(f" {i + 1}. {url}") print() print("๐Ÿ’ก This demonstrates that seeding works when:") print(" โ€ข The target site has an accessible sitemap") print(" โ€ข The configuration matches available content") print(" โ€ข Network connectivity allows sitemap access") else: print("โ„น๏ธ No URLs found, but this is normal for demo purposes.") print("๐Ÿ’ก In real usage, you would:") print(" โ€ข Test with sites you know have sitemaps") print(" โ€ข Use appropriate URL patterns for your use case") print(" โ€ข Consider using 'cc' source for broader discovery") except Exception as e: print(f"โŒ Error: {e}") print("๐Ÿ’ก This might indicate:") print(" โ€ข Network connectivity issues") print(" โ€ข Server configuration problems") print(" โ€ข Need to adjust seeding parameters") async def main(): """Run all seed endpoint demos""" print("๐ŸŒฑ Crawl4AI Seed Endpoint - User Demo") print("=" * 60) print("This demo shows how developers use the seed endpoint") print("to discover URLs for their crawling workflows.\n") demo = SeedEndpointDemo() # Run individual demos await demo.demo_news_site_seeding() await demo.demo_ecommerce_seeding() await demo.demo_documentation_seeding() await demo.demo_seeding_sources() await demo.demo_working_example() print("\n๐ŸŽ‰ Demo completed!") print("\n๐Ÿ“š Key Takeaways:") print("1. Seed endpoint discovers URLs from sitemaps and Common Crawl") print("2. Different sources ('sitemap', 'cc', 'sitemap+cc') offer different coverage") print("3. URL patterns help filter discovered content to your needs") print("4. Live checking verifies URL accessibility but slows discovery") print("5. Success depends on target site's sitemap availability") print("\n๐Ÿ’ก Next steps for your application:") print("1. Test with your target websites to verify sitemap availability") print("2. Choose appropriate seeding sources for your use case") print("3. Use discovered URLs as input for your crawling pipeline") print("4. Consider fallback strategies if seeding returns few results") if __name__ == "__main__": asyncio.run(main())