Files
crawl4ai/tests/docker/extended_features/demo_seed_endpoint.py
AHMET YILMAZ 201843a204 Add comprehensive tests for anti-bot strategies and extended features
- Implemented `test_adapter_verification.py` to verify correct usage of browser adapters.
- Created `test_all_features.py` for a comprehensive suite covering URL seeding, adaptive crawling, browser adapters, proxy rotation, and dispatchers.
- Developed `test_anti_bot_strategy.py` to validate the functionality of various anti-bot strategies.
- Added `test_antibot_simple.py` for simple testing of anti-bot strategies using async web crawling.
- Introduced `test_bot_detection.py` to assess adapter performance against bot detection mechanisms.
- Compiled `test_final_summary.py` to provide a detailed summary of all tests and their results.
2025-10-07 18:51:13 +08:00

301 lines
12 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Demo: How users will call the Seed endpoint
This shows practical examples of how developers would use the seed endpoint
in their applications to discover URLs for crawling.
"""
import asyncio
from typing import Any, Dict, Optional

import aiohttp
# Configuration
API_BASE_URL = "http://localhost:11235"  # Default address of a locally running Crawl4AI server
API_TOKEN = None # Set if your API requires authentication
class SeedEndpointDemo:
    """Thin demo client for the Crawl4AI ``/seed`` endpoint.

    Each ``demo_*`` coroutine shows one realistic seeding scenario
    (news site, e-commerce, documentation, source comparison, live
    example) and prints its findings to stdout. Errors are caught and
    reported inside each demo so the full sequence always runs.
    """

    def __init__(self, base_url: str = API_BASE_URL, token: Optional[str] = None):
        """Remember the server address and prepare request headers.

        Args:
            base_url: Root URL of the Crawl4AI server.
            token: Optional bearer token; when provided, an
                ``Authorization`` header is attached to every request.
        """
        self.base_url = base_url
        self.headers = {"Content-Type": "application/json"}
        if token:
            self.headers["Authorization"] = f"Bearer {token}"

    async def call_seed_endpoint(
        self, url: str, max_urls: int = 20, filter_type: str = "all", **kwargs
    ) -> Dict[str, Any]:
        """POST to ``/seed`` and normalize the response shape.

        Args:
            url: Site whose URLs should be discovered.
            max_urls: Upper bound on the number of URLs to discover.
            filter_type: Server-side filter selector (e.g. ``"all"``).
            **kwargs: Extra config keys (``source``, ``pattern``,
                ``live_check``, ...) forwarded verbatim to the server.

        Returns:
            A dict that exposes at least ``seeded_urls`` (list of URLs),
            plus whatever metadata the server included.

        Raises:
            Exception: On any non-200 HTTP status, carrying the status
                code and response body text.
        """
        # The seed endpoint expects 'url' at the top level and every
        # other parameter nested under 'config'.
        config = {
            "max_urls": max_urls,
            "filter_type": filter_type,
            **kwargs,
        }
        payload = {
            "url": url,
            "config": config,
        }
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{self.base_url}/seed", headers=self.headers, json=payload
            ) as response:
                if response.status == 200:
                    result = await response.json()
                    # The server nests its results under 'seed_url'; that value
                    # may be a metadata dict or a bare list of URLs.
                    seed_data = result.get('seed_url', {})
                    if isinstance(seed_data, dict):
                        return seed_data
                    # Normalize a bare list into the dict shape callers expect.
                    return {'seeded_urls': seed_data or [], 'count': len(seed_data or [])}
                error_text = await response.text()
                raise Exception(f"API Error {response.status}: {error_text}")

    async def demo_news_site_seeding(self):
        """Demo: Seed URLs from a news website"""
        print("🗞️ Demo: Seeding URLs from a News Website")
        print("=" * 50)
        try:
            result = await self.call_seed_endpoint(
                url="https://techcrunch.com",
                max_urls=15,
                source="sitemap",  # Try sitemap first
                live_check=True,
            )
            seeded_urls = result.get("seeded_urls", [])
            urls_found = len(seeded_urls)
            print(f"✅ Found {urls_found} URLs")
            if 'message' in result:
                print(f" Server message: {result['message']}")
            processing_time = result.get('processing_time', 'N/A')
            print(f"📊 Seed completed in: {processing_time} seconds")
            # Show first 5 URLs as example
            for i, url in enumerate(seeded_urls[:5]):
                print(f" {i + 1}. {url}")
            if len(seeded_urls) > 5:
                print(f" ... and {len(seeded_urls) - 5} more URLs")
            elif not seeded_urls:
                print(" 💡 Note: No URLs found. This could be because:")
                print(" - The website doesn't have an accessible sitemap")
                print(" - The seeding configuration needs adjustment")
                print(" - Try different source options like 'cc' (Common Crawl)")
        except Exception as e:
            print(f"❌ Error: {e}")
            print(" 💡 This might be a connectivity issue or server problem")

    async def demo_ecommerce_seeding(self):
        """Demo: Seed product URLs from an e-commerce site"""
        print("\n🛒 Demo: Seeding Product URLs from E-commerce")
        print("=" * 50)
        print("💡 Note: This demonstrates configuration for e-commerce sites")
        try:
            result = await self.call_seed_endpoint(
                url="https://example-shop.com",
                max_urls=25,
                source="sitemap+cc",
                pattern="*/product/*",  # Focus on product pages
                live_check=False,
            )
            seeded_urls = result.get("seeded_urls", [])
            print(f"✅ Found {len(seeded_urls)} product URLs")
            if 'message' in result:
                print(f" Server message: {result['message']}")
            # Show examples if any found
            if seeded_urls:
                print("📦 Product URLs discovered:")
                for i, url in enumerate(seeded_urls[:3]):
                    print(f" {i + 1}. {url}")
            else:
                print("💡 For real e-commerce seeding, you would:")
                print(" • Use actual e-commerce site URLs")
                print(" • Set patterns like '*/product/*' or '*/item/*'")
                print(" • Enable live_check to verify product page availability")
                print(" • Use appropriate max_urls based on catalog size")
        except Exception as e:
            # Expected: the example domain is not a real shop.
            print(f"❌ Error: {e}")
            print(" This is expected for the example URL")

    async def demo_documentation_seeding(self):
        """Demo: Seed documentation pages"""
        print("\n📚 Demo: Seeding Documentation Pages")
        print("=" * 50)
        try:
            result = await self.call_seed_endpoint(
                url="https://docs.python.org",
                max_urls=30,
                source="sitemap",
                pattern="*/library/*",  # Focus on library documentation
                live_check=False,
            )
            seeded_urls = result.get("seeded_urls", [])
            print(f"✅ Found {len(seeded_urls)} documentation URLs")
            if 'message' in result:
                print(f" Server message: {result['message']}")
            # Analyze URL structure if URLs found
            if seeded_urls:
                # Bucket each URL by the docs section its path mentions.
                sections = {"library": 0, "tutorial": 0, "reference": 0, "other": 0}
                for url in seeded_urls:
                    if "/library/" in url:
                        sections["library"] += 1
                    elif "/tutorial/" in url:
                        sections["tutorial"] += 1
                    elif "/reference/" in url:
                        sections["reference"] += 1
                    else:
                        sections["other"] += 1
                print("📊 URL distribution:")
                for section, count in sections.items():
                    if count > 0:
                        print(f" {section.title()}: {count} URLs")
                # Show examples
                print("\n📖 Example URLs:")
                for i, url in enumerate(seeded_urls[:3]):
                    print(f" {i + 1}. {url}")
            else:
                print("💡 For documentation seeding, you would typically:")
                print(" • Use sites with comprehensive sitemaps like docs.python.org")
                print(" • Set patterns to focus on specific sections ('/library/', '/tutorial/')")
                print(" • Consider using 'cc' source for broader coverage")
        except Exception as e:
            print(f"❌ Error: {e}")

    async def demo_seeding_sources(self):
        """Demo: Different seeding sources available"""
        # NOTE: the original header string contained a mojibake replacement
        # character where an emoji belonged; repaired here.
        print("\n🔍 Demo: Understanding Seeding Sources")
        print("=" * 50)
        print("📖 Available seeding sources:")
        print("'sitemap': Discovers URLs from website's sitemap.xml")
        print("'cc': Uses Common Crawl database for URL discovery")
        print("'sitemap+cc': Combines both sources (default)")
        print()
        test_url = "https://docs.python.org"
        sources = ["sitemap", "cc", "sitemap+cc"]
        for source in sources:
            print(f"🧪 Testing source: '{source}'")
            try:
                result = await self.call_seed_endpoint(
                    url=test_url,
                    max_urls=5,
                    source=source,
                    live_check=False,  # Faster for demo
                )
                urls_found = len(result.get('seeded_urls', []))
                print(f"{source}: Found {urls_found} URLs")
                if urls_found > 0:
                    # Show first URL as example
                    first_url = result.get('seeded_urls', [])[0]
                    print(f" Example: {first_url}")
                elif 'message' in result:
                    print(f" Info: {result['message']}")
            except Exception as e:
                print(f"{source}: Error - {e}")
            print()  # Space between tests

    async def demo_working_example(self):
        """Demo: A realistic working example"""
        print("\n✨ Demo: Working Example with Live Seeding")
        print("=" * 50)
        print("🎯 Testing with a site that likely has good sitemap support...")
        try:
            # Use a site that's more likely to have a working sitemap
            result = await self.call_seed_endpoint(
                url="https://github.com",
                max_urls=10,
                source="sitemap",
                pattern="*/blog/*",  # Focus on blog posts
                live_check=False,
            )
            seeded_urls = result.get('seeded_urls', [])
            print(f"✅ Found {len(seeded_urls)} URLs from GitHub")
            if seeded_urls:
                print("🎉 Success! Here are some discovered URLs:")
                for i, url in enumerate(seeded_urls[:3]):
                    print(f" {i + 1}. {url}")
                print()
                print("💡 This demonstrates that seeding works when:")
                print(" • The target site has an accessible sitemap")
                print(" • The configuration matches available content")
                print(" • Network connectivity allows sitemap access")
            else:
                print(" No URLs found, but this is normal for demo purposes.")
                print("💡 In real usage, you would:")
                print(" • Test with sites you know have sitemaps")
                print(" • Use appropriate URL patterns for your use case")
                print(" • Consider using 'cc' source for broader discovery")
        except Exception as e:
            print(f"❌ Error: {e}")
            print("💡 This might indicate:")
            print(" • Network connectivity issues")
            print(" • Server configuration problems")
            print(" • Need to adjust seeding parameters")
async def main():
    """Run every seed-endpoint demo scenario in sequence, then print a summary."""
    print("🌱 Crawl4AI Seed Endpoint - User Demo")
    print("=" * 60)
    print("This demo shows how developers use the seed endpoint")
    print("to discover URLs for their crawling workflows.\n")

    demo = SeedEndpointDemo()
    # Each scenario prints its own section; run them one after another.
    scenarios = (
        demo.demo_news_site_seeding,
        demo.demo_ecommerce_seeding,
        demo.demo_documentation_seeding,
        demo.demo_seeding_sources,
        demo.demo_working_example,
    )
    for scenario in scenarios:
        await scenario()

    print("\n🎉 Demo completed!")
    print("\n📚 Key Takeaways:")
    takeaways = (
        "Seed endpoint discovers URLs from sitemaps and Common Crawl",
        "Different sources ('sitemap', 'cc', 'sitemap+cc') offer different coverage",
        "URL patterns help filter discovered content to your needs",
        "Live checking verifies URL accessibility but slows discovery",
        "Success depends on target site's sitemap availability",
    )
    for number, point in enumerate(takeaways, start=1):
        print(f"{number}. {point}")

    print("\n💡 Next steps for your application:")
    next_steps = (
        "Test with your target websites to verify sitemap availability",
        "Choose appropriate seeding sources for your use case",
        "Use discovered URLs as input for your crawling pipeline",
        "Consider fallback strategies if seeding returns few results",
    )
    for number, step in enumerate(next_steps, start=1):
        print(f"{number}. {step}")


if __name__ == "__main__":
    asyncio.run(main())