Files
crawl4ai/tests/docker/extended_features/demo_adaptive_endpoint.py
AHMET YILMAZ 201843a204 Add comprehensive tests for anti-bot strategies and extended features
- Implemented `test_adapter_verification.py` to verify correct usage of browser adapters.
- Created `test_all_features.py` for a comprehensive suite covering URL seeding, adaptive crawling, browser adapters, proxy rotation, and dispatchers.
- Developed `test_anti_bot_strategy.py` to validate the functionality of various anti-bot strategies.
- Added `test_antibot_simple.py` for simple testing of anti-bot strategies using async web crawling.
- Introduced `test_bot_detection.py` to assess adapter performance against bot detection mechanisms.
- Compiled `test_final_summary.py` to provide a detailed summary of all tests and their results.
2025-10-07 18:51:13 +08:00


#!/usr/bin/env python3
"""
Demo: How users will call the Adaptive Digest endpoint
This shows practical examples of how developers would use the adaptive crawling
feature to intelligently gather relevant content based on queries.
"""
import asyncio
import time
from typing import Any, Dict, Optional
import aiohttp
# Configuration
API_BASE_URL = "http://localhost:11235"
API_TOKEN = None # Set if your API requires authentication
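
# The `config` dictionaries passed in the demos below (e.g. max_depth,
# confidence_threshold, max_pages, content_filters) are illustrative; they are
# forwarded as-is in the request body, and which keys the server actually
# honors depends on your deployment.
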
class AdaptiveEndpointDemo:
    def __init__(self, base_url: str = API_BASE_URL, token: Optional[str] = None):
        self.base_url = base_url
        self.headers = {"Content-Type": "application/json"}
        if token:
            self.headers["Authorization"] = f"Bearer {token}"

    async def submit_adaptive_job(
        self, start_url: str, query: str, config: Optional[Dict] = None
    ) -> str:
        """Submit an adaptive crawling job and return task ID"""
        payload = {"start_url": start_url, "query": query}
        if config:
            payload["config"] = config
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{self.base_url}/adaptive/digest/job",
                headers=self.headers,
                json=payload,
            ) as response:
                if response.status == 202:  # Accepted
                    result = await response.json()
                    return result["task_id"]
                else:
                    error_text = await response.text()
                    raise Exception(f"API Error {response.status}: {error_text}")

    async def check_job_status(self, task_id: str) -> Dict[str, Any]:
        """Check the status of an adaptive crawling job"""
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"{self.base_url}/adaptive/digest/job/{task_id}", headers=self.headers
            ) as response:
                if response.status == 200:
                    return await response.json()
                else:
                    error_text = await response.text()
                    raise Exception(f"API Error {response.status}: {error_text}")

    async def wait_for_completion(
        self, task_id: str, max_wait: int = 300
    ) -> Dict[str, Any]:
        """Poll job status until completion or timeout"""
        start_time = time.time()
        while time.time() - start_time < max_wait:
            status = await self.check_job_status(task_id)
            if status["status"] == "COMPLETED":
                return status
            elif status["status"] == "FAILED":
                raise Exception(f"Job failed: {status.get('error', 'Unknown error')}")
            print(
                f"⏳ Job {status['status']}... (elapsed: {int(time.time() - start_time)}s)"
            )
            await asyncio.sleep(3)  # Poll every 3 seconds
        raise Exception(f"Job timed out after {max_wait} seconds")

    async def demo_research_assistant(self):
        """Demo: Research assistant for academic papers"""
        print("🔬 Demo: Academic Research Assistant")
        print("=" * 50)
        try:
            print("🚀 Submitting job: Find research on 'machine learning optimization'")
            task_id = await self.submit_adaptive_job(
                start_url="https://arxiv.org",
                query="machine learning optimization techniques recent papers",
                config={
                    "max_depth": 3,
                    "confidence_threshold": 0.7,
                    "max_pages": 20,
                    "content_filters": ["academic", "research"],
                },
            )
            print(f"📋 Job submitted with ID: {task_id}")
            # Wait for completion
            result = await self.wait_for_completion(task_id)
            print("✅ Research completed!")
            print(f"🎯 Confidence score: {result['result']['confidence']:.2f}")
            print(f"📊 Coverage stats: {result['result']['coverage_stats']}")
            # Show relevant content found
            relevant_content = result["result"]["relevant_content"]
            print(f"\n📚 Found {len(relevant_content)} relevant research papers:")
            for i, content in enumerate(relevant_content[:3], 1):
                title = content.get("title", "Untitled")[:60]
                relevance = content.get("relevance_score", 0)
                print(f" {i}. {title}... (relevance: {relevance:.2f})")
        except Exception as e:
            print(f"❌ Error: {e}")

    async def demo_market_intelligence(self):
        """Demo: Market intelligence gathering"""
        print("\n💼 Demo: Market Intelligence Gathering")
        print("=" * 50)
        try:
            print("🚀 Submitting job: Analyze competitors in 'sustainable packaging'")
            task_id = await self.submit_adaptive_job(
                start_url="https://packagingeurope.com",
                query="sustainable packaging solutions eco-friendly materials competitors market trends",
                config={
                    "max_depth": 4,
                    "confidence_threshold": 0.6,
                    "max_pages": 30,
                    "content_filters": ["business", "industry"],
                    "follow_external_links": True,
                },
            )
            print(f"📋 Job submitted with ID: {task_id}")
            # Wait for completion
            result = await self.wait_for_completion(task_id)
            print("✅ Market analysis completed!")
            print(f"🎯 Intelligence confidence: {result['result']['confidence']:.2f}")
            # Analyze findings
            relevant_content = result["result"]["relevant_content"]
            print(
                f"\n📈 Market intelligence gathered from {len(relevant_content)} sources:"
            )
            companies = set()
            trends = []
            for content in relevant_content:
                # Extract company mentions (simplified)
                text = content.get("content", "")
                if any(
                    word in text.lower()
                    for word in ["company", "corporation", "inc", "ltd"]
                ):
                    # This would be more sophisticated in a real implementation
                    companies.add(content.get("source_url", "Unknown"))
                # Extract trend keywords
                if any(
                    word in text.lower() for word in ["trend", "innovation", "future"]
                ):
                    trends.append(content.get("title", "Trend"))
            print(f"🏢 Companies analyzed: {len(companies)}")
            print(f"📊 Trends identified: {len(trends)}")
        except Exception as e:
            print(f"❌ Error: {e}")

    async def demo_content_curation(self):
        """Demo: Content curation for newsletter"""
        print("\n📰 Demo: Content Curation for Tech Newsletter")
        print("=" * 50)
        try:
            print("🚀 Submitting job: Curate content about 'AI developments this week'")
            task_id = await self.submit_adaptive_job(
                start_url="https://techcrunch.com",
                query="artificial intelligence AI developments news this week recent advances",
                config={
                    "max_depth": 2,
                    "confidence_threshold": 0.8,
                    "max_pages": 25,
                    "content_filters": ["news", "recent"],
                    "date_range": "last_7_days",
                },
            )
            print(f"📋 Job submitted with ID: {task_id}")
            # Wait for completion
            result = await self.wait_for_completion(task_id)
            print("✅ Content curation completed!")
            print(f"🎯 Curation confidence: {result['result']['confidence']:.2f}")
            # Process curated content
            relevant_content = result["result"]["relevant_content"]
            print(f"\n📮 Curated {len(relevant_content)} articles for your newsletter:")
            # Group by category/topic
            categories = {
                "AI Research": [],
                "Industry News": [],
                "Product Launches": [],
                "Other": [],
            }
            for content in relevant_content:
                title = content.get("title", "Untitled")
                if any(
                    word in title.lower() for word in ["research", "study", "paper"]
                ):
                    categories["AI Research"].append(content)
                elif any(
                    word in title.lower() for word in ["company", "startup", "funding"]
                ):
                    categories["Industry News"].append(content)
                elif any(
                    word in title.lower() for word in ["launch", "release", "unveil"]
                ):
                    categories["Product Launches"].append(content)
                else:
                    categories["Other"].append(content)
            for category, articles in categories.items():
                if articles:
                    print(f"\n📂 {category} ({len(articles)} articles):")
                    for article in articles[:2]:  # Show top 2 per category
                        title = article.get("title", "Untitled")[:50]
                        print(f"{title}...")
        except Exception as e:
            print(f"❌ Error: {e}")

    async def demo_product_research(self):
        """Demo: Product research and comparison"""
        print("\n🛍️ Demo: Product Research & Comparison")
        print("=" * 50)
        try:
            print("🚀 Submitting job: Research 'best wireless headphones 2024'")
            task_id = await self.submit_adaptive_job(
                start_url="https://www.cnet.com",
                query="best wireless headphones 2024 reviews comparison features price",
                config={
                    "max_depth": 3,
                    "confidence_threshold": 0.75,
                    "max_pages": 20,
                    "content_filters": ["review", "comparison"],
                    "extract_structured_data": True,
                },
            )
            print(f"📋 Job submitted with ID: {task_id}")
            # Wait for completion
            result = await self.wait_for_completion(task_id)
            print("✅ Product research completed!")
            print(f"🎯 Research confidence: {result['result']['confidence']:.2f}")
            # Analyze product data
            relevant_content = result["result"]["relevant_content"]
            print(
                f"\n🎧 Product research summary from {len(relevant_content)} sources:"
            )
            # Extract product mentions (simplified example)
            products = {}
            for content in relevant_content:
                text = content.get("content", "").lower()
                # Look for common headphone brands
                brands = [
                    "sony",
                    "bose",
                    "apple",
                    "sennheiser",
                    "jabra",
                    "audio-technica",
                ]
                for brand in brands:
                    if brand in text:
                        if brand not in products:
                            products[brand] = 0
                        products[brand] += 1
            print("🏷️ Product mentions:")
            for product, mentions in sorted(
                products.items(), key=lambda x: x[1], reverse=True
            )[:5]:
                print(f" {product.title()}: {mentions} mentions")
        except Exception as e:
            print(f"❌ Error: {e}")

    async def demo_monitoring_pipeline(self):
        """Demo: Set up a monitoring pipeline for ongoing content tracking"""
        print("\n📡 Demo: Content Monitoring Pipeline")
        print("=" * 50)
        monitoring_queries = [
            {
                "name": "Brand Mentions",
                "start_url": "https://news.google.com",
                "query": "YourBrand company news mentions",
                "priority": "high",
            },
            {
                "name": "Industry Trends",
                "start_url": "https://techcrunch.com",
                "query": "SaaS industry trends 2024",
                "priority": "medium",
            },
            {
                "name": "Competitor Activity",
                "start_url": "https://crunchbase.com",
                "query": "competitor funding announcements product launches",
                "priority": "high",
            },
        ]
        print("🚀 Starting monitoring pipeline with 3 queries...")
        jobs = {}
        # Submit all monitoring jobs
        for query_config in monitoring_queries:
            print(f"\n📋 Submitting: {query_config['name']}")
            try:
                task_id = await self.submit_adaptive_job(
                    start_url=query_config["start_url"],
                    query=query_config["query"],
                    config={
                        "max_depth": 2,
                        "confidence_threshold": 0.6,
                        "max_pages": 15,
                    },
                )
                jobs[query_config["name"]] = {
                    "task_id": task_id,
                    "priority": query_config["priority"],
                    "status": "submitted",
                }
                print(f" ✅ Job ID: {task_id}")
            except Exception as e:
                print(f" ❌ Failed: {e}")
        # Monitor all jobs
        print(f"\n⏳ Monitoring {len(jobs)} jobs...")
        completed_jobs = {}
        max_wait = 180  # 3 minutes total
        start_time = time.time()
        while jobs and (time.time() - start_time) < max_wait:
            for name, job_info in list(jobs.items()):
                try:
                    status = await self.check_job_status(job_info["task_id"])
                    if status["status"] == "COMPLETED":
                        completed_jobs[name] = status
                        del jobs[name]
                        print(f"{name} completed")
                    elif status["status"] == "FAILED":
                        print(f"{name} failed: {status.get('error', 'Unknown')}")
                        del jobs[name]
                except Exception as e:
                    print(f" ⚠️ Error checking {name}: {e}")
            if jobs:  # Still have pending jobs
                await asyncio.sleep(5)
        # Summary
        print("\n📊 Monitoring Pipeline Summary:")
        print(f" ✅ Completed: {len(completed_jobs)} jobs")
        print(f" ⏳ Pending: {len(jobs)} jobs")
        for name, result in completed_jobs.items():
            confidence = result["result"]["confidence"]
            content_count = len(result["result"]["relevant_content"])
            print(f" {name}: {content_count} items (confidence: {confidence:.2f})")


async def main():
    """Run all adaptive endpoint demos"""
    print("🧠 Crawl4AI Adaptive Digest Endpoint - User Demo")
    print("=" * 60)
    print("This demo shows how developers use adaptive crawling")
    print("to intelligently gather relevant content based on queries.\n")
    demo = AdaptiveEndpointDemo()
    try:
        # Run individual demos
        await demo.demo_research_assistant()
        await demo.demo_market_intelligence()
        await demo.demo_content_curation()
        await demo.demo_product_research()
        # Run monitoring pipeline demo
        await demo.demo_monitoring_pipeline()
        print("\n🎉 All demos completed successfully!")
        print("\nReal-world usage patterns:")
        print("1. Submit multiple jobs for parallel processing")
        print("2. Poll job status to track progress")
        print("3. Process results when jobs complete")
        print("4. Use confidence scores to filter quality content")
        print("5. Set up monitoring pipelines for ongoing intelligence")
    except Exception as e:
        print(f"\n❌ Demo failed: {e}")
        print("Make sure the Crawl4AI server is running on localhost:11235")


if __name__ == "__main__":
    asyncio.run(main())