Add comprehensive tests for anti-bot strategies and extended features
- Implemented `test_adapter_verification.py` to verify correct usage of browser adapters. - Created `test_all_features.py` for a comprehensive suite covering URL seeding, adaptive crawling, browser adapters, proxy rotation, and dispatchers. - Developed `test_anti_bot_strategy.py` to validate the functionality of various anti-bot strategies. - Added `test_antibot_simple.py` for simple testing of anti-bot strategies using async web crawling. - Introduced `test_bot_detection.py` to assess adapter performance against bot detection mechanisms. - Compiled `test_final_summary.py` to provide a detailed summary of all tests and their results.
This commit is contained in:
435
tests/docker/extended_features/demo_adaptive_endpoint.py
Normal file
435
tests/docker/extended_features/demo_adaptive_endpoint.py
Normal file
@@ -0,0 +1,435 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Demo: How users will call the Adaptive Digest endpoint
|
||||
This shows practical examples of how developers would use the adaptive crawling
|
||||
feature to intelligently gather relevant content based on queries.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import aiohttp
|
||||
|
||||
# Configuration
|
||||
API_BASE_URL = "http://localhost:11235"
|
||||
API_TOKEN = None # Set if your API requires authentication
|
||||
|
||||
|
||||
class AdaptiveEndpointDemo:
|
||||
def __init__(self, base_url: str = API_BASE_URL, token: str = None):
|
||||
self.base_url = base_url
|
||||
self.headers = {"Content-Type": "application/json"}
|
||||
if token:
|
||||
self.headers["Authorization"] = f"Bearer {token}"
|
||||
|
||||
async def submit_adaptive_job(
|
||||
self, start_url: str, query: str, config: Optional[Dict] = None
|
||||
) -> str:
|
||||
"""Submit an adaptive crawling job and return task ID"""
|
||||
payload = {"start_url": start_url, "query": query}
|
||||
|
||||
if config:
|
||||
payload["config"] = config
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(
|
||||
f"{self.base_url}/adaptive/digest/job",
|
||||
headers=self.headers,
|
||||
json=payload,
|
||||
) as response:
|
||||
if response.status == 202: # Accepted
|
||||
result = await response.json()
|
||||
return result["task_id"]
|
||||
else:
|
||||
error_text = await response.text()
|
||||
raise Exception(f"API Error {response.status}: {error_text}")
|
||||
|
||||
async def check_job_status(self, task_id: str) -> Dict[str, Any]:
|
||||
"""Check the status of an adaptive crawling job"""
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(
|
||||
f"{self.base_url}/adaptive/digest/job/{task_id}", headers=self.headers
|
||||
) as response:
|
||||
if response.status == 200:
|
||||
return await response.json()
|
||||
else:
|
||||
error_text = await response.text()
|
||||
raise Exception(f"API Error {response.status}: {error_text}")
|
||||
|
||||
async def wait_for_completion(
|
||||
self, task_id: str, max_wait: int = 300
|
||||
) -> Dict[str, Any]:
|
||||
"""Poll job status until completion or timeout"""
|
||||
start_time = time.time()
|
||||
|
||||
while time.time() - start_time < max_wait:
|
||||
status = await self.check_job_status(task_id)
|
||||
|
||||
if status["status"] == "COMPLETED":
|
||||
return status
|
||||
elif status["status"] == "FAILED":
|
||||
raise Exception(f"Job failed: {status.get('error', 'Unknown error')}")
|
||||
|
||||
print(
|
||||
f"⏳ Job {status['status']}... (elapsed: {int(time.time() - start_time)}s)"
|
||||
)
|
||||
await asyncio.sleep(3) # Poll every 3 seconds
|
||||
|
||||
raise Exception(f"Job timed out after {max_wait} seconds")
|
||||
|
||||
async def demo_research_assistant(self):
|
||||
"""Demo: Research assistant for academic papers"""
|
||||
print("🔬 Demo: Academic Research Assistant")
|
||||
print("=" * 50)
|
||||
|
||||
try:
|
||||
print("🚀 Submitting job: Find research on 'machine learning optimization'")
|
||||
|
||||
task_id = await self.submit_adaptive_job(
|
||||
start_url="https://arxiv.org",
|
||||
query="machine learning optimization techniques recent papers",
|
||||
config={
|
||||
"max_depth": 3,
|
||||
"confidence_threshold": 0.7,
|
||||
"max_pages": 20,
|
||||
"content_filters": ["academic", "research"],
|
||||
},
|
||||
)
|
||||
|
||||
print(f"📋 Job submitted with ID: {task_id}")
|
||||
|
||||
# Wait for completion
|
||||
result = await self.wait_for_completion(task_id)
|
||||
|
||||
print("✅ Research completed!")
|
||||
print(f"🎯 Confidence score: {result['result']['confidence']:.2f}")
|
||||
print(f"📊 Coverage stats: {result['result']['coverage_stats']}")
|
||||
|
||||
# Show relevant content found
|
||||
relevant_content = result["result"]["relevant_content"]
|
||||
print(f"\n📚 Found {len(relevant_content)} relevant research papers:")
|
||||
|
||||
for i, content in enumerate(relevant_content[:3], 1):
|
||||
title = content.get("title", "Untitled")[:60]
|
||||
relevance = content.get("relevance_score", 0)
|
||||
print(f" {i}. {title}... (relevance: {relevance:.2f})")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
|
||||
async def demo_market_intelligence(self):
|
||||
"""Demo: Market intelligence gathering"""
|
||||
print("\n💼 Demo: Market Intelligence Gathering")
|
||||
print("=" * 50)
|
||||
|
||||
try:
|
||||
print("🚀 Submitting job: Analyze competitors in 'sustainable packaging'")
|
||||
|
||||
task_id = await self.submit_adaptive_job(
|
||||
start_url="https://packagingeurope.com",
|
||||
query="sustainable packaging solutions eco-friendly materials competitors market trends",
|
||||
config={
|
||||
"max_depth": 4,
|
||||
"confidence_threshold": 0.6,
|
||||
"max_pages": 30,
|
||||
"content_filters": ["business", "industry"],
|
||||
"follow_external_links": True,
|
||||
},
|
||||
)
|
||||
|
||||
print(f"📋 Job submitted with ID: {task_id}")
|
||||
|
||||
# Wait for completion
|
||||
result = await self.wait_for_completion(task_id)
|
||||
|
||||
print("✅ Market analysis completed!")
|
||||
print(f"🎯 Intelligence confidence: {result['result']['confidence']:.2f}")
|
||||
|
||||
# Analyze findings
|
||||
relevant_content = result["result"]["relevant_content"]
|
||||
print(
|
||||
f"\n📈 Market intelligence gathered from {len(relevant_content)} sources:"
|
||||
)
|
||||
|
||||
companies = set()
|
||||
trends = []
|
||||
|
||||
for content in relevant_content:
|
||||
# Extract company mentions (simplified)
|
||||
text = content.get("content", "")
|
||||
if any(
|
||||
word in text.lower()
|
||||
for word in ["company", "corporation", "inc", "ltd"]
|
||||
):
|
||||
# This would be more sophisticated in real implementation
|
||||
companies.add(content.get("source_url", "Unknown"))
|
||||
|
||||
# Extract trend keywords
|
||||
if any(
|
||||
word in text.lower() for word in ["trend", "innovation", "future"]
|
||||
):
|
||||
trends.append(content.get("title", "Trend"))
|
||||
|
||||
print(f"🏢 Companies analyzed: {len(companies)}")
|
||||
print(f"📊 Trends identified: {len(trends)}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
|
||||
async def demo_content_curation(self):
|
||||
"""Demo: Content curation for newsletter"""
|
||||
print("\n📰 Demo: Content Curation for Tech Newsletter")
|
||||
print("=" * 50)
|
||||
|
||||
try:
|
||||
print("🚀 Submitting job: Curate content about 'AI developments this week'")
|
||||
|
||||
task_id = await self.submit_adaptive_job(
|
||||
start_url="https://techcrunch.com",
|
||||
query="artificial intelligence AI developments news this week recent advances",
|
||||
config={
|
||||
"max_depth": 2,
|
||||
"confidence_threshold": 0.8,
|
||||
"max_pages": 25,
|
||||
"content_filters": ["news", "recent"],
|
||||
"date_range": "last_7_days",
|
||||
},
|
||||
)
|
||||
|
||||
print(f"📋 Job submitted with ID: {task_id}")
|
||||
|
||||
# Wait for completion
|
||||
result = await self.wait_for_completion(task_id)
|
||||
|
||||
print("✅ Content curation completed!")
|
||||
print(f"🎯 Curation confidence: {result['result']['confidence']:.2f}")
|
||||
|
||||
# Process curated content
|
||||
relevant_content = result["result"]["relevant_content"]
|
||||
print(f"\n📮 Curated {len(relevant_content)} articles for your newsletter:")
|
||||
|
||||
# Group by category/topic
|
||||
categories = {
|
||||
"AI Research": [],
|
||||
"Industry News": [],
|
||||
"Product Launches": [],
|
||||
"Other": [],
|
||||
}
|
||||
|
||||
for content in relevant_content:
|
||||
title = content.get("title", "Untitled")
|
||||
if any(
|
||||
word in title.lower() for word in ["research", "study", "paper"]
|
||||
):
|
||||
categories["AI Research"].append(content)
|
||||
elif any(
|
||||
word in title.lower() for word in ["company", "startup", "funding"]
|
||||
):
|
||||
categories["Industry News"].append(content)
|
||||
elif any(
|
||||
word in title.lower() for word in ["launch", "release", "unveil"]
|
||||
):
|
||||
categories["Product Launches"].append(content)
|
||||
else:
|
||||
categories["Other"].append(content)
|
||||
|
||||
for category, articles in categories.items():
|
||||
if articles:
|
||||
print(f"\n📂 {category} ({len(articles)} articles):")
|
||||
for article in articles[:2]: # Show top 2 per category
|
||||
title = article.get("title", "Untitled")[:50]
|
||||
print(f" • {title}...")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
|
||||
async def demo_product_research(self):
|
||||
"""Demo: Product research and comparison"""
|
||||
print("\n🛍️ Demo: Product Research & Comparison")
|
||||
print("=" * 50)
|
||||
|
||||
try:
|
||||
print("🚀 Submitting job: Research 'best wireless headphones 2024'")
|
||||
|
||||
task_id = await self.submit_adaptive_job(
|
||||
start_url="https://www.cnet.com",
|
||||
query="best wireless headphones 2024 reviews comparison features price",
|
||||
config={
|
||||
"max_depth": 3,
|
||||
"confidence_threshold": 0.75,
|
||||
"max_pages": 20,
|
||||
"content_filters": ["review", "comparison"],
|
||||
"extract_structured_data": True,
|
||||
},
|
||||
)
|
||||
|
||||
print(f"📋 Job submitted with ID: {task_id}")
|
||||
|
||||
# Wait for completion
|
||||
result = await self.wait_for_completion(task_id)
|
||||
|
||||
print("✅ Product research completed!")
|
||||
print(f"🎯 Research confidence: {result['result']['confidence']:.2f}")
|
||||
|
||||
# Analyze product data
|
||||
relevant_content = result["result"]["relevant_content"]
|
||||
print(
|
||||
f"\n🎧 Product research summary from {len(relevant_content)} sources:"
|
||||
)
|
||||
|
||||
# Extract product mentions (simplified example)
|
||||
products = {}
|
||||
for content in relevant_content:
|
||||
text = content.get("content", "").lower()
|
||||
# Look for common headphone brands
|
||||
brands = [
|
||||
"sony",
|
||||
"bose",
|
||||
"apple",
|
||||
"sennheiser",
|
||||
"jabra",
|
||||
"audio-technica",
|
||||
]
|
||||
for brand in brands:
|
||||
if brand in text:
|
||||
if brand not in products:
|
||||
products[brand] = 0
|
||||
products[brand] += 1
|
||||
|
||||
print("🏷️ Product mentions:")
|
||||
for product, mentions in sorted(
|
||||
products.items(), key=lambda x: x[1], reverse=True
|
||||
)[:5]:
|
||||
print(f" {product.title()}: {mentions} mentions")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
|
||||
async def demo_monitoring_pipeline(self):
|
||||
"""Demo: Set up a monitoring pipeline for ongoing content tracking"""
|
||||
print("\n📡 Demo: Content Monitoring Pipeline")
|
||||
print("=" * 50)
|
||||
|
||||
monitoring_queries = [
|
||||
{
|
||||
"name": "Brand Mentions",
|
||||
"start_url": "https://news.google.com",
|
||||
"query": "YourBrand company news mentions",
|
||||
"priority": "high",
|
||||
},
|
||||
{
|
||||
"name": "Industry Trends",
|
||||
"start_url": "https://techcrunch.com",
|
||||
"query": "SaaS industry trends 2024",
|
||||
"priority": "medium",
|
||||
},
|
||||
{
|
||||
"name": "Competitor Activity",
|
||||
"start_url": "https://crunchbase.com",
|
||||
"query": "competitor funding announcements product launches",
|
||||
"priority": "high",
|
||||
},
|
||||
]
|
||||
|
||||
print("🚀 Starting monitoring pipeline with 3 queries...")
|
||||
|
||||
jobs = {}
|
||||
|
||||
# Submit all monitoring jobs
|
||||
for query_config in monitoring_queries:
|
||||
print(f"\n📋 Submitting: {query_config['name']}")
|
||||
|
||||
try:
|
||||
task_id = await self.submit_adaptive_job(
|
||||
start_url=query_config["start_url"],
|
||||
query=query_config["query"],
|
||||
config={
|
||||
"max_depth": 2,
|
||||
"confidence_threshold": 0.6,
|
||||
"max_pages": 15,
|
||||
},
|
||||
)
|
||||
|
||||
jobs[query_config["name"]] = {
|
||||
"task_id": task_id,
|
||||
"priority": query_config["priority"],
|
||||
"status": "submitted",
|
||||
}
|
||||
|
||||
print(f" ✅ Job ID: {task_id}")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ Failed: {e}")
|
||||
|
||||
# Monitor all jobs
|
||||
print(f"\n⏳ Monitoring {len(jobs)} jobs...")
|
||||
|
||||
completed_jobs = {}
|
||||
max_wait = 180 # 3 minutes total
|
||||
start_time = time.time()
|
||||
|
||||
while jobs and (time.time() - start_time) < max_wait:
|
||||
for name, job_info in list(jobs.items()):
|
||||
try:
|
||||
status = await self.check_job_status(job_info["task_id"])
|
||||
|
||||
if status["status"] == "COMPLETED":
|
||||
completed_jobs[name] = status
|
||||
del jobs[name]
|
||||
print(f" ✅ {name} completed")
|
||||
elif status["status"] == "FAILED":
|
||||
print(f" ❌ {name} failed: {status.get('error', 'Unknown')}")
|
||||
del jobs[name]
|
||||
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Error checking {name}: {e}")
|
||||
|
||||
if jobs: # Still have pending jobs
|
||||
await asyncio.sleep(5)
|
||||
|
||||
# Summary
|
||||
print("\n📊 Monitoring Pipeline Summary:")
|
||||
print(f" ✅ Completed: {len(completed_jobs)} jobs")
|
||||
print(f" ⏳ Pending: {len(jobs)} jobs")
|
||||
|
||||
for name, result in completed_jobs.items():
|
||||
confidence = result["result"]["confidence"]
|
||||
content_count = len(result["result"]["relevant_content"])
|
||||
print(f" {name}: {content_count} items (confidence: {confidence:.2f})")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all adaptive endpoint demos"""
|
||||
print("🧠 Crawl4AI Adaptive Digest Endpoint - User Demo")
|
||||
print("=" * 60)
|
||||
print("This demo shows how developers use adaptive crawling")
|
||||
print("to intelligently gather relevant content based on queries.\n")
|
||||
|
||||
demo = AdaptiveEndpointDemo()
|
||||
|
||||
try:
|
||||
# Run individual demos
|
||||
await demo.demo_research_assistant()
|
||||
await demo.demo_market_intelligence()
|
||||
await demo.demo_content_curation()
|
||||
await demo.demo_product_research()
|
||||
|
||||
# Run monitoring pipeline demo
|
||||
await demo.demo_monitoring_pipeline()
|
||||
|
||||
print("\n🎉 All demos completed successfully!")
|
||||
print("\nReal-world usage patterns:")
|
||||
print("1. Submit multiple jobs for parallel processing")
|
||||
print("2. Poll job status to track progress")
|
||||
print("3. Process results when jobs complete")
|
||||
print("4. Use confidence scores to filter quality content")
|
||||
print("5. Set up monitoring pipelines for ongoing intelligence")
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n❌ Demo failed: {e}")
|
||||
print("Make sure the Crawl4AI server is running on localhost:11235")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user