Add comprehensive tests for anti-bot strategies and extended features

- Implemented `test_adapter_verification.py` to verify correct usage of browser adapters. - Created `test_all_features.py` for a comprehensive suite covering URL seeding, adaptive crawling, browser adapters, proxy rotation, and dispatchers. - Developed `test_anti_bot_strategy.py` to validate the functionality of various anti-bot strategies. - Added `test_antibot_simple.py` for simple testing of anti-bot strategies using async web crawling. - Introduced `test_bot_detection.py` to assess adapter performance against bot detection mechanisms. - Compiled `test_final_summary.py` to provide a detailed summary of all tests and their results.
2025-10-07 18:51:13 +08:00
parent f00e8cbf35
commit 201843a204
23 changed files with 5265 additions and 96 deletions
--- a/tests/docker/extended_features/demo_adaptive_endpoint.py
+++ b/tests/docker/extended_features/demo_adaptive_endpoint.py
@@ -0,0 +1,435 @@
+#!/usr/bin/env python3
+"""
+Demo: How users will call the Adaptive Digest endpoint
+This shows practical examples of how developers would use the adaptive crawling
+feature to intelligently gather relevant content based on queries.
+"""
+
+import asyncio
+import time
+from typing import Any, Dict, Optional
+
+import aiohttp
+
+# Configuration
+API_BASE_URL = "http://localhost:11235"
+API_TOKEN = None  # Set if your API requires authentication
+
+
+class AdaptiveEndpointDemo:
+    def __init__(self, base_url: str = API_BASE_URL, token: str = None):
+        self.base_url = base_url
+        self.headers = {"Content-Type": "application/json"}
+        if token:
+            self.headers["Authorization"] = f"Bearer {token}"
+
+    async def submit_adaptive_job(
+        self, start_url: str, query: str, config: Optional[Dict] = None
+    ) -> str:
+        """Submit an adaptive crawling job and return task ID"""
+        payload = {"start_url": start_url, "query": query}
+
+        if config:
+            payload["config"] = config
+
+        async with aiohttp.ClientSession() as session:
+            async with session.post(
+                f"{self.base_url}/adaptive/digest/job",
+                headers=self.headers,
+                json=payload,
+            ) as response:
+                if response.status == 202:  # Accepted
+                    result = await response.json()
+                    return result["task_id"]
+                else:
+                    error_text = await response.text()
+                    raise Exception(f"API Error {response.status}: {error_text}")
+
+    async def check_job_status(self, task_id: str) -> Dict[str, Any]:
+        """Check the status of an adaptive crawling job"""
+        async with aiohttp.ClientSession() as session:
+            async with session.get(
+                f"{self.base_url}/adaptive/digest/job/{task_id}", headers=self.headers
+            ) as response:
+                if response.status == 200:
+                    return await response.json()
+                else:
+                    error_text = await response.text()
+                    raise Exception(f"API Error {response.status}: {error_text}")
+
+    async def wait_for_completion(
+        self, task_id: str, max_wait: int = 300
+    ) -> Dict[str, Any]:
+        """Poll job status until completion or timeout"""
+        start_time = time.time()
+
+        while time.time() - start_time < max_wait:
+            status = await self.check_job_status(task_id)
+
+            if status["status"] == "COMPLETED":
+                return status
+            elif status["status"] == "FAILED":
+                raise Exception(f"Job failed: {status.get('error', 'Unknown error')}")
+
+            print(
+                f"⏳ Job {status['status']}... (elapsed: {int(time.time() - start_time)}s)"
+            )
+            await asyncio.sleep(3)  # Poll every 3 seconds
+
+        raise Exception(f"Job timed out after {max_wait} seconds")
+
+    async def demo_research_assistant(self):
+        """Demo: Research assistant for academic papers"""
+        print("🔬 Demo: Academic Research Assistant")
+        print("=" * 50)
+
+        try:
+            print("🚀 Submitting job: Find research on 'machine learning optimization'")
+
+            task_id = await self.submit_adaptive_job(
+                start_url="https://arxiv.org",
+                query="machine learning optimization techniques recent papers",
+                config={
+                    "max_depth": 3,
+                    "confidence_threshold": 0.7,
+                    "max_pages": 20,
+                    "content_filters": ["academic", "research"],
+                },
+            )
+
+            print(f"📋 Job submitted with ID: {task_id}")
+
+            # Wait for completion
+            result = await self.wait_for_completion(task_id)
+
+            print("✅ Research completed!")
+            print(f"🎯 Confidence score: {result['result']['confidence']:.2f}")
+            print(f"📊 Coverage stats: {result['result']['coverage_stats']}")
+
+            # Show relevant content found
+            relevant_content = result["result"]["relevant_content"]
+            print(f"\n📚 Found {len(relevant_content)} relevant research papers:")
+
+            for i, content in enumerate(relevant_content[:3], 1):
+                title = content.get("title", "Untitled")[:60]
+                relevance = content.get("relevance_score", 0)
+                print(f"  {i}. {title}... (relevance: {relevance:.2f})")
+
+        except Exception as e:
+            print(f"❌ Error: {e}")
+
+    async def demo_market_intelligence(self):
+        """Demo: Market intelligence gathering"""
+        print("\n💼 Demo: Market Intelligence Gathering")
+        print("=" * 50)
+
+        try:
+            print("🚀 Submitting job: Analyze competitors in 'sustainable packaging'")
+
+            task_id = await self.submit_adaptive_job(
+                start_url="https://packagingeurope.com",
+                query="sustainable packaging solutions eco-friendly materials competitors market trends",
+                config={
+                    "max_depth": 4,
+                    "confidence_threshold": 0.6,
+                    "max_pages": 30,
+                    "content_filters": ["business", "industry"],
+                    "follow_external_links": True,
+                },
+            )
+
+            print(f"📋 Job submitted with ID: {task_id}")
+
+            # Wait for completion
+            result = await self.wait_for_completion(task_id)
+
+            print("✅ Market analysis completed!")
+            print(f"🎯 Intelligence confidence: {result['result']['confidence']:.2f}")
+
+            # Analyze findings
+            relevant_content = result["result"]["relevant_content"]
+            print(
+                f"\n📈 Market intelligence gathered from {len(relevant_content)} sources:"
+            )
+
+            companies = set()
+            trends = []
+
+            for content in relevant_content:
+                # Extract company mentions (simplified)
+                text = content.get("content", "")
+                if any(
+                    word in text.lower()
+                    for word in ["company", "corporation", "inc", "ltd"]
+                ):
+                    # This would be more sophisticated in real implementation
+                    companies.add(content.get("source_url", "Unknown"))
+
+                # Extract trend keywords
+                if any(
+                    word in text.lower() for word in ["trend", "innovation", "future"]
+                ):
+                    trends.append(content.get("title", "Trend"))
+
+            print(f"🏢 Companies analyzed: {len(companies)}")
+            print(f"📊 Trends identified: {len(trends)}")
+
+        except Exception as e:
+            print(f"❌ Error: {e}")
+
+    async def demo_content_curation(self):
+        """Demo: Content curation for newsletter"""
+        print("\n📰 Demo: Content Curation for Tech Newsletter")
+        print("=" * 50)
+
+        try:
+            print("🚀 Submitting job: Curate content about 'AI developments this week'")
+
+            task_id = await self.submit_adaptive_job(
+                start_url="https://techcrunch.com",
+                query="artificial intelligence AI developments news this week recent advances",
+                config={
+                    "max_depth": 2,
+                    "confidence_threshold": 0.8,
+                    "max_pages": 25,
+                    "content_filters": ["news", "recent"],
+                    "date_range": "last_7_days",
+                },
+            )
+
+            print(f"📋 Job submitted with ID: {task_id}")
+
+            # Wait for completion
+            result = await self.wait_for_completion(task_id)
+
+            print("✅ Content curation completed!")
+            print(f"🎯 Curation confidence: {result['result']['confidence']:.2f}")
+
+            # Process curated content
+            relevant_content = result["result"]["relevant_content"]
+            print(f"\n📮 Curated {len(relevant_content)} articles for your newsletter:")
+
+            # Group by category/topic
+            categories = {
+                "AI Research": [],
+                "Industry News": [],
+                "Product Launches": [],
+                "Other": [],
+            }
+
+            for content in relevant_content:
+                title = content.get("title", "Untitled")
+                if any(
+                    word in title.lower() for word in ["research", "study", "paper"]
+                ):
+                    categories["AI Research"].append(content)
+                elif any(
+                    word in title.lower() for word in ["company", "startup", "funding"]
+                ):
+                    categories["Industry News"].append(content)
+                elif any(
+                    word in title.lower() for word in ["launch", "release", "unveil"]
+                ):
+                    categories["Product Launches"].append(content)
+                else:
+                    categories["Other"].append(content)
+
+            for category, articles in categories.items():
+                if articles:
+                    print(f"\n📂 {category} ({len(articles)} articles):")
+                    for article in articles[:2]:  # Show top 2 per category
+                        title = article.get("title", "Untitled")[:50]
+                        print(f"  • {title}...")
+
+        except Exception as e:
+            print(f"❌ Error: {e}")
+
+    async def demo_product_research(self):
+        """Demo: Product research and comparison"""
+        print("\n🛍️ Demo: Product Research & Comparison")
+        print("=" * 50)
+
+        try:
+            print("🚀 Submitting job: Research 'best wireless headphones 2024'")
+
+            task_id = await self.submit_adaptive_job(
+                start_url="https://www.cnet.com",
+                query="best wireless headphones 2024 reviews comparison features price",
+                config={
+                    "max_depth": 3,
+                    "confidence_threshold": 0.75,
+                    "max_pages": 20,
+                    "content_filters": ["review", "comparison"],
+                    "extract_structured_data": True,
+                },
+            )
+
+            print(f"📋 Job submitted with ID: {task_id}")
+
+            # Wait for completion
+            result = await self.wait_for_completion(task_id)
+
+            print("✅ Product research completed!")
+            print(f"🎯 Research confidence: {result['result']['confidence']:.2f}")
+
+            # Analyze product data
+            relevant_content = result["result"]["relevant_content"]
+            print(
+                f"\n🎧 Product research summary from {len(relevant_content)} sources:"
+            )
+
+            # Extract product mentions (simplified example)
+            products = {}
+            for content in relevant_content:
+                text = content.get("content", "").lower()
+                # Look for common headphone brands
+                brands = [
+                    "sony",
+                    "bose",
+                    "apple",
+                    "sennheiser",
+                    "jabra",
+                    "audio-technica",
+                ]
+                for brand in brands:
+                    if brand in text:
+                        if brand not in products:
+                            products[brand] = 0
+                        products[brand] += 1
+
+            print("🏷️ Product mentions:")
+            for product, mentions in sorted(
+                products.items(), key=lambda x: x[1], reverse=True
+            )[:5]:
+                print(f"  {product.title()}: {mentions} mentions")
+
+        except Exception as e:
+            print(f"❌ Error: {e}")
+
+    async def demo_monitoring_pipeline(self):
+        """Demo: Set up a monitoring pipeline for ongoing content tracking"""
+        print("\n📡 Demo: Content Monitoring Pipeline")
+        print("=" * 50)
+
+        monitoring_queries = [
+            {
+                "name": "Brand Mentions",
+                "start_url": "https://news.google.com",
+                "query": "YourBrand company news mentions",
+                "priority": "high",
+            },
+            {
+                "name": "Industry Trends",
+                "start_url": "https://techcrunch.com",
+                "query": "SaaS industry trends 2024",
+                "priority": "medium",
+            },
+            {
+                "name": "Competitor Activity",
+                "start_url": "https://crunchbase.com",
+                "query": "competitor funding announcements product launches",
+                "priority": "high",
+            },
+        ]
+
+        print("🚀 Starting monitoring pipeline with 3 queries...")
+
+        jobs = {}
+
+        # Submit all monitoring jobs
+        for query_config in monitoring_queries:
+            print(f"\n📋 Submitting: {query_config['name']}")
+
+            try:
+                task_id = await self.submit_adaptive_job(
+                    start_url=query_config["start_url"],
+                    query=query_config["query"],
+                    config={
+                        "max_depth": 2,
+                        "confidence_threshold": 0.6,
+                        "max_pages": 15,
+                    },
+                )
+
+                jobs[query_config["name"]] = {
+                    "task_id": task_id,
+                    "priority": query_config["priority"],
+                    "status": "submitted",
+                }
+
+                print(f"  ✅ Job ID: {task_id}")
+
+            except Exception as e:
+                print(f"  ❌ Failed: {e}")
+
+        # Monitor all jobs
+        print(f"\n⏳ Monitoring {len(jobs)} jobs...")
+
+        completed_jobs = {}
+        max_wait = 180  # 3 minutes total
+        start_time = time.time()
+
+        while jobs and (time.time() - start_time) < max_wait:
+            for name, job_info in list(jobs.items()):
+                try:
+                    status = await self.check_job_status(job_info["task_id"])
+
+                    if status["status"] == "COMPLETED":
+                        completed_jobs[name] = status
+                        del jobs[name]
+                        print(f"  ✅ {name} completed")
+                    elif status["status"] == "FAILED":
+                        print(f"  ❌ {name} failed: {status.get('error', 'Unknown')}")
+                        del jobs[name]
+
+                except Exception as e:
+                    print(f"  ⚠️ Error checking {name}: {e}")
+
+            if jobs:  # Still have pending jobs
+                await asyncio.sleep(5)
+
+        # Summary
+        print("\n📊 Monitoring Pipeline Summary:")
+        print(f"  ✅ Completed: {len(completed_jobs)} jobs")
+        print(f"  ⏳ Pending: {len(jobs)} jobs")
+
+        for name, result in completed_jobs.items():
+            confidence = result["result"]["confidence"]
+            content_count = len(result["result"]["relevant_content"])
+            print(f"    {name}: {content_count} items (confidence: {confidence:.2f})")
+
+
+async def main():
+    """Run all adaptive endpoint demos"""
+    print("🧠 Crawl4AI Adaptive Digest Endpoint - User Demo")
+    print("=" * 60)
+    print("This demo shows how developers use adaptive crawling")
+    print("to intelligently gather relevant content based on queries.\n")
+
+    demo = AdaptiveEndpointDemo()
+
+    try:
+        # Run individual demos
+        await demo.demo_research_assistant()
+        await demo.demo_market_intelligence()
+        await demo.demo_content_curation()
+        await demo.demo_product_research()
+
+        # Run monitoring pipeline demo
+        await demo.demo_monitoring_pipeline()
+
+        print("\n🎉 All demos completed successfully!")
+        print("\nReal-world usage patterns:")
+        print("1. Submit multiple jobs for parallel processing")
+        print("2. Poll job status to track progress")
+        print("3. Process results when jobs complete")
+        print("4. Use confidence scores to filter quality content")
+        print("5. Set up monitoring pipelines for ongoing intelligence")
+
+    except Exception as e:
+        print(f"\n❌ Demo failed: {e}")
+        print("Make sure the Crawl4AI server is running on localhost:11235")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())