Add link analysis tests and integration tests for /links/analyze endpoint

- Implemented `test_link_analysis` in `test_docker.py` to validate link analysis functionality.
- Created `test_link_analysis.py` with comprehensive tests for link analysis, including basic functionality, configuration options, error handling, performance, and edge cases.
- Added integration tests in `test_link_analysis_integration.py` to verify the /links/analyze endpoint, including health checks, authentication, and error handling.
2025-10-14 19:24:16 +08:00
parent 8cca9704eb
commit aebf5a3694
7 changed files with 1926 additions and 0 deletions
--- a/docs/examples/link_analysis_example.py
+++ b/docs/examples/link_analysis_example.py
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Link Analysis Example
+====================
+
+This example demonstrates how to use the new /links/analyze endpoint
+to extract, analyze, and score links from web pages.
+
+Requirements:
+- Crawl4AI server running on localhost:11234
+- requests library: pip install requests
+"""
+
+import requests
+import json
+import time
+from typing import Dict, Any, List
+
+
+class LinkAnalyzer:
+    """Minimal client for the ``/links/analyze`` endpoint.
+
+    Attributes:
+        base_url: Server root URL; endpoint paths are appended to it.
+        token: Bearer token for requests. If none is supplied, one is
+            requested from ``/token``; when that fails the placeholder
+            ``"test-token"`` is stored and no Authorization header is sent.
+    """
+
+    def __init__(self, base_url: str = "http://localhost:11234", token: str = None):
+        self.base_url = base_url
+        self.token = token or self._get_test_token()
+
+    def _get_test_token(self) -> str:
+        """Fetch a development token from ``/token`` (best effort).
+
+        Returns:
+            The ``access_token`` field of a 200 response, or the placeholder
+            ``"test-token"`` if the request fails, returns another status,
+            or the body does not contain the field.
+        """
+        try:
+            response = requests.post(
+                f"{self.base_url}/token",
+                json={"email": "test@example.com"},
+                timeout=10
+            )
+            if response.status_code == 200:
+                return response.json()["access_token"]
+        except (requests.RequestException, ValueError, KeyError, TypeError):
+            # Network failure, non-JSON body, or unexpected payload shape:
+            # fall through to the placeholder. A bare ``except`` would also
+            # swallow KeyboardInterrupt and SystemExit.
+            pass
+        return "test-token"  # Fallback for local testing
+
+    def analyze_links(self, url: str, config: Dict[str, Any] = None) -> Dict[str, Any]:
+        """POST ``url`` (and optional ``config``) to ``/links/analyze``.
+
+        Args:
+            url: Page whose links should be analyzed.
+            config: Optional settings sent under the ``"config"`` key; left
+                out of the payload when empty or ``None``.
+
+        Returns:
+            The decoded JSON response body.
+
+        Raises:
+            requests.HTTPError: If the server answers with an error status.
+            requests.RequestException: On connection failure or timeout.
+        """
+        headers = {"Content-Type": "application/json"}
+
+        # The placeholder token is never sent to the server.
+        if self.token and self.token != "test-token":
+            headers["Authorization"] = f"Bearer {self.token}"
+
+        data = {"url": url}
+        if config:
+            data["config"] = config
+
+        response = requests.post(
+            f"{self.base_url}/links/analyze",
+            headers=headers,
+            json=data,
+            timeout=30
+        )
+
+        response.raise_for_status()
+        return response.json()
+
+    def print_summary(self, result: Dict[str, Any]):
+        """Print per-category link counts and the three top-scored links.
+
+        Args:
+            result: Mapping of category name to a list of link dicts. The
+                ``total_score``, ``text`` and ``href`` keys are read when
+                present; missing or null values fall back to defaults.
+        """
+        print("\n" + "="*60)
+        print("📊 LINK ANALYSIS SUMMARY")
+        print("="*60)
+
+        total_links = sum(len(links) for links in result.values())
+        print(f"Total links found: {total_links}")
+
+        for category, links in result.items():
+            if links:
+                print(f"\n📂 {category.upper()}: {len(links)} links")
+
+                # Show top 3 links by score. ``or`` also covers keys that are
+                # present with a null value, which a plain .get() default
+                # would let through to the sort, slicing and formatting.
+                top_links = sorted(links, key=lambda x: x.get('total_score') or 0, reverse=True)[:3]
+                for i, link in enumerate(top_links, 1):
+                    score = link.get('total_score') or 0
+                    text = (link.get('text') or 'No text')[:50]
+                    url = (link.get('href') or 'No URL')[:60]
+                    print(f"  {i}. [{score:.2f}] {text} → {url}")
+
+
+def example_1_basic_analysis():
+    """Example 1: analyze a small test page with default settings.
+
+    Returns the analysis result, or None if the request or the summary
+    printing fails (the error is printed).
+    """
+    print("\n🔍 Example 1: Basic Link Analysis")
+    print("-" * 40)
+
+    client = LinkAnalyzer()
+
+    # A simple page that serves a fixed number of links
+    target = "https://httpbin.org/links/10"
+    print(f"Analyzing: {target}")
+
+    try:
+        report = client.analyze_links(target)
+        client.print_summary(report)
+    except Exception as exc:
+        print(f"❌ Error: {exc}")
+        return None
+    return report
+
+
+def example_2_custom_config():
+    """Example 2: analyze a page while passing an explicit config dict.
+
+    Returns the analysis result, or None if anything fails (the error is
+    printed).
+    """
+    print("\n🔍 Example 2: Custom Configuration")
+    print("-" * 40)
+
+    client = LinkAnalyzer()
+
+    # Options forwarded to the endpoint under the "config" key
+    options = {
+        "include_internal": True,
+        "include_external": True,
+        "max_links": 50,
+        "timeout": 10,
+        "verbose": True
+    }
+
+    target = "https://httpbin.org/links/10"
+    print(f"Analyzing with custom config: {target}")
+    pretty_options = json.dumps(options, indent=2)
+    print(f"Config: {pretty_options}")
+
+    try:
+        report = client.analyze_links(target, options)
+        client.print_summary(report)
+    except Exception as exc:
+        print(f"❌ Error: {exc}")
+        return None
+    return report
+
+
+def example_3_real_world_site():
+    """Example 3: analyze a live website and list its best-scored links.
+
+    Prints the general summary, then the five highest-scored external and
+    internal links. Returns the analysis result, or None on any failure.
+    """
+    print("\n🔍 Example 3: Real Website Analysis")
+    print("-" * 40)
+
+    client = LinkAnalyzer()
+
+    # The Python project's home page serves as the real-world target
+    site = "https://www.python.org"
+    print(f"Analyzing real website: {site}")
+    print("This may take a moment...")
+
+    try:
+        report = client.analyze_links(site)
+        client.print_summary(report)
+
+        print("\n📈 DETAILED ANALYSIS")
+        print("-" * 20)
+
+        # External links are listed first, then internal ones.
+        sections = (
+            ('external', "\n🌐 Top External Links:"),
+            ('internal', "\n🏠 Top Internal Links:"),
+        )
+        for key, heading in sections:
+            candidates = report.get(key, [])
+            if not candidates:
+                continue
+            ranked = sorted(candidates, key=lambda x: x.get('total_score', 0), reverse=True)
+            print(heading)
+            for entry in ranked[:5]:
+                print(f"  • {entry.get('text', 'N/A')} (score: {entry.get('total_score', 0):.2f})")
+                print(f"    {entry.get('href', 'N/A')}")
+    except Exception as exc:
+        print(f"❌ Error: {exc}")
+        print("⚠️  This example may fail due to network issues")
+        return None
+    return report
+
+
+def example_4_comparative_analysis():
+    """Example 4: analyze several pages and compare their link profiles.
+
+    Pages that fail to analyze are reported and left out; the comparison is
+    printed only when more than one page succeeded.
+    """
+    print("\n🔍 Example 4: Comparative Analysis")
+    print("-" * 40)
+
+    client = LinkAnalyzer()
+
+    targets = [
+        ("https://httpbin.org/links/10", "Test Page 1"),
+        ("https://httpbin.org/links/5", "Test Page 2")
+    ]
+
+    collected = {}
+
+    for address, label in targets:
+        print(f"\nAnalyzing: {label}")
+        try:
+            data = client.analyze_links(address)
+            collected[label] = data
+
+            link_count = sum(map(len, data.values()))
+            non_empty = sum(1 for group in data.values() if group)
+            print(f"  Links: {link_count}, Categories: {non_empty}")
+
+        except Exception as exc:
+            print(f"  ❌ Error: {exc}")
+
+    # Nothing to compare with fewer than two successful analyses
+    if len(collected) <= 1:
+        return
+
+    print("\n📊 COMPARISON")
+    print("-" * 15)
+
+    for label, data in collected.items():
+        link_count = sum(map(len, data.values()))
+        print(f"{label}: {link_count} total links")
+
+        # Mean score over every link in every category
+        scores = [
+            link.get('total_score', 0)
+            for group in data.values()
+            for link in group
+        ]
+        if scores:
+            mean_score = sum(scores) / len(scores)
+            print(f"  Average link score: {mean_score:.3f}")
+
+
+def example_5_advanced_filtering():
+    """Example 5: Advanced filtering and analysis.
+
+    Analyzes a test page, prints the links whose ``total_score`` is at least
+    0.5 grouped by category, then lists the unique hosts of the external
+    links. Errors are printed rather than raised.
+    """
+    # Imported locally so the example stays self-contained.
+    from urllib.parse import urlparse
+
+    print("\n🔍 Example 5: Advanced Filtering")
+    print("-" * 40)
+
+    analyzer = LinkAnalyzer()
+
+    url = "https://httpbin.org/links/10"
+
+    try:
+        result = analyzer.analyze_links(url)
+
+        # Filter links by score
+        min_score = 0.5
+        high_quality_links = {}
+
+        for category, links in result.items():
+            if links:
+                filtered = [link for link in links if link.get('total_score', 0) >= min_score]
+                if filtered:
+                    high_quality_links[category] = filtered
+
+        print(f"\n🎯 High-quality links (score >= {min_score}):")
+        total_high_quality = sum(len(links) for links in high_quality_links.values())
+        print(f"Total: {total_high_quality} links")
+
+        for category, links in high_quality_links.items():
+            print(f"\n{category.upper()}:")
+            for link in links:
+                score = link.get('total_score', 0)
+                text = link.get('text', 'No text')
+                print(f"  • [{score:.2f}] {text}")
+
+        # Extract unique domains from external links
+        external_links = result.get('external', [])
+        if external_links:
+            domains = set()
+            for link in external_links:
+                # A separate name keeps the analyzed page's ``url`` intact.
+                href = link.get('href') or ''
+                try:
+                    # urlparse isolates the host even when the URL has a
+                    # query string or fragment but no path (for example
+                    # "https://a.com?x=1"), which splitting on "/" gets wrong.
+                    domain = urlparse(href).netloc
+                except ValueError:
+                    # Malformed URL: skip it
+                    continue
+                if domain:
+                    domains.add(domain)
+
+            print(f"\n🌐 Unique external domains: {len(domains)}")
+            for domain in sorted(domains):
+                print(f"  • {domain}")
+
+    except Exception as e:
+        print(f"❌ Error: {e}")
+
+
+def main():
+    """Run every example in order, pausing for Enter between them.
+
+    A failing example is reported and the run continues; Ctrl-C during an
+    example or at the prompt ends the run early.
+    """
+    print("🚀 Link Analysis Examples")
+    print("=" * 50)
+    print("Make sure the Crawl4AI server is running on localhost:11234")
+    print()
+
+    walkthrough = [
+        example_1_basic_analysis,
+        example_2_custom_config,
+        example_3_real_world_site,
+        example_4_comparative_analysis,
+        example_5_advanced_filtering
+    ]
+    last_position = len(walkthrough)
+
+    for position, run_example in enumerate(walkthrough, 1):
+        print(f"\n{'='*60}")
+        print(f"Running Example {position}")
+        print('='*60)
+
+        try:
+            run_example()
+        except KeyboardInterrupt:
+            print("\n⏹️  Example interrupted by user")
+            break
+        except Exception as exc:
+            print(f"\n❌ Example {position} failed: {exc}")
+
+        # No prompt after the final example
+        if position == last_position:
+            continue
+
+        print("\n⏳ Press Enter to continue to next example...")
+        try:
+            input()
+        except KeyboardInterrupt:
+            break
+
+    print("\n🎉 Examples completed!")
+
+
+# Run the interactive walkthrough only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/docs/md_v2/core/link-analysis.md
+++ b/docs/md_v2/core/link-analysis.md
@@ -0,0 +1,523 @@
+# Link Analysis and Scoring
+
+## Introduction
+
+**Link Analysis** is a powerful feature that extracts, analyzes, and scores all links found on a webpage. This endpoint helps you understand the link structure, identify high-value links, and get insights into the connectivity patterns of any website.
+
+Think of it as a smart link discovery tool that not only extracts links but also evaluates their importance, relevance, and quality through advanced scoring algorithms.
+
+## Key Concepts
+
+### What Link Analysis Does
+
+When you analyze a webpage, the system:
+
+1. **Extracts All Links** - Finds every hyperlink on the page
+2. **Scores Links** - Assigns relevance scores based on multiple factors
+3. **Categorizes Links** - Groups links by type (internal, external, etc.)
+4. **Provides Metadata** - Returns each link's URL, anchor text, attributes, and surrounding context
+5. **Ranks by Importance** - Orders links from most to least valuable
+
+### Scoring Factors
+
+The link scoring algorithm considers:
+
+- **Text Content**: Link anchor text relevance and descriptiveness
+- **URL Structure**: Depth, parameters, and path patterns
+- **Context**: Surrounding text and page position
+- **Attributes**: Title, rel attributes, and other metadata
+- **Link Type**: Internal vs external classification
+
+## Quick Start
+
+### Basic Usage
+
+```python
+import requests
+
+# Analyze links on a webpage
+response = requests.post(
+    "http://localhost:8000/links/analyze",
+    headers={"Authorization": "Bearer YOUR_TOKEN"},
+    json={
+        "url": "https://example.com"
+    }
+)
+
+result = response.json()
+print(f"Found {len(result.get('internal', []))} internal links")
+print(f"Found {len(result.get('external', []))} external links")
+
+# Show top 3 links by score
+for link_type in ['internal', 'external']:
+    if link_type in result:
+        top_links = sorted(result[link_type], key=lambda x: x.get('score', 0), reverse=True)[:3]
+        print(f"\nTop {link_type} links:")
+        for link in top_links:
+            print(f"- {link.get('url', 'N/A')} (score: {link.get('score', 0):.2f})")
+```
+
+### With Custom Configuration
+
+```python
+response = requests.post(
+    "http://localhost:8000/links/analyze",
+    headers={"Authorization": "Bearer YOUR_TOKEN"},
+    json={
+        "url": "https://news.example.com",
+        "config": {
+            "force": False,           # Set to True to bypass the cache and force a fresh crawl
+            "wait_for": 2.0,          # Wait for dynamic content
+            "simulate_user": True,     # User-like browsing
+            "override_navigator": True # Custom user agent
+        }
+    }
+)
+```
+
+## Configuration Options
+
+The `config` parameter accepts a `LinkPreviewConfig` dictionary:
+
+### Basic Options
+
+```python
+config = {
+    "force": False,                    # Force fresh crawl (default: False)
+    "wait_for": None,                  # CSS selector or timeout in seconds
+    "simulate_user": True,             # Simulate human behavior
+    "override_navigator": True,        # Override browser navigator
+    "headers": {                       # Custom headers
+        "Accept-Language": "en-US,en;q=0.9"
+    }
+}
+```
+
+### Advanced Options
+
+```python
+config = {
+    # Timing and behavior
+    "delay_before_return_html": 0.5,   # Delay before HTML extraction
+    "js_code": ["window.scrollTo(0, document.body.scrollHeight)"],  # JS to execute
+
+    # Content processing
+    "word_count_threshold": 1,         # Minimum word count
+    "exclusion_patterns": [            # Link patterns to exclude
+        r".*/logout.*",
+        r".*/admin.*"
+    ],
+
+    # Caching and session
+    "session_id": "my-session-123",    # Session identifier
+    "magic": False                     # Magic link processing
+}
+```
+
+## Response Structure
+
+The endpoint returns a JSON object with categorized links:
+
+```json
+{
+  "internal": [
+    {
+      "url": "https://example.com/about",
+      "text": "About Us",
+      "title": "Learn about our company",
+      "score": 0.85,
+      "context": "footer navigation",
+      "attributes": {
+        "rel": ["nofollow"],
+        "target": "_blank"
+      }
+    }
+  ],
+  "external": [
+    {
+      "url": "https://partner-site.com",
+      "text": "Partner Site",
+      "title": "Visit our partner",
+      "score": 0.72,
+      "context": "main content",
+      "attributes": {}
+    }
+  ],
+  "social": [...],
+  "download": [...],
+  "email": [...],
+  "phone": [...]
+}
+```
+
+### Link Categories
+
+| Category | Description | Example |
+|----------|-------------|---------|
+| **internal** | Links within the same domain | `/about`, `https://example.com/contact` |
+| **external** | Links to different domains | `https://google.com` |
+| **social** | Social media platform links | `https://twitter.com/user` |
+| **download** | File download links | `/files/document.pdf` |
+| **email** | Email addresses | `mailto:contact@example.com` |
+| **phone** | Phone numbers | `tel:+1234567890` |
+
+### Link Metadata
+
+Each link object contains:
+
+```python
+{
+    "url": str,           # The actual href value
+    "text": str,          # Anchor text content
+    "title": str,         # Title attribute (if any)
+    "score": float,       # Relevance score (0.0-1.0)
+    "context": str,       # Where the link was found
+    "attributes": dict,   # All HTML attributes
+    "hash": str,          # URL fragment (if any)
+    "domain": str,        # Extracted domain name
+    "scheme": str,        # URL scheme (http/https/etc)
+}
+```
+
+## Practical Examples
+
+### SEO Audit Tool
+
+```python
+def seo_audit(url: str):
+    """Run a link-focused SEO check on one page and print the findings.
+
+    Returns the raw analysis result.
+    """
+    result = requests.post(
+        "http://localhost:8000/links/analyze",
+        headers={"Authorization": "Bearer YOUR_TOKEN"},
+        json={"url": url}
+    ).json()
+
+    print(f"📊 SEO Audit for {url}")
+    internal = result.get('internal', [])
+    print(f"Internal links: {len(internal)}")
+    external = result.get('external', [])
+    print(f"External links: {len(external)}")
+
+    # Internal links scoring below 0.3 are reported as low quality
+    weak_count = sum(1 for link in internal if link.get('score', 0) < 0.3)
+    if weak_count:
+        print(f"⚠️  Found {weak_count} low-quality internal links")
+
+    # External links scoring above 0.7 are reported as high value
+    strong_count = sum(1 for link in external if link.get('score', 0) > 0.7)
+    if strong_count:
+        print(f"✅ Found {strong_count} high-value external links")
+
+    return result
+
+# Usage
+audit_result = seo_audit("https://example.com")
+```
+
+### Competitor Analysis
+
+```python
+def competitor_analysis(urls: list):
+    """Analyze each site in ``urls`` and print its external-link profile.
+
+    Returns a dict mapping each URL to its analysis result.
+    """
+    # Sites are requested one after another, in the order given
+    all_results = {
+        url: requests.post(
+            "http://localhost:8000/links/analyze",
+            headers={"Authorization": "Bearer YOUR_TOKEN"},
+            json={"url": url}
+        ).json()
+        for url in urls
+    }
+
+    print("🔍 Competitor Link Analysis")
+    for site, data in all_results.items():
+        outbound = data.get('external', [])
+        if outbound:
+            avg_score = sum(link.get('score', 0) for link in outbound) / len(outbound)
+        else:
+            # No external links: report zero rather than dividing by zero
+            avg_score = 0
+        print(f"{site}: {len(outbound)} external links (avg score: {avg_score:.2f})")
+
+    return all_results
+
+# Usage
+competitors = [
+    "https://competitor1.com",
+    "https://competitor2.com",
+    "https://competitor3.com"
+]
+analysis = competitor_analysis(competitors)
+```
+
+### Content Discovery
+
+```python
+def discover_related_content(start_url: str, max_depth: int = 2, max_pages: int = 20):
+    """Discover related content through link analysis.
+
+    Walks outward from ``start_url`` in first-in, first-out order: each
+    visited page is analyzed and its three highest-scored internal links
+    are queued for a later visit.
+
+    Args:
+        start_url: Page to start from.
+        max_depth: Pages more than this many hops from ``start_url`` are
+            not visited.
+        max_pages: Stop once this many pages have been visited.
+
+    Returns:
+        The set of visited URLs, including any whose analysis failed.
+    """
+    from collections import deque
+
+    visited = set()
+    # deque gives O(1) removal from the front, unlike list.pop(0)
+    queue = deque([(start_url, 0)])
+
+    while queue and len(visited) < max_pages:
+        current_url, depth = queue.popleft()
+
+        if current_url in visited or depth > max_depth:
+            continue
+
+        visited.add(current_url)
+
+        try:
+            response = requests.post(
+                "http://localhost:8000/links/analyze",
+                headers={"Authorization": "Bearer YOUR_TOKEN"},
+                json={"url": current_url}
+            )
+
+            result = response.json()
+            internal_links = result.get('internal', [])
+
+            # Sort by score and add top links to queue
+            top_links = sorted(internal_links, key=lambda x: x.get('score', 0), reverse=True)[:3]
+
+            for link in top_links:
+                if link['url'] not in visited:
+                    queue.append((link['url'], depth + 1))
+                    print(f"🔗 Found: {link['text']} ({link['score']:.2f})")
+
+        except Exception as e:
+            print(f"❌ Error analyzing {current_url}: {e}")
+
+    return visited
+
+# Usage
+related_pages = discover_related_content("https://blog.example.com")
+print(f"Discovered {len(related_pages)} related pages")
+```
+
+## Best Practices
+
+### 1. Request Optimization
+
+```python
+# ✅ Good: Use appropriate timeouts
+response = requests.post(
+    "http://localhost:8000/links/analyze",
+    headers={"Authorization": "Bearer YOUR_TOKEN"},
+    json={"url": url},
+    timeout=30  # 30 second timeout
+)
+
+# ✅ Good: Configure wait times for dynamic sites
+config = {
+    "wait_for": 2.0,  # Wait for JavaScript to load
+    "simulate_user": True
+}
+```
+
+### 2. Error Handling
+
+```python
+def safe_link_analysis(url: str):
+    """Analyze ``url`` without raising: print any problem and return None.
+
+    Returns the decoded JSON body on a 200 response, otherwise None.
+    """
+    # Messages for the status codes that get a specific explanation
+    known_failures = {
+        400: "❌ Invalid request format",
+        500: "❌ Server error during analysis",
+    }
+
+    try:
+        response = requests.post(
+            "http://localhost:8000/links/analyze",
+            headers={"Authorization": "Bearer YOUR_TOKEN"},
+            json={"url": url},
+            timeout=30
+        )
+        status = response.status_code
+
+        if status == 200:
+            return response.json()
+
+        print(known_failures.get(status, f"❌ Unexpected status code: {status}"))
+
+    except requests.Timeout:
+        print("⏰ Request timed out")
+    except requests.ConnectionError:
+        print("🔌 Connection error")
+    except Exception as exc:
+        print(f"❌ Unexpected error: {exc}")
+
+    return None
+```
+
+### 3. Data Processing
+
+```python
+def process_links_data(result: dict):
+    """Process and filter link analysis results.
+
+    Args:
+        result: Mapping of category name to a list of link dicts.
+
+    Returns:
+        A dict with ``filtered_links`` (links scoring at least 0.5, grouped
+        by category, with empty categories omitted), ``unique_domains``
+        (sorted ``domain`` values of the external links) and ``total_links``
+        (link count across all categories).
+    """
+
+    # Filter by minimum score
+    min_score = 0.5
+    high_quality_links = {}
+
+    for category, links in result.items():
+        filtered_links = [
+            link for link in links
+            if link.get('score', 0) >= min_score
+        ]
+        if filtered_links:
+            high_quality_links[category] = filtered_links
+
+    # Extract unique domains; links without one are skipped so the result
+    # never contains an empty-string entry.
+    domains = {
+        link['domain']
+        for link in result.get('external', [])
+        if link.get('domain')
+    }
+
+    return {
+        'filtered_links': high_quality_links,
+        # Sorted so the output is the same on every run
+        'unique_domains': sorted(domains),
+        'total_links': sum(len(links) for links in result.values())
+    }
+```
+
+## Performance Considerations
+
+### Response Times
+
+- **Simple pages**: 2-5 seconds
+- **Complex pages**: 5-15 seconds
+- **JavaScript-heavy**: 10-30 seconds
+
+### Rate Limiting
+
+The endpoint includes built-in rate limiting. For bulk analysis:
+
+```python
+import time
+
+def bulk_link_analysis(urls: list, delay: float = 1.0):
+    """Analyze ``urls`` one at a time, sleeping ``delay`` seconds after each.
+
+    Returns a dict of URL to analysis result; URLs whose analysis produced
+    nothing are left out.
+    """
+    collected = {}
+
+    for target in urls:
+        outcome = safe_link_analysis(target)
+        if outcome:
+            collected[target] = outcome
+
+        # Pause after every request so the server's rate limit is respected
+        time.sleep(delay)
+
+    return collected
+```
+
+## Error Handling
+
+### Common Errors and Solutions
+
+| Error Code | Cause | Solution |
+|------------|-------|----------|
+| **400** | Invalid URL or config | Check URL format and config structure |
+| **401** | Invalid authentication | Verify your API token |
+| **429** | Rate limit exceeded | Add delays between requests |
+| **500** | Crawl failure | Check if site is accessible |
+| **503** | Service unavailable | Try again later |
+
+### Debug Mode
+
+```python
+# Send an identifiable User-Agent with the crawl while debugging
+config = {
+    "headers": {
+        "User-Agent": "Crawl4AI-Debug/1.0"
+    }
+}
+
+# Print the server's error body when the request fails
+try:
+    response = requests.post(
+        "http://localhost:8000/links/analyze",
+        headers={"Authorization": "Bearer YOUR_TOKEN"},
+        json={"url": url, "config": config}
+    )
+    response.raise_for_status()
+except requests.HTTPError as e:
+    print(f"Error details: {e.response.text}")
+```
+
+## API Reference
+
+### Endpoint Details
+
+- **URL**: `/links/analyze`
+- **Method**: `POST`
+- **Content-Type**: `application/json`
+- **Authentication**: Bearer token required
+
+### Request Schema
+
+```python
+{
+    "url": str,                    # Required: URL to analyze
+    "config": {                    # Optional: LinkPreviewConfig
+        "force": bool,
+        "wait_for": float | str,   # Timeout in seconds, or a CSS selector
+        "simulate_user": bool,
+        "override_navigator": bool,
+        "headers": dict,
+        "js_code": list,
+        "delay_before_return_html": float,
+        "word_count_threshold": int,
+        "exclusion_patterns": list,
+        "session_id": str,
+        "magic": bool
+    }
+}
+```
+
+### Response Schema
+
+```python
+{
+    "internal": [LinkObject],
+    "external": [LinkObject],
+    "social": [LinkObject],
+    "download": [LinkObject],
+    "email": [LinkObject],
+    "phone": [LinkObject]
+}
+```
+
+### LinkObject Schema
+
+```python
+{
+    "url": str,
+    "text": str,
+    "title": str,
+    "score": float,
+    "context": str,
+    "attributes": dict,
+    "hash": str,
+    "domain": str,
+    "scheme": str
+}
+```
+
+## Next Steps
+
+- Learn about [Advanced Link Processing](../advanced/link-processing.md)
+- Explore the [Link Preview Configuration](../api/link-preview-config.md)
+- See more [Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/link-analysis)
+
+## FAQ
+
+**Q: How is the link score calculated?**
+A: The score considers multiple factors including anchor text relevance, URL structure, page context, and link attributes. Scores range from 0.0 (lowest quality) to 1.0 (highest quality).
+
+**Q: Can I analyze password-protected pages?**
+A: Yes! Use the `js_code` parameter to handle authentication, or include session cookies in the `headers` configuration.
+
+**Q: How many links can I analyze at once?**
+A: There's no hard limit on the number of links per page, but very large pages (>10,000 links) may take longer to process.
+
+**Q: Can I filter out certain types of links?**
+A: Use the `exclusion_patterns` parameter in the config to filter out unwanted links using regex patterns.
+
+**Q: Does this work with JavaScript-heavy sites?**
+A: Absolutely! The crawler waits for JavaScript execution and can even run custom JavaScript using the `js_code` parameter.