Add link analysis tests and integration tests for /links/analyze endpoint
- Implemented `test_link_analysis` in `test_docker.py` to validate link analysis functionality. - Created `test_link_analysis.py` with comprehensive tests for link analysis, including basic functionality, configuration options, error handling, performance, and edge cases. - Added integration tests in `test_link_analysis_integration.py` to verify the /links/analyze endpoint, including health checks, authentication, and error handling.
This commit is contained in:
@@ -240,3 +240,12 @@ class AdaptiveJobStatus(BaseModel):
|
||||
metrics: Optional[Dict[str, Any]] = None
|
||||
result: Optional[Dict[str, Any]] = None
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
class LinkAnalysisRequest(BaseModel):
|
||||
"""Request body for the /links/analyze endpoint."""
|
||||
url: str = Field(..., description="URL to analyze for links")
|
||||
config: Optional[Dict] = Field(
|
||||
default_factory=dict,
|
||||
description="Optional LinkPreviewConfig dictionary"
|
||||
)
|
||||
|
||||
@@ -7,9 +7,37 @@ Crawl4AI FastAPI entry‑point
|
||||
"""
|
||||
|
||||
# ── stdlib & 3rd‑party imports ───────────────────────────────
|
||||
from crawler_pool import get_crawler, close_all, janitor
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LinkPreviewConfig
|
||||
from auth import create_access_token, get_token_dependency, TokenRequest
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, List, Dict
|
||||
from fastapi import Request, Depends
|
||||
from fastapi.responses import FileResponse
|
||||
import ast
|
||||
import asyncio
|
||||
import base64
|
||||
import re
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LinkPreviewConfig
|
||||
from api import (
|
||||
handle_markdown_request, handle_llm_qa,
|
||||
handle_stream_crawl_request, handle_crawl_request,
|
||||
stream_results
|
||||
)
|
||||
from schemas import (
|
||||
CrawlRequestWithHooks,
|
||||
MarkdownRequest,
|
||||
RawCode,
|
||||
HTMLRequest,
|
||||
ScreenshotRequest,
|
||||
PDFRequest,
|
||||
JSEndpointRequest,
|
||||
LinkAnalysisRequest,
|
||||
)
|
||||
|
||||
from utils import (
|
||||
FilterType, load_config, setup_logging, verify_email_domain
|
||||
)
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
@@ -1045,6 +1073,57 @@ async def execute_js(
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/links/analyze")
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
@mcp_tool("links_analyze")
|
||||
async def analyze_links(
|
||||
request: Request,
|
||||
body: LinkAnalysisRequest,
|
||||
_td: Dict = Depends(token_dep),
|
||||
):
|
||||
"""
|
||||
Analyze and score links on a webpage.
|
||||
Returns a dictionary of links with their scores and metadata.
|
||||
"""
|
||||
try:
|
||||
# Create AsyncWebCrawler instance
|
||||
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
||||
# Deserialize config dict to LinkPreviewConfig, use default if not provided
|
||||
link_preview_config = LinkPreviewConfig.from_dict(body.config) if body.config else LinkPreviewConfig()
|
||||
|
||||
# Create CrawlerRunConfig with link analysis settings
|
||||
run_config = CrawlerRunConfig(
|
||||
link_preview_config=link_preview_config,
|
||||
score_links=True,
|
||||
screenshot=False,
|
||||
pdf=False,
|
||||
extraction_strategy=None
|
||||
)
|
||||
|
||||
# Execute the crawl
|
||||
result = await crawler.arun(url=body.url, config=run_config)
|
||||
|
||||
# Check if crawl was successful
|
||||
if not result.success:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=result.error_message or "Crawl failed"
|
||||
)
|
||||
|
||||
# Extract and return the links dictionary
|
||||
return JSONResponse(result.links)
|
||||
|
||||
except HTTPException:
|
||||
# Re-raise HTTP exceptions
|
||||
raise
|
||||
except Exception as e:
|
||||
# Handle any other exceptions
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"Internal server error: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
@app.get("/llm/{url:path}",
|
||||
summary="LLM Q&A",
|
||||
description="Ask questions about a webpage using LLM.",
|
||||
|
||||
315
docs/examples/link_analysis_example.py
Normal file
315
docs/examples/link_analysis_example.py
Normal file
@@ -0,0 +1,315 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Link Analysis Example
|
||||
====================
|
||||
|
||||
This example demonstrates how to use the new /links/analyze endpoint
|
||||
to extract, analyze, and score links from web pages.
|
||||
|
||||
Requirements:
|
||||
- Crawl4AI server running on localhost:11234
|
||||
- requests library: pip install requests
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
from typing import Dict, Any, List
|
||||
|
||||
|
||||
class LinkAnalyzer:
|
||||
"""Simple client for the link analysis endpoint"""
|
||||
|
||||
def __init__(self, base_url: str = "http://localhost:11234", token: str = None):
|
||||
self.base_url = base_url
|
||||
self.token = token or self._get_test_token()
|
||||
|
||||
def _get_test_token(self) -> str:
|
||||
"""Get a test token (for development only)"""
|
||||
try:
|
||||
response = requests.post(
|
||||
f"{self.base_url}/token",
|
||||
json={"email": "test@example.com"},
|
||||
timeout=10
|
||||
)
|
||||
if response.status_code == 200:
|
||||
return response.json()["access_token"]
|
||||
except:
|
||||
pass
|
||||
return "test-token" # Fallback for local testing
|
||||
|
||||
def analyze_links(self, url: str, config: Dict[str, Any] = None) -> Dict[str, Any]:
|
||||
"""Analyze links on a webpage"""
|
||||
headers = {"Content-Type": "application/json"}
|
||||
|
||||
if self.token and self.token != "test-token":
|
||||
headers["Authorization"] = f"Bearer {self.token}"
|
||||
|
||||
data = {"url": url}
|
||||
if config:
|
||||
data["config"] = config
|
||||
|
||||
response = requests.post(
|
||||
f"{self.base_url}/links/analyze",
|
||||
headers=headers,
|
||||
json=data,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def print_summary(self, result: Dict[str, Any]):
|
||||
"""Print a summary of link analysis results"""
|
||||
print("\n" + "="*60)
|
||||
print("📊 LINK ANALYSIS SUMMARY")
|
||||
print("="*60)
|
||||
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
print(f"Total links found: {total_links}")
|
||||
|
||||
for category, links in result.items():
|
||||
if links:
|
||||
print(f"\n📂 {category.upper()}: {len(links)} links")
|
||||
|
||||
# Show top 3 links by score
|
||||
top_links = sorted(links, key=lambda x: x.get('total_score', 0), reverse=True)[:3]
|
||||
for i, link in enumerate(top_links, 1):
|
||||
score = link.get('total_score', 0)
|
||||
text = link.get('text', 'No text')[:50]
|
||||
url = link.get('href', 'No URL')[:60]
|
||||
print(f" {i}. [{score:.2f}] {text} → {url}")
|
||||
|
||||
|
||||
def example_1_basic_analysis():
|
||||
"""Example 1: Basic link analysis"""
|
||||
print("\n🔍 Example 1: Basic Link Analysis")
|
||||
print("-" * 40)
|
||||
|
||||
analyzer = LinkAnalyzer()
|
||||
|
||||
# Analyze a simple test page
|
||||
url = "https://httpbin.org/links/10"
|
||||
print(f"Analyzing: {url}")
|
||||
|
||||
try:
|
||||
result = analyzer.analyze_links(url)
|
||||
analyzer.print_summary(result)
|
||||
return result
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def example_2_custom_config():
|
||||
"""Example 2: Analysis with custom configuration"""
|
||||
print("\n🔍 Example 2: Custom Configuration")
|
||||
print("-" * 40)
|
||||
|
||||
analyzer = LinkAnalyzer()
|
||||
|
||||
# Custom configuration
|
||||
config = {
|
||||
"include_internal": True,
|
||||
"include_external": True,
|
||||
"max_links": 50,
|
||||
"timeout": 10,
|
||||
"verbose": True
|
||||
}
|
||||
|
||||
url = "https://httpbin.org/links/10"
|
||||
print(f"Analyzing with custom config: {url}")
|
||||
print(f"Config: {json.dumps(config, indent=2)}")
|
||||
|
||||
try:
|
||||
result = analyzer.analyze_links(url, config)
|
||||
analyzer.print_summary(result)
|
||||
return result
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def example_3_real_world_site():
|
||||
"""Example 3: Analyzing a real website"""
|
||||
print("\n🔍 Example 3: Real Website Analysis")
|
||||
print("-" * 40)
|
||||
|
||||
analyzer = LinkAnalyzer()
|
||||
|
||||
# Analyze Python official website
|
||||
url = "https://www.python.org"
|
||||
print(f"Analyzing real website: {url}")
|
||||
print("This may take a moment...")
|
||||
|
||||
try:
|
||||
result = analyzer.analyze_links(url)
|
||||
analyzer.print_summary(result)
|
||||
|
||||
# Additional analysis
|
||||
print("\n📈 DETAILED ANALYSIS")
|
||||
print("-" * 20)
|
||||
|
||||
# Find external links with highest scores
|
||||
external_links = result.get('external', [])
|
||||
if external_links:
|
||||
top_external = sorted(external_links, key=lambda x: x.get('total_score', 0), reverse=True)[:5]
|
||||
print("\n🌐 Top External Links:")
|
||||
for link in top_external:
|
||||
print(f" • {link.get('text', 'N/A')} (score: {link.get('total_score', 0):.2f})")
|
||||
print(f" {link.get('href', 'N/A')}")
|
||||
|
||||
# Find internal links
|
||||
internal_links = result.get('internal', [])
|
||||
if internal_links:
|
||||
top_internal = sorted(internal_links, key=lambda x: x.get('total_score', 0), reverse=True)[:5]
|
||||
print("\n🏠 Top Internal Links:")
|
||||
for link in top_internal:
|
||||
print(f" • {link.get('text', 'N/A')} (score: {link.get('total_score', 0):.2f})")
|
||||
print(f" {link.get('href', 'N/A')}")
|
||||
|
||||
return result
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
print("⚠️ This example may fail due to network issues")
|
||||
return None
|
||||
|
||||
|
||||
def example_4_comparative_analysis():
|
||||
"""Example 4: Comparing link structures across sites"""
|
||||
print("\n🔍 Example 4: Comparative Analysis")
|
||||
print("-" * 40)
|
||||
|
||||
analyzer = LinkAnalyzer()
|
||||
|
||||
sites = [
|
||||
("https://httpbin.org/links/10", "Test Page 1"),
|
||||
("https://httpbin.org/links/5", "Test Page 2")
|
||||
]
|
||||
|
||||
results = {}
|
||||
|
||||
for url, name in sites:
|
||||
print(f"\nAnalyzing: {name}")
|
||||
try:
|
||||
result = analyzer.analyze_links(url)
|
||||
results[name] = result
|
||||
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
categories = len([cat for cat, links in result.items() if links])
|
||||
print(f" Links: {total_links}, Categories: {categories}")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ Error: {e}")
|
||||
|
||||
# Compare results
|
||||
if len(results) > 1:
|
||||
print("\n📊 COMPARISON")
|
||||
print("-" * 15)
|
||||
|
||||
for name, result in results.items():
|
||||
total = sum(len(links) for links in result.values())
|
||||
print(f"{name}: {total} total links")
|
||||
|
||||
# Calculate average scores
|
||||
all_scores = []
|
||||
for links in result.values():
|
||||
for link in links:
|
||||
all_scores.append(link.get('total_score', 0))
|
||||
|
||||
if all_scores:
|
||||
avg_score = sum(all_scores) / len(all_scores)
|
||||
print(f" Average link score: {avg_score:.3f}")
|
||||
|
||||
|
||||
def example_5_advanced_filtering():
|
||||
"""Example 5: Advanced filtering and analysis"""
|
||||
print("\n🔍 Example 5: Advanced Filtering")
|
||||
print("-" * 40)
|
||||
|
||||
analyzer = LinkAnalyzer()
|
||||
|
||||
url = "https://httpbin.org/links/10"
|
||||
|
||||
try:
|
||||
result = analyzer.analyze_links(url)
|
||||
|
||||
# Filter links by score
|
||||
min_score = 0.5
|
||||
high_quality_links = {}
|
||||
|
||||
for category, links in result.items():
|
||||
if links:
|
||||
filtered = [link for link in links if link.get('total_score', 0) >= min_score]
|
||||
if filtered:
|
||||
high_quality_links[category] = filtered
|
||||
|
||||
print(f"\n🎯 High-quality links (score >= {min_score}):")
|
||||
total_high_quality = sum(len(links) for links in high_quality_links.values())
|
||||
print(f"Total: {total_high_quality} links")
|
||||
|
||||
for category, links in high_quality_links.items():
|
||||
print(f"\n{category.upper()}:")
|
||||
for link in links:
|
||||
score = link.get('total_score', 0)
|
||||
text = link.get('text', 'No text')
|
||||
print(f" • [{score:.2f}] {text}")
|
||||
|
||||
# Extract unique domains from external links
|
||||
external_links = result.get('external', [])
|
||||
if external_links:
|
||||
domains = set()
|
||||
for link in external_links:
|
||||
url = link.get('href', '')
|
||||
if '://' in url:
|
||||
domain = url.split('://')[1].split('/')[0]
|
||||
domains.add(domain)
|
||||
|
||||
print(f"\n🌐 Unique external domains: {len(domains)}")
|
||||
for domain in sorted(domains):
|
||||
print(f" • {domain}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
|
||||
|
||||
def main():
|
||||
"""Run all examples"""
|
||||
print("🚀 Link Analysis Examples")
|
||||
print("=" * 50)
|
||||
print("Make sure the Crawl4AI server is running on localhost:11234")
|
||||
print()
|
||||
|
||||
examples = [
|
||||
example_1_basic_analysis,
|
||||
example_2_custom_config,
|
||||
example_3_real_world_site,
|
||||
example_4_comparative_analysis,
|
||||
example_5_advanced_filtering
|
||||
]
|
||||
|
||||
for i, example_func in enumerate(examples, 1):
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Running Example {i}")
|
||||
print('='*60)
|
||||
|
||||
try:
|
||||
example_func()
|
||||
except KeyboardInterrupt:
|
||||
print("\n⏹️ Example interrupted by user")
|
||||
break
|
||||
except Exception as e:
|
||||
print(f"\n❌ Example {i} failed: {e}")
|
||||
|
||||
if i < len(examples):
|
||||
print("\n⏳ Press Enter to continue to next example...")
|
||||
try:
|
||||
input()
|
||||
except KeyboardInterrupt:
|
||||
break
|
||||
|
||||
print("\n🎉 Examples completed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
523
docs/md_v2/core/link-analysis.md
Normal file
523
docs/md_v2/core/link-analysis.md
Normal file
@@ -0,0 +1,523 @@
|
||||
# Link Analysis and Scoring
|
||||
|
||||
## Introduction
|
||||
|
||||
**Link Analysis** is a powerful feature that extracts, analyzes, and scores all links found on a webpage. This endpoint helps you understand the link structure, identify high-value links, and get insights into the connectivity patterns of any website.
|
||||
|
||||
Think of it as a smart link discovery tool that not only extracts links but also evaluates their importance, relevance, and quality through advanced scoring algorithms.
|
||||
|
||||
## Key Concepts
|
||||
|
||||
### What Link Analysis Does
|
||||
|
||||
When you analyze a webpage, the system:
|
||||
|
||||
1. **Extracts All Links** - Finds every hyperlink on the page
|
||||
2. **Scores Links** - Assigns relevance scores based on multiple factors
|
||||
3. **Categorizes Links** - Groups links by type (internal, external, etc.)
|
||||
4. **Provides Metadata** - URL text, attributes, and context information
|
||||
5. **Ranks by Importance** - Orders links from most to least valuable
|
||||
|
||||
### Scoring Factors
|
||||
|
||||
The link scoring algorithm considers:
|
||||
|
||||
- **Text Content**: Link anchor text relevance and descriptiveness
|
||||
- **URL Structure**: Depth, parameters, and path patterns
|
||||
- **Context**: Surrounding text and page position
|
||||
- **Attributes**: Title, rel attributes, and other metadata
|
||||
- **Link Type**: Internal vs external classification
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
# Analyze links on a webpage
|
||||
response = requests.post(
|
||||
"http://localhost:8000/links/analyze",
|
||||
headers={"Authorization": "Bearer YOUR_TOKEN"},
|
||||
json={
|
||||
"url": "https://example.com"
|
||||
}
|
||||
)
|
||||
|
||||
result = response.json()
|
||||
print(f"Found {len(result.get('internal', []))} internal links")
|
||||
print(f"Found {len(result.get('external', []))} external links")
|
||||
|
||||
# Show top 3 links by score
|
||||
for link_type in ['internal', 'external']:
|
||||
if link_type in result:
|
||||
top_links = sorted(result[link_type], key=lambda x: x.get('score', 0), reverse=True)[:3]
|
||||
print(f"\nTop {link_type} links:")
|
||||
for link in top_links:
|
||||
print(f"- {link.get('url', 'N/A')} (score: {link.get('score', 0):.2f})")
|
||||
```
|
||||
|
||||
### With Custom Configuration
|
||||
|
||||
```python
|
||||
response = requests.post(
|
||||
"http://localhost:8000/links/analyze",
|
||||
headers={"Authorization": "Bearer YOUR_TOKEN"},
|
||||
json={
|
||||
"url": "https://news.example.com",
|
||||
"config": {
|
||||
"force": False, # Skip cache
|
||||
"wait_for": 2.0, # Wait for dynamic content
|
||||
"simulate_user": True, # User-like browsing
|
||||
"override_navigator": True # Custom user agent
|
||||
}
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
## Configuration Options
|
||||
|
||||
The `config` parameter accepts a `LinkPreviewConfig` dictionary:
|
||||
|
||||
### Basic Options
|
||||
|
||||
```python
|
||||
config = {
|
||||
"force": False, # Force fresh crawl (default: False)
|
||||
"wait_for": None, # CSS selector or timeout in seconds
|
||||
"simulate_user": True, # Simulate human behavior
|
||||
"override_navigator": True, # Override browser navigator
|
||||
"headers": { # Custom headers
|
||||
"Accept-Language": "en-US,en;q=0.9"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Advanced Options
|
||||
|
||||
```python
|
||||
config = {
|
||||
# Timing and behavior
|
||||
"delay_before_return_html": 0.5, # Delay before HTML extraction
|
||||
"js_code": ["window.scrollTo(0, document.body.scrollHeight)"], # JS to execute
|
||||
|
||||
# Content processing
|
||||
"word_count_threshold": 1, # Minimum word count
|
||||
"exclusion_patterns": [ # Link patterns to exclude
|
||||
r".*/logout.*",
|
||||
r".*/admin.*"
|
||||
],
|
||||
|
||||
# Caching and session
|
||||
"session_id": "my-session-123", # Session identifier
|
||||
"magic": False # Magic link processing
|
||||
}
|
||||
```
|
||||
|
||||
## Response Structure
|
||||
|
||||
The endpoint returns a JSON object with categorized links:
|
||||
|
||||
```json
|
||||
{
|
||||
"internal": [
|
||||
{
|
||||
"url": "https://example.com/about",
|
||||
"text": "About Us",
|
||||
"title": "Learn about our company",
|
||||
"score": 0.85,
|
||||
"context": "footer navigation",
|
||||
"attributes": {
|
||||
"rel": ["nofollow"],
|
||||
"target": "_blank"
|
||||
}
|
||||
}
|
||||
],
|
||||
"external": [
|
||||
{
|
||||
"url": "https://partner-site.com",
|
||||
"text": "Partner Site",
|
||||
"title": "Visit our partner",
|
||||
"score": 0.72,
|
||||
"context": "main content",
|
||||
"attributes": {}
|
||||
}
|
||||
],
|
||||
"social": [...],
|
||||
"download": [...],
|
||||
"email": [...],
|
||||
"phone": [...]
|
||||
}
|
||||
```
|
||||
|
||||
### Link Categories
|
||||
|
||||
| Category | Description | Example |
|
||||
|----------|-------------|---------|
|
||||
| **internal** | Links within the same domain | `/about`, `https://example.com/contact` |
|
||||
| **external** | Links to different domains | `https://google.com` |
|
||||
| **social** | Social media platform links | `https://twitter.com/user` |
|
||||
| **download** | File download links | `/files/document.pdf` |
|
||||
| **email** | Email addresses | `mailto:contact@example.com` |
|
||||
| **phone** | Phone numbers | `tel:+1234567890` |
|
||||
|
||||
### Link Metadata
|
||||
|
||||
Each link object contains:
|
||||
|
||||
```python
|
||||
{
|
||||
"url": str, # The actual href value
|
||||
"text": str, # Anchor text content
|
||||
"title": str, # Title attribute (if any)
|
||||
"score": float, # Relevance score (0.0-1.0)
|
||||
"context": str, # Where the link was found
|
||||
"attributes": dict, # All HTML attributes
|
||||
"hash": str, # URL fragment (if any)
|
||||
"domain": str, # Extracted domain name
|
||||
"scheme": str, # URL scheme (http/https/etc)
|
||||
}
|
||||
```
|
||||
|
||||
## Practical Examples
|
||||
|
||||
### SEO Audit Tool
|
||||
|
||||
```python
|
||||
def seo_audit(url: str):
|
||||
"""Perform SEO link analysis on a webpage"""
|
||||
response = requests.post(
|
||||
"http://localhost:8000/links/analyze",
|
||||
headers={"Authorization": "Bearer YOUR_TOKEN"},
|
||||
json={"url": url}
|
||||
)
|
||||
|
||||
result = response.json()
|
||||
|
||||
print(f"📊 SEO Audit for {url}")
|
||||
print(f"Internal links: {len(result.get('internal', []))}")
|
||||
print(f"External links: {len(result.get('external', []))}")
|
||||
|
||||
# Check for SEO issues
|
||||
internal_links = result.get('internal', [])
|
||||
external_links = result.get('external', [])
|
||||
|
||||
# Find links with low scores
|
||||
low_score_links = [link for link in internal_links if link.get('score', 0) < 0.3]
|
||||
if low_score_links:
|
||||
print(f"⚠️ Found {len(low_score_links)} low-quality internal links")
|
||||
|
||||
# Find external opportunities
|
||||
high_value_external = [link for link in external_links if link.get('score', 0) > 0.7]
|
||||
if high_value_external:
|
||||
print(f"✅ Found {len(high_value_external)} high-value external links")
|
||||
|
||||
return result
|
||||
|
||||
# Usage
|
||||
audit_result = seo_audit("https://example.com")
|
||||
```
|
||||
|
||||
### Competitor Analysis
|
||||
|
||||
```python
|
||||
def competitor_analysis(urls: list):
|
||||
"""Analyze link patterns across multiple competitor sites"""
|
||||
all_results = {}
|
||||
|
||||
for url in urls:
|
||||
response = requests.post(
|
||||
"http://localhost:8000/links/analyze",
|
||||
headers={"Authorization": "Bearer YOUR_TOKEN"},
|
||||
json={"url": url}
|
||||
)
|
||||
all_results[url] = response.json()
|
||||
|
||||
# Compare external link strategies
|
||||
print("🔍 Competitor Link Analysis")
|
||||
for url, result in all_results.items():
|
||||
external_links = result.get('external', [])
|
||||
avg_score = sum(link.get('score', 0) for link in external_links) / len(external_links) if external_links else 0
|
||||
print(f"{url}: {len(external_links)} external links (avg score: {avg_score:.2f})")
|
||||
|
||||
return all_results
|
||||
|
||||
# Usage
|
||||
competitors = [
|
||||
"https://competitor1.com",
|
||||
"https://competitor2.com",
|
||||
"https://competitor3.com"
|
||||
]
|
||||
analysis = competitor_analysis(competitors)
|
||||
```
|
||||
|
||||
### Content Discovery
|
||||
|
||||
```python
|
||||
def discover_related_content(start_url: str, max_depth: int = 2):
|
||||
"""Discover related content through link analysis"""
|
||||
visited = set()
|
||||
queue = [(start_url, 0)]
|
||||
|
||||
while queue and len(visited) < 20:
|
||||
current_url, depth = queue.pop(0)
|
||||
|
||||
if current_url in visited or depth > max_depth:
|
||||
continue
|
||||
|
||||
visited.add(current_url)
|
||||
|
||||
try:
|
||||
response = requests.post(
|
||||
"http://localhost:8000/links/analyze",
|
||||
headers={"Authorization": "Bearer YOUR_TOKEN"},
|
||||
json={"url": current_url}
|
||||
)
|
||||
|
||||
result = response.json()
|
||||
internal_links = result.get('internal', [])
|
||||
|
||||
# Sort by score and add top links to queue
|
||||
top_links = sorted(internal_links, key=lambda x: x.get('score', 0), reverse=True)[:3]
|
||||
|
||||
for link in top_links:
|
||||
if link['url'] not in visited:
|
||||
queue.append((link['url'], depth + 1))
|
||||
print(f"🔗 Found: {link['text']} ({link['score']:.2f})")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error analyzing {current_url}: {e}")
|
||||
|
||||
return visited
|
||||
|
||||
# Usage
|
||||
related_pages = discover_related_content("https://blog.example.com")
|
||||
print(f"Discovered {len(related_pages)} related pages")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Request Optimization
|
||||
|
||||
```python
|
||||
# ✅ Good: Use appropriate timeouts
|
||||
response = requests.post(
|
||||
"http://localhost:8000/links/analyze",
|
||||
headers={"Authorization": "Bearer YOUR_TOKEN"},
|
||||
json={"url": url},
|
||||
timeout=30 # 30 second timeout
|
||||
)
|
||||
|
||||
# ✅ Good: Configure wait times for dynamic sites
|
||||
config = {
|
||||
"wait_for": 2.0, # Wait for JavaScript to load
|
||||
"simulate_user": True
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Error Handling
|
||||
|
||||
```python
|
||||
def safe_link_analysis(url: str):
|
||||
try:
|
||||
response = requests.post(
|
||||
"http://localhost:8000/links/analyze",
|
||||
headers={"Authorization": "Bearer YOUR_TOKEN"},
|
||||
json={"url": url},
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
elif response.status_code == 400:
|
||||
print("❌ Invalid request format")
|
||||
elif response.status_code == 500:
|
||||
print("❌ Server error during analysis")
|
||||
else:
|
||||
print(f"❌ Unexpected status code: {response.status_code}")
|
||||
|
||||
except requests.Timeout:
|
||||
print("⏰ Request timed out")
|
||||
except requests.ConnectionError:
|
||||
print("🔌 Connection error")
|
||||
except Exception as e:
|
||||
print(f"❌ Unexpected error: {e}")
|
||||
|
||||
return None
|
||||
```
|
||||
|
||||
### 3. Data Processing
|
||||
|
||||
```python
|
||||
def process_links_data(result: dict):
|
||||
"""Process and filter link analysis results"""
|
||||
|
||||
# Filter by minimum score
|
||||
min_score = 0.5
|
||||
high_quality_links = {}
|
||||
|
||||
for category, links in result.items():
|
||||
filtered_links = [
|
||||
link for link in links
|
||||
if link.get('score', 0) >= min_score
|
||||
]
|
||||
if filtered_links:
|
||||
high_quality_links[category] = filtered_links
|
||||
|
||||
# Extract unique domains
|
||||
domains = set()
|
||||
for links in result.get('external', []):
|
||||
domains.add(links.get('domain', ''))
|
||||
|
||||
return {
|
||||
'filtered_links': high_quality_links,
|
||||
'unique_domains': list(domains),
|
||||
'total_links': sum(len(links) for links in result.values())
|
||||
}
|
||||
```
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Response Times
|
||||
|
||||
- **Simple pages**: 2-5 seconds
|
||||
- **Complex pages**: 5-15 seconds
|
||||
- **JavaScript-heavy**: 10-30 seconds
|
||||
|
||||
### Rate Limiting
|
||||
|
||||
The endpoint includes built-in rate limiting. For bulk analysis:
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def bulk_link_analysis(urls: list, delay: float = 1.0):
|
||||
"""Analyze multiple URLs with rate limiting"""
|
||||
results = {}
|
||||
|
||||
for url in urls:
|
||||
result = safe_link_analysis(url)
|
||||
if result:
|
||||
results[url] = result
|
||||
|
||||
# Respect rate limits
|
||||
time.sleep(delay)
|
||||
|
||||
return results
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Common Errors and Solutions
|
||||
|
||||
| Error Code | Cause | Solution |
|
||||
|------------|-------|----------|
|
||||
| **400** | Invalid URL or config | Check URL format and config structure |
|
||||
| **401** | Invalid authentication | Verify your API token |
|
||||
| **429** | Rate limit exceeded | Add delays between requests |
|
||||
| **500** | Crawl failure | Check if site is accessible |
|
||||
| **503** | Service unavailable | Try again later |
|
||||
|
||||
### Debug Mode
|
||||
|
||||
```python
|
||||
# Enable verbose logging for debugging
|
||||
config = {
|
||||
"headers": {
|
||||
"User-Agent": "Crawl4AI-Debug/1.0"
|
||||
}
|
||||
}
|
||||
|
||||
# Include error details in response
|
||||
try:
|
||||
response = requests.post(
|
||||
"http://localhost:8000/links/analyze",
|
||||
headers={"Authorization": "Bearer YOUR_TOKEN"},
|
||||
json={"url": url, "config": config}
|
||||
)
|
||||
response.raise_for_status()
|
||||
except requests.HTTPError as e:
|
||||
print(f"Error details: {e.response.text}")
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
### Endpoint Details
|
||||
|
||||
- **URL**: `/links/analyze`
|
||||
- **Method**: `POST`
|
||||
- **Content-Type**: `application/json`
|
||||
- **Authentication**: Bearer token required
|
||||
|
||||
### Request Schema
|
||||
|
||||
```python
|
||||
{
|
||||
"url": str, # Required: URL to analyze
|
||||
"config": { # Optional: LinkPreviewConfig
|
||||
"force": bool,
|
||||
"wait_for": float,
|
||||
"simulate_user": bool,
|
||||
"override_navigator": bool,
|
||||
"headers": dict,
|
||||
"js_code": list,
|
||||
"delay_before_return_html": float,
|
||||
"word_count_threshold": int,
|
||||
"exclusion_patterns": list,
|
||||
"session_id": str,
|
||||
"magic": bool
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Response Schema
|
||||
|
||||
```python
|
||||
{
|
||||
"internal": [LinkObject],
|
||||
"external": [LinkObject],
|
||||
"social": [LinkObject],
|
||||
"download": [LinkObject],
|
||||
"email": [LinkObject],
|
||||
"phone": [LinkObject]
|
||||
}
|
||||
```
|
||||
|
||||
### LinkObject Schema
|
||||
|
||||
```python
|
||||
{
|
||||
"url": str,
|
||||
"text": str,
|
||||
"title": str,
|
||||
"score": float,
|
||||
"context": str,
|
||||
"attributes": dict,
|
||||
"hash": str,
|
||||
"domain": str,
|
||||
"scheme": str
|
||||
}
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
- Learn about [Advanced Link Processing](../advanced/link-processing.md)
|
||||
- Explore the [Link Preview Configuration](../api/link-preview-config.md)
|
||||
- See more [Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/link-analysis)
|
||||
|
||||
## FAQ
|
||||
|
||||
**Q: How is the link score calculated?**
|
||||
A: The score considers multiple factors including anchor text relevance, URL structure, page context, and link attributes. Scores range from 0.0 (lowest quality) to 1.0 (highest quality).
|
||||
|
||||
**Q: Can I analyze password-protected pages?**
|
||||
A: Yes! Use the `js_code` parameter to handle authentication, or include session cookies in the `headers` configuration.
|
||||
|
||||
**Q: How many links can I analyze at once?**
|
||||
A: There's no hard limit on the number of links per page, but very large pages (>10,000 links) may take longer to process.
|
||||
|
||||
**Q: Can I filter out certain types of links?**
|
||||
A: Use the `exclusion_patterns` parameter in the config to filter out unwanted links using regex patterns.
|
||||
|
||||
**Q: Does this work with JavaScript-heavy sites?**
|
||||
A: Absolutely! The crawler waits for JavaScript execution and can even run custom JavaScript using the `js_code` parameter.
|
||||
@@ -70,6 +70,7 @@ def test_docker_deployment(version="basic"):
|
||||
# test_llm_extraction(tester)
|
||||
# test_llm_with_ollama(tester)
|
||||
# test_screenshot(tester)
|
||||
test_link_analysis(tester)
|
||||
|
||||
|
||||
def test_basic_crawl(tester: Crawl4AiTester):
|
||||
@@ -293,6 +294,77 @@ def test_screenshot(tester: Crawl4AiTester):
|
||||
assert result["result"]["success"]
|
||||
|
||||
|
||||
def test_link_analysis(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Link Analysis ===")
|
||||
|
||||
# Get auth token first
|
||||
try:
|
||||
token_response = requests.post(f"{tester.base_url}/token", json={"email": "test@example.com"})
|
||||
token = token_response.json()["access_token"]
|
||||
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
|
||||
except Exception as e:
|
||||
print(f"Could not get auth token: {e}")
|
||||
headers = {"Content-Type": "application/json"}
|
||||
|
||||
# Test basic link analysis
|
||||
request_data = {
|
||||
"url": "https://www.nbcnews.com/business"
|
||||
}
|
||||
|
||||
response = requests.post(
|
||||
f"{tester.base_url}/links/analyze",
|
||||
headers=headers,
|
||||
json=request_data,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
result = response.json()
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
print(f"Link analysis successful: found {total_links} links")
|
||||
|
||||
# Check for expected categories
|
||||
categories_found = []
|
||||
for category in ['internal', 'external', 'social', 'download', 'email', 'phone']:
|
||||
if category in result and result[category]:
|
||||
categories_found.append(category)
|
||||
|
||||
print(f"Link categories found: {categories_found}")
|
||||
|
||||
# Verify we have some links
|
||||
assert total_links > 0, "Should find at least one link"
|
||||
assert len(categories_found) > 0, "Should find at least one link category"
|
||||
|
||||
# Test with configuration
|
||||
request_data_with_config = {
|
||||
"url": "https://www.nbcnews.com/business",
|
||||
"config": {
|
||||
"simulate_user": True,
|
||||
"override_navigator": True,
|
||||
"word_count_threshold": 1
|
||||
}
|
||||
}
|
||||
|
||||
response_with_config = requests.post(
|
||||
f"{tester.base_url}/links/analyze",
|
||||
headers=headers,
|
||||
json=request_data_with_config,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
if response_with_config.status_code == 200:
|
||||
result_with_config = response_with_config.json()
|
||||
total_links_config = sum(len(links) for links in result_with_config.values())
|
||||
print(f"Link analysis with config: found {total_links_config} links")
|
||||
assert total_links_config > 0, "Should find links even with config"
|
||||
|
||||
print("✅ Link analysis tests passed")
|
||||
else:
|
||||
print(f"❌ Link analysis failed: {response.status_code} - {response.text}")
|
||||
# Don't fail the entire test suite for this endpoint
|
||||
print("⚠️ Link analysis test failed, but continuing with other tests")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
version = sys.argv[1] if len(sys.argv) > 1 else "basic"
|
||||
# version = "full"
|
||||
|
||||
759
tests/test_link_analysis.py
Normal file
759
tests/test_link_analysis.py
Normal file
@@ -0,0 +1,759 @@
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
import os
|
||||
from typing import Dict, Any, List
|
||||
|
||||
|
||||
class LinkAnalysisTester:
|
||||
def __init__(self, base_url: str = "http://localhost:11234"):
|
||||
self.base_url = base_url
|
||||
self.token = self.get_test_token()
|
||||
|
||||
def get_test_token(self) -> str:
|
||||
"""Get authentication token for testing"""
|
||||
try:
|
||||
# Try to get token using test email
|
||||
response = requests.post(
|
||||
f"{self.base_url}/token",
|
||||
json={"email": "test@example.com"},
|
||||
timeout=10
|
||||
)
|
||||
if response.status_code == 200:
|
||||
return response.json()["access_token"]
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback: try with common test token or skip auth for local testing
|
||||
return "test-token"
|
||||
|
||||
def analyze_links(
|
||||
self,
|
||||
url: str,
|
||||
config: Dict[str, Any] = None,
|
||||
timeout: int = 60
|
||||
) -> Dict[str, Any]:
|
||||
"""Analyze links on a webpage"""
|
||||
headers = {
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
|
||||
# Add auth if token is available
|
||||
if self.token and self.token != "test-token":
|
||||
headers["Authorization"] = f"Bearer {self.token}"
|
||||
|
||||
request_data = {"url": url}
|
||||
if config:
|
||||
request_data["config"] = config
|
||||
|
||||
response = requests.post(
|
||||
f"{self.base_url}/links/analyze",
|
||||
headers=headers,
|
||||
json=request_data,
|
||||
timeout=timeout
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
raise Exception(f"Link analysis failed: {response.status_code} - {response.text}")
|
||||
|
||||
return response.json()
|
||||
|
||||
|
||||
def test_link_analysis_basic():
|
||||
"""Test basic link analysis functionality"""
|
||||
print("\n=== Testing Basic Link Analysis ===")
|
||||
|
||||
tester = LinkAnalysisTester()
|
||||
|
||||
# Test with a simple page
|
||||
test_url = "https://httpbin.org/links/10"
|
||||
|
||||
try:
|
||||
result = tester.analyze_links(test_url)
|
||||
print(f"✅ Successfully analyzed links on {test_url}")
|
||||
|
||||
# Check response structure
|
||||
expected_categories = ['internal', 'external', 'social', 'download', 'email', 'phone']
|
||||
found_categories = [cat for cat in expected_categories if cat in result]
|
||||
|
||||
print(f"📊 Found link categories: {found_categories}")
|
||||
|
||||
# Count total links
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
print(f"🔗 Total links found: {total_links}")
|
||||
|
||||
# Verify link objects have expected fields
|
||||
for category, links in result.items():
|
||||
if links and len(links) > 0:
|
||||
sample_link = links[0]
|
||||
expected_fields = ['href', 'text']
|
||||
optional_fields = ['title', 'base_domain', 'intrinsic_score', 'contextual_score', 'total_score']
|
||||
|
||||
missing_required = [field for field in expected_fields if field not in sample_link]
|
||||
found_optional = [field for field in optional_fields if field in sample_link]
|
||||
|
||||
if missing_required:
|
||||
print(f"⚠️ Missing required fields in {category}: {missing_required}")
|
||||
else:
|
||||
print(f"✅ {category} links have proper structure (has {len(found_optional)} optional fields: {found_optional})")
|
||||
|
||||
assert total_links > 0, "Should find at least one link"
|
||||
print("✅ Basic link analysis test passed")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Basic link analysis test failed: {str(e)}")
|
||||
raise
|
||||
|
||||
|
||||
def test_link_analysis_with_config():
|
||||
"""Test link analysis with custom configuration"""
|
||||
print("\n=== Testing Link Analysis with Config ===")
|
||||
|
||||
tester = LinkAnalysisTester()
|
||||
|
||||
# Test with valid LinkPreviewConfig options
|
||||
config = {
|
||||
"include_internal": True,
|
||||
"include_external": True,
|
||||
"max_links": 50,
|
||||
"score_threshold": 0.3,
|
||||
"verbose": True
|
||||
}
|
||||
|
||||
test_url = "https://httpbin.org/links/10"
|
||||
|
||||
try:
|
||||
result = tester.analyze_links(test_url, config)
|
||||
print(f"✅ Successfully analyzed links with custom config")
|
||||
|
||||
# Verify configuration was applied
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
print(f"🔗 Links found with config: {total_links}")
|
||||
|
||||
assert total_links > 0, "Should find links even with config"
|
||||
print("✅ Config test passed")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Config test failed: {str(e)}")
|
||||
raise
|
||||
|
||||
|
||||
def test_link_analysis_complex_page():
|
||||
"""Test link analysis on a more complex page"""
|
||||
print("\n=== Testing Link Analysis on Complex Page ===")
|
||||
|
||||
tester = LinkAnalysisTester()
|
||||
|
||||
# Test with a real-world page
|
||||
test_url = "https://www.python.org"
|
||||
|
||||
try:
|
||||
result = tester.analyze_links(test_url)
|
||||
print(f"✅ Successfully analyzed links on {test_url}")
|
||||
|
||||
# Analyze link distribution
|
||||
category_counts = {}
|
||||
for category, links in result.items():
|
||||
if links:
|
||||
category_counts[category] = len(links)
|
||||
print(f"📂 {category}: {len(links)} links")
|
||||
|
||||
# Find top-scoring links
|
||||
all_links = []
|
||||
for category, links in result.items():
|
||||
if links:
|
||||
for link in links:
|
||||
link['category'] = category
|
||||
all_links.append(link)
|
||||
|
||||
if all_links:
|
||||
# Use intrinsic_score or total_score if available, fallback to 0
|
||||
top_links = sorted(all_links, key=lambda x: x.get('total_score', x.get('intrinsic_score', 0)), reverse=True)[:5]
|
||||
print("\n🏆 Top 5 links by score:")
|
||||
for i, link in enumerate(top_links, 1):
|
||||
score = link.get('total_score', link.get('intrinsic_score', 0))
|
||||
print(f" {i}. {link.get('text', 'N/A')} ({score:.2f}) - {link.get('category', 'unknown')}")
|
||||
|
||||
# Verify we found different types of links
|
||||
assert len(category_counts) > 0, "Should find at least one link category"
|
||||
print("✅ Complex page analysis test passed")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Complex page analysis test failed: {str(e)}")
|
||||
# Don't fail the test suite for network issues
|
||||
print("⚠️ This test may fail due to network connectivity issues")
|
||||
|
||||
|
||||
def test_link_analysis_scoring():
|
||||
"""Test link scoring functionality"""
|
||||
print("\n=== Testing Link Scoring ===")
|
||||
|
||||
tester = LinkAnalysisTester()
|
||||
|
||||
test_url = "https://httpbin.org/links/10"
|
||||
|
||||
try:
|
||||
result = tester.analyze_links(test_url)
|
||||
|
||||
# Analyze score distribution
|
||||
all_scores = []
|
||||
for category, links in result.items():
|
||||
if links:
|
||||
for link in links:
|
||||
# Use total_score or intrinsic_score if available
|
||||
score = link.get('total_score', link.get('intrinsic_score', 0))
|
||||
if score is not None: # Only include links that have scores
|
||||
all_scores.append(score)
|
||||
|
||||
if all_scores:
|
||||
avg_score = sum(all_scores) / len(all_scores)
|
||||
max_score = max(all_scores)
|
||||
min_score = min(all_scores)
|
||||
|
||||
print(f"📊 Score statistics:")
|
||||
print(f" Average: {avg_score:.3f}")
|
||||
print(f" Maximum: {max_score:.3f}")
|
||||
print(f" Minimum: {min_score:.3f}")
|
||||
print(f" Total links scored: {len(all_scores)}")
|
||||
|
||||
# Verify scores are in expected range
|
||||
assert all(0 <= score <= 1 for score in all_scores), "Scores should be between 0 and 1"
|
||||
print("✅ All scores are in valid range")
|
||||
|
||||
print("✅ Link scoring test passed")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Link scoring test failed: {str(e)}")
|
||||
raise
|
||||
|
||||
|
||||
def test_link_analysis_error_handling():
|
||||
"""Test error handling for invalid requests"""
|
||||
print("\n=== Testing Error Handling ===")
|
||||
|
||||
tester = LinkAnalysisTester()
|
||||
|
||||
# Test with invalid URL
|
||||
try:
|
||||
tester.analyze_links("not-a-valid-url")
|
||||
print("⚠️ Expected error for invalid URL, but got success")
|
||||
except Exception as e:
|
||||
print(f"✅ Correctly handled invalid URL: {str(e)}")
|
||||
|
||||
# Test with non-existent URL
|
||||
try:
|
||||
result = tester.analyze_links("https://this-domain-does-not-exist-12345.com")
|
||||
print("⚠️ This should have failed for non-existent domain")
|
||||
except Exception as e:
|
||||
print(f"✅ Correctly handled non-existent domain: {str(e)}")
|
||||
|
||||
print("✅ Error handling test passed")
|
||||
|
||||
|
||||
def test_link_analysis_performance():
|
||||
"""Test performance of link analysis"""
|
||||
print("\n=== Testing Performance ===")
|
||||
|
||||
tester = LinkAnalysisTester()
|
||||
|
||||
test_url = "https://httpbin.org/links/50"
|
||||
|
||||
try:
|
||||
start_time = time.time()
|
||||
result = tester.analyze_links(test_url)
|
||||
end_time = time.time()
|
||||
|
||||
duration = end_time - start_time
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
|
||||
print(f"⏱️ Analysis completed in {duration:.2f} seconds")
|
||||
print(f"🔗 Found {total_links} links")
|
||||
print(f"📈 Rate: {total_links/duration:.1f} links/second")
|
||||
|
||||
# Performance should be reasonable
|
||||
assert duration < 60, f"Analysis took too long: {duration:.2f}s"
|
||||
print("✅ Performance test passed")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Performance test failed: {str(e)}")
|
||||
raise
|
||||
|
||||
|
||||
def test_link_analysis_categorization():
|
||||
"""Test link categorization functionality"""
|
||||
print("\n=== Testing Link Categorization ===")
|
||||
|
||||
tester = LinkAnalysisTester()
|
||||
|
||||
test_url = "https://www.python.org"
|
||||
|
||||
try:
|
||||
result = tester.analyze_links(test_url)
|
||||
|
||||
# Check categorization
|
||||
categories_found = []
|
||||
for category, links in result.items():
|
||||
if links:
|
||||
categories_found.append(category)
|
||||
print(f"📂 {category}: {len(links)} links")
|
||||
|
||||
# Analyze a sample link from each category
|
||||
sample_link = links[0]
|
||||
url = sample_link.get('href', '')
|
||||
text = sample_link.get('text', '')
|
||||
score = sample_link.get('total_score', sample_link.get('intrinsic_score', 0))
|
||||
|
||||
print(f" Sample: {text[:50]}... ({url[:50]}...) - score: {score:.2f}")
|
||||
|
||||
print(f"✅ Found {len(categories_found)} link categories")
|
||||
print("✅ Categorization test passed")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Categorization test failed: {str(e)}")
|
||||
# Don't fail for network issues
|
||||
print("⚠️ This test may fail due to network connectivity issues")
|
||||
|
||||
|
||||
def test_link_analysis_all_config_options():
|
||||
"""Test all available LinkPreviewConfig options"""
|
||||
print("\n=== Testing All Configuration Options ===")
|
||||
|
||||
tester = LinkAnalysisTester()
|
||||
test_url = "https://httpbin.org/links/10"
|
||||
|
||||
# Test 1: include_internal and include_external
|
||||
print("\n🔍 Testing include_internal/include_external options...")
|
||||
|
||||
configs = [
|
||||
{
|
||||
"name": "Internal only",
|
||||
"config": {"include_internal": True, "include_external": False}
|
||||
},
|
||||
{
|
||||
"name": "External only",
|
||||
"config": {"include_internal": False, "include_external": True}
|
||||
},
|
||||
{
|
||||
"name": "Both internal and external",
|
||||
"config": {"include_internal": True, "include_external": True}
|
||||
}
|
||||
]
|
||||
|
||||
for test_case in configs:
|
||||
try:
|
||||
result = tester.analyze_links(test_url, test_case["config"])
|
||||
internal_count = len(result.get('internal', []))
|
||||
external_count = len(result.get('external', []))
|
||||
|
||||
print(f" {test_case['name']}: {internal_count} internal, {external_count} external links")
|
||||
|
||||
# Verify configuration behavior
|
||||
if test_case["config"]["include_internal"] and not test_case["config"]["include_external"]:
|
||||
assert internal_count >= 0, "Should have internal links"
|
||||
elif not test_case["config"]["include_internal"] and test_case["config"]["include_external"]:
|
||||
assert external_count >= 0, "Should have external links"
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ {test_case['name']} failed: {e}")
|
||||
|
||||
# Test 2: include_patterns and exclude_patterns
|
||||
print("\n🔍 Testing include/exclude patterns...")
|
||||
|
||||
pattern_configs = [
|
||||
{
|
||||
"name": "Include specific patterns",
|
||||
"config": {
|
||||
"include_patterns": ["*/links/*", "*/test*"],
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Exclude specific patterns",
|
||||
"config": {
|
||||
"exclude_patterns": ["*/admin*", "*/login*"],
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Both include and exclude patterns",
|
||||
"config": {
|
||||
"include_patterns": ["*"],
|
||||
"exclude_patterns": ["*/exclude*"],
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
for test_case in pattern_configs:
|
||||
try:
|
||||
result = tester.analyze_links(test_url, test_case["config"])
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
print(f" {test_case['name']}: {total_links} links found")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ {test_case['name']} failed: {e}")
|
||||
|
||||
# Test 3: Performance options (concurrency, timeout, max_links)
|
||||
print("\n🔍 Testing performance options...")
|
||||
|
||||
perf_configs = [
|
||||
{
|
||||
"name": "Low concurrency",
|
||||
"config": {
|
||||
"concurrency": 1,
|
||||
"timeout": 10,
|
||||
"max_links": 50,
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "High concurrency",
|
||||
"config": {
|
||||
"concurrency": 5,
|
||||
"timeout": 15,
|
||||
"max_links": 200,
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Very limited",
|
||||
"config": {
|
||||
"concurrency": 1,
|
||||
"timeout": 2,
|
||||
"max_links": 5,
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
for test_case in perf_configs:
|
||||
try:
|
||||
start_time = time.time()
|
||||
result = tester.analyze_links(test_url, test_case["config"])
|
||||
end_time = time.time()
|
||||
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
duration = end_time - start_time
|
||||
|
||||
print(f" {test_case['name']}: {total_links} links in {duration:.2f}s")
|
||||
|
||||
# Verify max_links constraint
|
||||
if total_links > test_case["config"]["max_links"]:
|
||||
print(f" ⚠️ Found {total_links} links, expected max {test_case['config']['max_links']}")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ {test_case['name']} failed: {e}")
|
||||
|
||||
# Test 4: Scoring and filtering options
|
||||
print("\n🔍 Testing scoring and filtering options...")
|
||||
|
||||
scoring_configs = [
|
||||
{
|
||||
"name": "No score threshold",
|
||||
"config": {
|
||||
"score_threshold": None,
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Low score threshold",
|
||||
"config": {
|
||||
"score_threshold": 0.1,
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "High score threshold",
|
||||
"config": {
|
||||
"score_threshold": 0.8,
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "With query for contextual scoring",
|
||||
"config": {
|
||||
"query": "test links",
|
||||
"score_threshold": 0.3,
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
for test_case in scoring_configs:
|
||||
try:
|
||||
result = tester.analyze_links(test_url, test_case["config"])
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
|
||||
# Check score threshold
|
||||
if test_case["config"]["score_threshold"] is not None:
|
||||
min_score = test_case["config"]["score_threshold"]
|
||||
low_score_links = 0
|
||||
|
||||
for links in result.values():
|
||||
for link in links:
|
||||
score = link.get('total_score', link.get('intrinsic_score', 0))
|
||||
if score is not None and score < min_score:
|
||||
low_score_links += 1
|
||||
|
||||
if low_score_links > 0:
|
||||
print(f" ⚠️ Found {low_score_links} links below threshold {min_score}")
|
||||
else:
|
||||
print(f" ✅ All links meet threshold {min_score}")
|
||||
|
||||
print(f" {test_case['name']}: {total_links} links")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ {test_case['name']} failed: {e}")
|
||||
|
||||
# Test 5: Verbose mode
|
||||
print("\n🔍 Testing verbose mode...")
|
||||
|
||||
try:
|
||||
result = tester.analyze_links(test_url, {
|
||||
"verbose": True,
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
})
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
print(f" Verbose mode: {total_links} links")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ Verbose mode failed: {e}")
|
||||
|
||||
print("✅ All configuration options test passed")
|
||||
|
||||
|
||||
def test_link_analysis_edge_cases():
|
||||
"""Test edge cases and error scenarios for configuration options"""
|
||||
print("\n=== Testing Edge Cases ===")
|
||||
|
||||
tester = LinkAnalysisTester()
|
||||
test_url = "https://httpbin.org/links/10"
|
||||
|
||||
# Test 1: Invalid configuration values
|
||||
print("\n🔍 Testing invalid configuration values...")
|
||||
|
||||
invalid_configs = [
|
||||
{
|
||||
"name": "Negative concurrency",
|
||||
"config": {"concurrency": -1}
|
||||
},
|
||||
{
|
||||
"name": "Zero timeout",
|
||||
"config": {"timeout": 0}
|
||||
},
|
||||
{
|
||||
"name": "Negative max_links",
|
||||
"config": {"max_links": -5}
|
||||
},
|
||||
{
|
||||
"name": "Invalid score threshold (too high)",
|
||||
"config": {"score_threshold": 1.5}
|
||||
},
|
||||
{
|
||||
"name": "Invalid score threshold (too low)",
|
||||
"config": {"score_threshold": -0.1}
|
||||
},
|
||||
{
|
||||
"name": "Both include flags false",
|
||||
"config": {"include_internal": False, "include_external": False}
|
||||
}
|
||||
]
|
||||
|
||||
for test_case in invalid_configs:
|
||||
try:
|
||||
result = tester.analyze_links(test_url, test_case["config"])
|
||||
print(f" ⚠️ {test_case['name']}: Expected to fail but succeeded")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ✅ {test_case['name']}: Correctly failed - {str(e)}")
|
||||
|
||||
# Test 2: Extreme but valid values
|
||||
print("\n🔍 Testing extreme valid values...")
|
||||
|
||||
extreme_configs = [
|
||||
{
|
||||
"name": "Very high concurrency",
|
||||
"config": {
|
||||
"concurrency": 50,
|
||||
"timeout": 30,
|
||||
"max_links": 1000,
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Very low score threshold",
|
||||
"config": {
|
||||
"score_threshold": 0.0,
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Very high score threshold",
|
||||
"config": {
|
||||
"score_threshold": 1.0,
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
for test_case in extreme_configs:
|
||||
try:
|
||||
result = tester.analyze_links(test_url, test_case["config"])
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
print(f" ✅ {test_case['name']}: {total_links} links")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ {test_case['name']} failed: {e}")
|
||||
|
||||
# Test 3: Complex pattern matching
|
||||
print("\n🔍 Testing complex pattern matching...")
|
||||
|
||||
pattern_configs = [
|
||||
{
|
||||
"name": "Multiple include patterns",
|
||||
"config": {
|
||||
"include_patterns": ["*/links/*", "*/test*", "*/httpbin*"],
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Multiple exclude patterns",
|
||||
"config": {
|
||||
"exclude_patterns": ["*/admin*", "*/login*", "*/logout*", "*/private*"],
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Overlapping include/exclude patterns",
|
||||
"config": {
|
||||
"include_patterns": ["*"],
|
||||
"exclude_patterns": ["*/admin*", "*/private*"],
|
||||
"include_internal": True,
|
||||
"include_external": True
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
for test_case in pattern_configs:
|
||||
try:
|
||||
result = tester.analyze_links(test_url, test_case["config"])
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
print(f" {test_case['name']}: {total_links} links")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ {test_case['name']} failed: {e}")
|
||||
|
||||
print("✅ Edge cases test passed")
|
||||
|
||||
|
||||
def test_link_analysis_batch():
|
||||
"""Test batch link analysis"""
|
||||
print("\n=== Testing Batch Analysis ===")
|
||||
|
||||
tester = LinkAnalysisTester()
|
||||
|
||||
test_urls = [
|
||||
"https://httpbin.org/links/10",
|
||||
"https://httpbin.org/links/5",
|
||||
"https://httpbin.org/links/2"
|
||||
]
|
||||
|
||||
try:
|
||||
results = {}
|
||||
for url in test_urls:
|
||||
print(f"🔍 Analyzing: {url}")
|
||||
result = tester.analyze_links(url)
|
||||
results[url] = result
|
||||
|
||||
# Small delay to be respectful
|
||||
time.sleep(0.5)
|
||||
|
||||
print(f"✅ Successfully analyzed {len(results)} URLs")
|
||||
|
||||
for url, result in results.items():
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
print(f" {url}: {total_links} links")
|
||||
|
||||
print("✅ Batch analysis test passed")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Batch analysis test failed: {str(e)}")
|
||||
raise
|
||||
|
||||
|
||||
def run_all_link_analysis_tests():
|
||||
"""Run all link analysis tests"""
|
||||
print("🚀 Starting Link Analysis Test Suite")
|
||||
print("=" * 50)
|
||||
|
||||
tests = [
|
||||
test_link_analysis_basic,
|
||||
test_link_analysis_with_config,
|
||||
test_link_analysis_complex_page,
|
||||
test_link_analysis_scoring,
|
||||
test_link_analysis_error_handling,
|
||||
test_link_analysis_performance,
|
||||
test_link_analysis_categorization,
|
||||
test_link_analysis_batch
|
||||
]
|
||||
|
||||
passed = 0
|
||||
failed = 0
|
||||
|
||||
for test_func in tests:
|
||||
try:
|
||||
test_func()
|
||||
passed += 1
|
||||
print(f"✅ {test_func.__name__} PASSED")
|
||||
except Exception as e:
|
||||
failed += 1
|
||||
print(f"❌ {test_func.__name__} FAILED: {str(e)}")
|
||||
|
||||
print("-" * 50)
|
||||
|
||||
print(f"\n📊 Test Results: {passed} passed, {failed} failed")
|
||||
|
||||
if failed > 0:
|
||||
print("⚠️ Some tests failed, but this may be due to network or server issues")
|
||||
return False
|
||||
|
||||
print("🎉 All tests passed!")
|
||||
return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Check if server is running
|
||||
import socket
|
||||
|
||||
def check_server(host="localhost", port=11234):
|
||||
try:
|
||||
socket.create_connection((host, port), timeout=5)
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
if not check_server():
|
||||
print("❌ Server is not running on localhost:11234")
|
||||
print("Please start the Crawl4AI server first:")
|
||||
print(" cd deploy/docker && python server.py")
|
||||
sys.exit(1)
|
||||
|
||||
success = run_all_link_analysis_tests()
|
||||
sys.exit(0 if success else 1)
|
||||
169
tests/test_link_analysis_integration.py
Normal file
169
tests/test_link_analysis_integration.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
|
||||
|
||||
def test_links_analyze_endpoint():
|
||||
"""Integration test for the /links/analyze endpoint"""
|
||||
|
||||
base_url = "http://localhost:11234"
|
||||
|
||||
# Health check
|
||||
try:
|
||||
health_response = requests.get(f"{base_url}/health", timeout=5)
|
||||
if health_response.status_code != 200:
|
||||
print("❌ Server health check failed")
|
||||
return False
|
||||
print("✅ Server health check passed")
|
||||
except Exception as e:
|
||||
print(f"❌ Cannot connect to server: {e}")
|
||||
return False
|
||||
|
||||
# Get auth token
|
||||
token = None
|
||||
try:
|
||||
token_response = requests.post(
|
||||
f"{base_url}/token",
|
||||
json={"email": "test@example.com"},
|
||||
timeout=5
|
||||
)
|
||||
if token_response.status_code == 200:
|
||||
token = token_response.json()["access_token"]
|
||||
print("✅ Authentication token obtained")
|
||||
except Exception as e:
|
||||
print(f"⚠️ Could not get auth token: {e}")
|
||||
|
||||
# Test the links/analyze endpoint
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if token:
|
||||
headers["Authorization"] = f"Bearer {token}"
|
||||
|
||||
# Test 1: Basic request
|
||||
print("\n🔍 Testing basic link analysis...")
|
||||
test_data = {
|
||||
"url": "https://httpbin.org/links/10",
|
||||
"config": {
|
||||
"include_internal": True,
|
||||
"include_external": True,
|
||||
"max_links": 50,
|
||||
"verbose": True
|
||||
}
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(
|
||||
f"{base_url}/links/analyze",
|
||||
headers=headers,
|
||||
json=test_data,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
result = response.json()
|
||||
print("✅ Basic link analysis successful")
|
||||
print(f"📄 Response structure: {list(result.keys())}")
|
||||
|
||||
# Verify response structure
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
print(f"📊 Found {total_links} total links")
|
||||
|
||||
# Debug: Show what was actually returned
|
||||
if total_links == 0:
|
||||
print("⚠️ No links found - showing full response:")
|
||||
print(json.dumps(result, indent=2))
|
||||
|
||||
# Check for expected categories
|
||||
found_categories = []
|
||||
for category in ['internal', 'external', 'social', 'download', 'email', 'phone']:
|
||||
if category in result and result[category]:
|
||||
found_categories.append(category)
|
||||
|
||||
print(f"📂 Found categories: {found_categories}")
|
||||
|
||||
# Verify link objects have required fields
|
||||
if total_links > 0:
|
||||
sample_found = False
|
||||
for category, links in result.items():
|
||||
if links:
|
||||
sample_link = links[0]
|
||||
if 'href' in sample_link and 'total_score' in sample_link:
|
||||
sample_found = True
|
||||
break
|
||||
|
||||
if sample_found:
|
||||
print("✅ Link objects have required fields")
|
||||
else:
|
||||
print("⚠️ Link objects missing required fields")
|
||||
|
||||
else:
|
||||
print(f"❌ Basic link analysis failed: {response.status_code}")
|
||||
print(f"Response: {response.text}")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Basic link analysis error: {e}")
|
||||
return False
|
||||
|
||||
# Test 2: With configuration
|
||||
print("\n🔍 Testing link analysis with configuration...")
|
||||
test_data_with_config = {
|
||||
"url": "https://httpbin.org/links/10",
|
||||
"config": {
|
||||
"include_internal": True,
|
||||
"include_external": True,
|
||||
"max_links": 50,
|
||||
"timeout": 10,
|
||||
"verbose": True
|
||||
}
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(
|
||||
f"{base_url}/links/analyze",
|
||||
headers=headers,
|
||||
json=test_data_with_config,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
result = response.json()
|
||||
total_links = sum(len(links) for links in result.values())
|
||||
print(f"✅ Link analysis with config successful ({total_links} links)")
|
||||
else:
|
||||
print(f"❌ Link analysis with config failed: {response.status_code}")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Link analysis with config error: {e}")
|
||||
return False
|
||||
|
||||
# Test 3: Error handling
|
||||
print("\n🔍 Testing error handling...")
|
||||
invalid_data = {
|
||||
"url": "not-a-valid-url"
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(
|
||||
f"{base_url}/links/analyze",
|
||||
headers=headers,
|
||||
json=invalid_data,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if response.status_code >= 400:
|
||||
print("✅ Error handling works correctly")
|
||||
else:
|
||||
print("⚠️ Expected error for invalid URL, but got success")
|
||||
|
||||
except Exception as e:
|
||||
print(f"✅ Error handling caught exception: {e}")
|
||||
|
||||
print("\n🎉 All integration tests passed!")
|
||||
return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
success = test_links_analyze_endpoint()
|
||||
sys.exit(0 if success else 1)
|
||||
Reference in New Issue
Block a user