Add link analysis tests and integration tests for /links/analyze endpoint

- Implemented `test_link_analysis` in `test_docker.py` to validate link analysis functionality.
- Created `test_link_analysis.py` with comprehensive tests for link analysis, including basic functionality, configuration options, error handling, performance, and edge cases.
- Added integration tests in `test_link_analysis_integration.py` to verify the /links/analyze endpoint, including health checks, authentication, and error handling.
This commit is contained in:
AHMET YILMAZ
2025-10-14 19:24:16 +08:00
parent 8cca9704eb
commit aebf5a3694
7 changed files with 1926 additions and 0 deletions

View File

@@ -0,0 +1,315 @@
#!/usr/bin/env python3
"""
Link Analysis Example
====================
This example demonstrates how to use the new /links/analyze endpoint
to extract, analyze, and score links from web pages.
Requirements:
- Crawl4AI server running on localhost:11234
- requests library: pip install requests
"""
import requests
import json
import time
from typing import Dict, Any, List
class LinkAnalyzer:
    """Simple client for the ``/links/analyze`` endpoint.

    Attributes:
        base_url: Root URL of the server the requests are sent to.
        token: Bearer token for the ``Authorization`` header. The
            placeholder value ``"test-token"`` means no real token could be
            obtained; in that case no ``Authorization`` header is sent.
    """

    def __init__(self, base_url: str = "http://localhost:11234", token: str = None):
        """Store the server URL and resolve a token.

        Args:
            base_url: Root URL of the server.
            token: Token to use. When falsy, a test token is requested from
                the server's ``/token`` endpoint.
        """
        self.base_url = base_url
        self.token = token or self._get_test_token()

    def _get_test_token(self) -> str:
        """Get a test token (for development only).

        Returns:
            The ``access_token`` field of a 200 response from ``/token``, or
            the placeholder ``"test-token"`` when the request fails, returns
            another status, or yields an unusable body.
        """
        try:
            response = requests.post(
                f"{self.base_url}/token",
                json={"email": "test@example.com"},
                timeout=10,
            )
            if response.status_code == 200:
                return response.json()["access_token"]
        except (requests.RequestException, KeyError, TypeError, ValueError):
            # Best effort only: network failures and malformed bodies fall
            # through to the placeholder. Deliberately NOT a bare ``except`` so
            # Ctrl-C and SystemExit still propagate.
            pass
        return "test-token"  # Fallback for local testing

    def analyze_links(self, url: str, config: Dict[str, Any] = None) -> Dict[str, Any]:
        """Analyze links on a webpage.

        Args:
            url: Page whose links should be analyzed.
            config: Optional configuration forwarded under the ``"config"``
                key of the request body; omitted when falsy.

        Returns:
            The decoded JSON body of the response.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.RequestException: On connection problems or timeout.
        """
        headers = {"Content-Type": "application/json"}
        # The placeholder token is not a real credential, so do not send it.
        if self.token and self.token != "test-token":
            headers["Authorization"] = f"Bearer {self.token}"

        data = {"url": url}
        if config:
            data["config"] = config

        response = requests.post(
            f"{self.base_url}/links/analyze",
            headers=headers,
            json=data,
            timeout=30,
        )
        response.raise_for_status()
        return response.json()

    def print_summary(self, result: Dict[str, Any]):
        """Print a summary of link analysis results.

        Prints the total link count and, for every non-empty category, its
        three highest-scoring links.

        Args:
            result: Mapping of category name to a list of link dicts. Values
                that are not lists are ignored. Link fields that are missing
                or ``None`` are shown as ``0`` / ``"No text"`` / ``"No URL"``.
        """
        print("\n" + "="*60)
        print("📊 LINK ANALYSIS SUMMARY")
        print("="*60)

        # Only list-valued entries are link categories; skipping anything else
        # keeps len() from failing on scalar fields in the payload.
        categories = {
            name: links for name, links in result.items() if isinstance(links, list)
        }

        total_links = sum(len(links) for links in categories.values())
        print(f"Total links found: {total_links}")

        def score_of(link):
            # JSON null arrives as None, which can be neither sorted nor
            # formatted with ":.2f"; treat it like a missing score.
            return link.get('total_score') or 0

        for category, links in categories.items():
            if not links:
                continue
            print(f"\n📂 {category.upper()}: {len(links)} links")
            # Show top 3 links by score
            top_links = sorted(links, key=score_of, reverse=True)[:3]
            for i, link in enumerate(top_links, 1):
                text = (link.get('text') or 'No text')[:50]
                url = (link.get('href') or 'No URL')[:60]
                # Separator added so the anchor text and URL do not run together.
                print(f" {i}. [{score_of(link):.2f}] {text} → {url}")
def example_1_basic_analysis():
    """Example 1: analyze a small test page with default settings.

    Returns:
        The analysis result, or ``None`` if the request or the summary
        printing failed.
    """
    print("\n🔍 Example 1: Basic Link Analysis")
    print("-" * 40)

    client = LinkAnalyzer()

    # A simple test page is enough for the basic case
    target = "https://httpbin.org/links/10"
    print(f"Analyzing: {target}")

    try:
        report = client.analyze_links(target)
        client.print_summary(report)
    except Exception as exc:
        print(f"❌ Error: {exc}")
        return None
    return report
def example_2_custom_config():
    """Example 2: analyze a test page while passing a custom configuration.

    Returns:
        The analysis result, or ``None`` if the request or the summary
        printing failed.
    """
    print("\n🔍 Example 2: Custom Configuration")
    print("-" * 40)

    client = LinkAnalyzer()

    # Options sent along with the request under the "config" key
    config = {
        "include_internal": True,
        "include_external": True,
        "max_links": 50,
        "timeout": 10,
        "verbose": True,
    }
    target = "https://httpbin.org/links/10"

    print(f"Analyzing with custom config: {target}")
    pretty_config = json.dumps(config, indent=2)
    print(f"Config: {pretty_config}")

    try:
        report = client.analyze_links(target, config)
        client.print_summary(report)
    except Exception as exc:
        print(f"❌ Error: {exc}")
        return None
    return report
def example_3_real_world_site():
    """Example 3: analyze a real website and list its best links.

    After the general summary, prints the five highest-scoring external
    links followed by the five highest-scoring internal links.

    Returns:
        The analysis result, or ``None`` if anything in the run failed.
    """
    print("\n🔍 Example 3: Real Website Analysis")
    print("-" * 40)

    client = LinkAnalyzer()

    # The official Python website serves as the real-world target
    target = "https://www.python.org"
    print(f"Analyzing real website: {target}")
    print("This may take a moment...")

    def by_score(entry):
        return entry.get('total_score', 0)

    # (result key, heading) pairs, in the order they are reported
    sections = (
        ('external', "\n🌐 Top External Links:"),
        ('internal', "\n🏠 Top Internal Links:"),
    )

    try:
        report = client.analyze_links(target)
        client.print_summary(report)

        # Additional analysis
        print("\n📈 DETAILED ANALYSIS")
        print("-" * 20)

        for key, heading in sections:
            candidates = report.get(key, [])
            if not candidates:
                continue
            # Rank first, then announce the section
            best = sorted(candidates, key=by_score, reverse=True)[:5]
            print(heading)
            for entry in best:
                label = entry.get('text', 'N/A')
                score = entry.get('total_score', 0)
                print(f"{label} (score: {score:.2f})")
                print(f" {entry.get('href', 'N/A')}")
    except Exception as exc:
        print(f"❌ Error: {exc}")
        print("⚠️ This example may fail due to network issues")
        return None
    return report
def example_4_comparative_analysis():
    """Example 4: analyze several pages and compare their link structures.

    Each page is analyzed independently; a failure on one page is reported
    and does not stop the others. The comparison (total link count and mean
    link score per page) is printed only when more than one page succeeded.
    """
    print("\n🔍 Example 4: Comparative Analysis")
    print("-" * 40)

    client = LinkAnalyzer()

    pages = [
        ("https://httpbin.org/links/10", "Test Page 1"),
        ("https://httpbin.org/links/5", "Test Page 2"),
    ]

    reports = {}
    for address, label in pages:
        print(f"\nAnalyzing: {label}")
        try:
            report = client.analyze_links(address)
            reports[label] = report
            link_count = sum(map(len, report.values()))
            non_empty = sum(1 for group in report.values() if group)
            print(f" Links: {link_count}, Categories: {non_empty}")
        except Exception as exc:
            print(f" ❌ Error: {exc}")

    # Nothing to compare unless at least two pages were analyzed
    if len(reports) <= 1:
        return

    print("\n📊 COMPARISON")
    print("-" * 15)

    for label, report in reports.items():
        link_count = sum(map(len, report.values()))
        print(f"{label}: {link_count} total links")

        # Mean score over every link in every category
        scores = [
            entry.get('total_score', 0)
            for group in report.values()
            for entry in group
        ]
        if scores:
            mean_score = sum(scores) / len(scores)
            print(f" Average link score: {mean_score:.3f}")
def example_5_advanced_filtering():
    """Example 5: Advanced filtering and analysis.

    Keeps only links whose ``total_score`` is at least 0.5, prints them per
    category, then lists the unique host names found among the external
    links. Any failure is reported with a single error line.
    """
    # Imported locally so the example does not depend on a module-level import.
    from urllib.parse import urlsplit

    print("\n🔍 Example 5: Advanced Filtering")
    print("-" * 40)

    analyzer = LinkAnalyzer()
    url = "https://httpbin.org/links/10"

    try:
        result = analyzer.analyze_links(url)

        # Filter links by score
        min_score = 0.5
        high_quality_links = {}
        for category, links in result.items():
            if links:
                filtered = [link for link in links if link.get('total_score', 0) >= min_score]
                if filtered:
                    high_quality_links[category] = filtered

        print(f"\n🎯 High-quality links (score >= {min_score}):")
        total_high_quality = sum(len(links) for links in high_quality_links.values())
        print(f"Total: {total_high_quality} links")

        for category, links in high_quality_links.items():
            print(f"\n{category.upper()}:")
            for link in links:
                score = link.get('total_score', 0)
                text = link.get('text', 'No text')
                print(f" • [{score:.2f}] {text}")

        # Extract unique domains from external links
        external_links = result.get('external', [])
        if external_links:
            domains = set()
            for link in external_links:
                # A separate name keeps the page URL above from being re-bound,
                # and `or ''` tolerates a null href.
                href = link.get('href') or ''
                # urlsplit copes with ports, credentials and URLs that have a
                # query but no path; .hostname is lower-cased, so the same
                # host is not counted twice.
                host = urlsplit(href).hostname
                if host:
                    domains.add(host)

            print(f"\n🌐 Unique external domains: {len(domains)}")
            for domain in sorted(domains):
                print(f"{domain}")

    except Exception as e:
        print(f"❌ Error: {e}")
def main():
    """Run all examples in order, pausing for Enter between them.

    A failing example is reported and the run continues with the next one.
    Ctrl-C, either during an example or at the pause prompt, stops the run.
    When stdin is closed or non-interactive the pause is skipped instead of
    aborting the whole run.
    """
    print("🚀 Link Analysis Examples")
    print("=" * 50)
    print("Make sure the Crawl4AI server is running on localhost:11234")
    print()

    examples = [
        example_1_basic_analysis,
        example_2_custom_config,
        example_3_real_world_site,
        example_4_comparative_analysis,
        example_5_advanced_filtering,
    ]

    for i, example_func in enumerate(examples, 1):
        print(f"\n{'='*60}")
        print(f"Running Example {i}")
        print('='*60)

        try:
            example_func()
        except KeyboardInterrupt:
            print("\n⏹️ Example interrupted by user")
            break
        except Exception as e:
            print(f"\n❌ Example {i} failed: {e}")

        # No pause after the last example
        if i < len(examples):
            print("\n⏳ Press Enter to continue to next example...")
            try:
                input()
            except EOFError:
                # input() raises EOFError when stdin is closed (piped or CI
                # runs); there is nobody to wait for, so just carry on.
                pass
            except KeyboardInterrupt:
                break

    print("\n🎉 Examples completed!")


# Run the examples only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,523 @@
# Link Analysis and Scoring
## Introduction
**Link Analysis** is a powerful feature that extracts, analyzes, and scores all links found on a webpage. This endpoint helps you understand the link structure, identify high-value links, and get insights into the connectivity patterns of any website.
Think of it as a smart link discovery tool that not only extracts links but also evaluates their importance, relevance, and quality through advanced scoring algorithms.
## Key Concepts
### What Link Analysis Does
When you analyze a webpage, the system:
1. **Extracts All Links** - Finds every hyperlink on the page
2. **Scores Links** - Assigns relevance scores based on multiple factors
3. **Categorizes Links** - Groups links by type (internal, external, etc.)
4. **Provides Metadata** - URL, anchor text, attributes, and context information
5. **Ranks by Importance** - Orders links from most to least valuable
### Scoring Factors
The link scoring algorithm considers:
- **Text Content**: Link anchor text relevance and descriptiveness
- **URL Structure**: Depth, parameters, and path patterns
- **Context**: Surrounding text and page position
- **Attributes**: Title, rel attributes, and other metadata
- **Link Type**: Internal vs external classification
## Quick Start
### Basic Usage
```python
import requests
# Analyze links on a webpage
response = requests.post(
"http://localhost:8000/links/analyze",
headers={"Authorization": "Bearer YOUR_TOKEN"},
json={
"url": "https://example.com"
}
)
result = response.json()
print(f"Found {len(result.get('internal', []))} internal links")
print(f"Found {len(result.get('external', []))} external links")
# Show top 3 links by score
for link_type in ['internal', 'external']:
if link_type in result:
top_links = sorted(result[link_type], key=lambda x: x.get('score', 0), reverse=True)[:3]
print(f"\nTop {link_type} links:")
for link in top_links:
print(f"- {link.get('url', 'N/A')} (score: {link.get('score', 0):.2f})")
```
### With Custom Configuration
```python
response = requests.post(
"http://localhost:8000/links/analyze",
headers={"Authorization": "Bearer YOUR_TOKEN"},
json={
"url": "https://news.example.com",
"config": {
"force": False, # Set to True to bypass the cache and force a fresh crawl
"wait_for": 2.0, # Wait for dynamic content
"simulate_user": True, # User-like browsing
"override_navigator": True # Custom user agent
}
}
)
```
## Configuration Options
The `config` parameter accepts a `LinkPreviewConfig` dictionary:
### Basic Options
```python
config = {
"force": False, # Force fresh crawl (default: False)
"wait_for": None, # CSS selector or timeout in seconds
"simulate_user": True, # Simulate human behavior
"override_navigator": True, # Override browser navigator
"headers": { # Custom headers
"Accept-Language": "en-US,en;q=0.9"
}
}
```
### Advanced Options
```python
config = {
# Timing and behavior
"delay_before_return_html": 0.5, # Delay before HTML extraction
"js_code": ["window.scrollTo(0, document.body.scrollHeight)"], # JS to execute
# Content processing
"word_count_threshold": 1, # Minimum word count
"exclusion_patterns": [ # Link patterns to exclude
r".*/logout.*",
r".*/admin.*"
],
# Caching and session
"session_id": "my-session-123", # Session identifier
"magic": False # Magic link processing
}
```
## Response Structure
The endpoint returns a JSON object with categorized links:
```json
{
"internal": [
{
"url": "https://example.com/about",
"text": "About Us",
"title": "Learn about our company",
"score": 0.85,
"context": "footer navigation",
"attributes": {
"rel": ["nofollow"],
"target": "_blank"
}
}
],
"external": [
{
"url": "https://partner-site.com",
"text": "Partner Site",
"title": "Visit our partner",
"score": 0.72,
"context": "main content",
"attributes": {}
}
],
"social": [...],
"download": [...],
"email": [...],
"phone": [...]
}
```
### Link Categories
| Category | Description | Example |
|----------|-------------|---------|
| **internal** | Links within the same domain | `/about`, `https://example.com/contact` |
| **external** | Links to different domains | `https://google.com` |
| **social** | Social media platform links | `https://twitter.com/user` |
| **download** | File download links | `/files/document.pdf` |
| **email** | Email addresses | `mailto:contact@example.com` |
| **phone** | Phone numbers | `tel:+1234567890` |
### Link Metadata
Each link object contains:
```python
{
"url": str, # The actual href value
"text": str, # Anchor text content
"title": str, # Title attribute (if any)
"score": float, # Relevance score (0.0-1.0)
"context": str, # Where the link was found
"attributes": dict, # All HTML attributes
"hash": str, # URL fragment (if any)
"domain": str, # Extracted domain name
"scheme": str, # URL scheme (http/https/etc)
}
```
## Practical Examples
### SEO Audit Tool
```python
def seo_audit(url: str):
"""Perform SEO link analysis on a webpage"""
response = requests.post(
"http://localhost:8000/links/analyze",
headers={"Authorization": "Bearer YOUR_TOKEN"},
json={"url": url}
)
result = response.json()
print(f"📊 SEO Audit for {url}")
print(f"Internal links: {len(result.get('internal', []))}")
print(f"External links: {len(result.get('external', []))}")
# Check for SEO issues
internal_links = result.get('internal', [])
external_links = result.get('external', [])
# Find links with low scores
low_score_links = [link for link in internal_links if link.get('score', 0) < 0.3]
if low_score_links:
print(f"⚠️ Found {len(low_score_links)} low-quality internal links")
# Find external opportunities
high_value_external = [link for link in external_links if link.get('score', 0) > 0.7]
if high_value_external:
print(f"✅ Found {len(high_value_external)} high-value external links")
return result
# Usage
audit_result = seo_audit("https://example.com")
```
### Competitor Analysis
```python
def competitor_analysis(urls: list):
"""Analyze link patterns across multiple competitor sites"""
all_results = {}
for url in urls:
response = requests.post(
"http://localhost:8000/links/analyze",
headers={"Authorization": "Bearer YOUR_TOKEN"},
json={"url": url}
)
all_results[url] = response.json()
# Compare external link strategies
print("🔍 Competitor Link Analysis")
for url, result in all_results.items():
external_links = result.get('external', [])
avg_score = sum(link.get('score', 0) for link in external_links) / len(external_links) if external_links else 0
print(f"{url}: {len(external_links)} external links (avg score: {avg_score:.2f})")
return all_results
# Usage
competitors = [
"https://competitor1.com",
"https://competitor2.com",
"https://competitor3.com"
]
analysis = competitor_analysis(competitors)
```
### Content Discovery
```python
def discover_related_content(start_url: str, max_depth: int = 2):
"""Discover related content through link analysis"""
visited = set()
queue = [(start_url, 0)]
while queue and len(visited) < 20:
current_url, depth = queue.pop(0)
if current_url in visited or depth > max_depth:
continue
visited.add(current_url)
try:
response = requests.post(
"http://localhost:8000/links/analyze",
headers={"Authorization": "Bearer YOUR_TOKEN"},
json={"url": current_url}
)
result = response.json()
internal_links = result.get('internal', [])
# Sort by score and add top links to queue
top_links = sorted(internal_links, key=lambda x: x.get('score', 0), reverse=True)[:3]
for link in top_links:
if link['url'] not in visited:
queue.append((link['url'], depth + 1))
print(f"🔗 Found: {link['text']} ({link['score']:.2f})")
except Exception as e:
print(f"❌ Error analyzing {current_url}: {e}")
return visited
# Usage
related_pages = discover_related_content("https://blog.example.com")
print(f"Discovered {len(related_pages)} related pages")
```
## Best Practices
### 1. Request Optimization
```python
# ✅ Good: Use appropriate timeouts
response = requests.post(
"http://localhost:8000/links/analyze",
headers={"Authorization": "Bearer YOUR_TOKEN"},
json={"url": url},
timeout=30 # 30 second timeout
)
# ✅ Good: Configure wait times for dynamic sites
config = {
"wait_for": 2.0, # Wait for JavaScript to load
"simulate_user": True
}
```
### 2. Error Handling
```python
def safe_link_analysis(url: str):
try:
response = requests.post(
"http://localhost:8000/links/analyze",
headers={"Authorization": "Bearer YOUR_TOKEN"},
json={"url": url},
timeout=30
)
if response.status_code == 200:
return response.json()
elif response.status_code == 400:
print("❌ Invalid request format")
elif response.status_code == 500:
print("❌ Server error during analysis")
else:
print(f"❌ Unexpected status code: {response.status_code}")
except requests.Timeout:
print("⏰ Request timed out")
except requests.ConnectionError:
print("🔌 Connection error")
except Exception as e:
print(f"❌ Unexpected error: {e}")
return None
```
### 3. Data Processing
```python
def process_links_data(result: dict):
"""Process and filter link analysis results"""
# Filter by minimum score
min_score = 0.5
high_quality_links = {}
for category, links in result.items():
filtered_links = [
link for link in links
if link.get('score', 0) >= min_score
]
if filtered_links:
high_quality_links[category] = filtered_links
# Extract unique domains
domains = set()
for link in result.get('external', []):
domains.add(link.get('domain', ''))
return {
'filtered_links': high_quality_links,
'unique_domains': list(domains),
'total_links': sum(len(links) for links in result.values())
}
```
## Performance Considerations
### Response Times
- **Simple pages**: 2-5 seconds
- **Complex pages**: 5-15 seconds
- **JavaScript-heavy**: 10-30 seconds
### Rate Limiting
The endpoint includes built-in rate limiting. For bulk analysis, pause between requests so you stay under the limit:
```python
import time
def bulk_link_analysis(urls: list, delay: float = 1.0):
"""Analyze multiple URLs with rate limiting"""
results = {}
for url in urls:
result = safe_link_analysis(url)
if result:
results[url] = result
# Respect rate limits
time.sleep(delay)
return results
```
## Error Handling
### Common Errors and Solutions
| Error Code | Cause | Solution |
|------------|-------|----------|
| **400** | Invalid URL or config | Check URL format and config structure |
| **401** | Invalid authentication | Verify your API token |
| **429** | Rate limit exceeded | Add delays between requests |
| **500** | Crawl failure | Check if site is accessible |
| **503** | Service unavailable | Try again later |
### Debug Mode
```python
# Tag requests with a recognizable User-Agent so they are easy to spot while debugging
config = {
"headers": {
"User-Agent": "Crawl4AI-Debug/1.0"
}
}
# Print the server's error response body when the request fails
try:
response = requests.post(
"http://localhost:8000/links/analyze",
headers={"Authorization": "Bearer YOUR_TOKEN"},
json={"url": url, "config": config}
)
response.raise_for_status()
except requests.HTTPError as e:
print(f"Error details: {e.response.text}")
```
## API Reference
### Endpoint Details
- **URL**: `/links/analyze`
- **Method**: `POST`
- **Content-Type**: `application/json`
- **Authentication**: Bearer token required
### Request Schema
```python
{
"url": str, # Required: URL to analyze
"config": { # Optional: LinkPreviewConfig
"force": bool,
"wait_for": str | float, # CSS selector or timeout in seconds
"simulate_user": bool,
"override_navigator": bool,
"headers": dict,
"js_code": list,
"delay_before_return_html": float,
"word_count_threshold": int,
"exclusion_patterns": list,
"session_id": str,
"magic": bool
}
}
```
### Response Schema
```python
{
"internal": [LinkObject],
"external": [LinkObject],
"social": [LinkObject],
"download": [LinkObject],
"email": [LinkObject],
"phone": [LinkObject]
}
```
### LinkObject Schema
```python
{
"url": str,
"text": str,
"title": str,
"score": float,
"context": str,
"attributes": dict,
"hash": str,
"domain": str,
"scheme": str
}
```
## Next Steps
- Learn about [Advanced Link Processing](../advanced/link-processing.md)
- Explore the [Link Preview Configuration](../api/link-preview-config.md)
- See more [Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/link-analysis)
## FAQ
**Q: How is the link score calculated?**
A: The score considers multiple factors including anchor text relevance, URL structure, page context, and link attributes. Scores range from 0.0 (lowest quality) to 1.0 (highest quality).
**Q: Can I analyze password-protected pages?**
A: Yes! Use the `js_code` parameter to handle authentication, or include session cookies in the `headers` configuration.
**Q: How many links can I analyze at once?**
A: There's no hard limit on the number of links per page, but very large pages (>10,000 links) may take longer to process.
**Q: Can I filter out certain types of links?**
A: Use the `exclusion_patterns` parameter in the config to filter out unwanted links using regex patterns.
**Q: Does this work with JavaScript-heavy sites?**
A: Absolutely! The crawler waits for JavaScript execution and can even run custom JavaScript using the `js_code` parameter.