feat: Add advanced link head extraction with three-layer scoring system (#1)
Squashed commit from the feature/link-extractor branch implementing comprehensive link analysis:

- Extract HTML head content from discovered links with parallel processing
- Three-layer scoring: Intrinsic (URL quality), Contextual (BM25), and Total scores
- New LinkExtractionConfig class for type-safe configuration
- Pattern-based filtering for internal/external links
- Comprehensive documentation and examples
This commit is contained in:
376
docs/examples/link_head_extraction_example.py
Normal file
376
docs/examples/link_head_extraction_example.py
Normal file
@@ -0,0 +1,376 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Link Head Extraction & Scoring Example
|
||||
|
||||
This example demonstrates Crawl4AI's advanced link analysis capabilities:
|
||||
1. Basic link head extraction
|
||||
2. Three-layer scoring system (intrinsic, contextual, total)
|
||||
3. Pattern-based filtering
|
||||
4. Multiple practical use cases
|
||||
|
||||
Requirements:
|
||||
- crawl4ai installed
|
||||
- Internet connection
|
||||
|
||||
Usage:
|
||||
python link_head_extraction_example.py
|
||||
"""
|
||||
|
||||
import asyncio
import re

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.async_configs import LinkExtractionConfig
|
||||
|
||||
|
||||
async def basic_link_head_extraction():
    """
    Basic example: extract head content from internal links with scoring.

    Crawls the Python docs landing page, enriches internal links with
    their <head> data, and prints the three score types for a few links.
    """
    print("🔗 Basic Link Head Extraction Example")
    print("=" * 50)

    config = CrawlerRunConfig(
        # Enable link head extraction
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,   # Process internal links
            include_external=False,  # Skip external links for this demo
            max_links=5,             # Limit to 5 links
            concurrency=3,           # Process 3 links simultaneously
            timeout=10,              # 10 second timeout per link
            query="API documentation guide",  # Query for relevance scoring
            verbose=True,            # Show detailed progress
        ),
        # Enable intrinsic link scoring
        score_links=True,
        only_text=True,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://docs.python.org/3/", config=config)

        if not result.success:
            print(f"❌ Crawl failed: {result.error_message}")
            return

        print(f"\n✅ Successfully crawled: {result.url}")

        # Only links whose head section was actually fetched
        enriched = [
            link
            for link in result.links.get("internal", [])
            if link.get("head_data") is not None
        ]
        print(f"🧠 Links with head data: {len(enriched)}")

        # Show detailed results for the first few enriched links
        for idx, link in enumerate(enriched[:3], start=1):
            print(f"\n📄 Link {idx}: {link['href']}")
            print(f" Text: '{link.get('text', 'No text')[:50]}...'")

            # Show all three score types (each may be missing)
            print(" 📊 Scores:")
            score_rows = (
                ("Intrinsic", link.get('intrinsic_score'), "{:.2f}/10.0"),
                ("Contextual", link.get('contextual_score'), "{:.3f}"),
                ("Total", link.get('total_score'), "{:.3f}"),
            )
            for label, value, fmt in score_rows:
                if value is not None:
                    print(f" • {label}: {fmt.format(value)}")

            # Show extracted head data (title + meta description)
            head_data = link.get("head_data", {})
            if head_data:
                title = head_data.get("title", "No title")
                print(f" 📰 Title: {title[:60]}...")
                description = head_data.get("meta", {}).get("description", "")
                if description:
                    print(f" 📝 Description: {description[:80]}...")
|
||||
|
||||
async def research_assistant_example():
    """
    Research Assistant: find highly relevant documentation pages.

    Crawls the scikit-learn docs, filters links by URL patterns and a
    BM25 query, then prints the five best links by total score.
    """
    print("\n\n🔍 Research Assistant Example")
    print("=" * 50)

    config = CrawlerRunConfig(
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,
            include_external=True,
            include_patterns=["*/docs/*", "*/tutorial/*", "*/guide/*"],
            exclude_patterns=["*/login*", "*/admin*"],
            query="machine learning neural networks deep learning",
            max_links=15,
            score_threshold=0.4,  # Only include high-relevance links
            concurrency=8,
            verbose=False,        # Clean output for this example
        ),
        score_links=True,
    )

    # Test with scikit-learn documentation
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://scikit-learn.org/stable/", config=config)

        if not result.success:
            print(f"❌ Research failed: {result.error_message}")
            return

        print(f"✅ Analyzed: {result.url}")

        candidates = result.links.get("internal", []) + result.links.get("external", [])

        # Keep only links scoring above 0.6, best first
        relevant = sorted(
            (link for link in candidates if link.get("total_score", 0) > 0.6),
            key=lambda link: link.get("total_score", 0),
            reverse=True,
        )

        print(f"\n🎯 Found {len(relevant)} highly relevant links:")
        print(" (Showing top 5 by relevance score)")

        for rank, link in enumerate(relevant[:5], start=1):
            score = link.get("total_score", 0)
            title = link.get("head_data", {}).get("title", "No title")
            print(f"\n{rank}. ⭐ {score:.3f} - {title[:70]}...")
            print(f" 🔗 {link['href']}")

            # Score breakdown: URL quality vs. query relevance
            intrinsic = link.get('intrinsic_score', 0)
            contextual = link.get('contextual_score', 0)
            print(f" 📊 Quality: {intrinsic:.1f}/10 | Relevance: {contextual:.3f}")
|
||||
|
||||
_HTTP_METHODS = ("GET", "POST", "PUT", "DELETE")


def _classify_endpoint(link):
    """Return the HTTP method ("GET"/"POST"/"PUT"/"DELETE") a link appears
    to document, or "OTHER" when none is mentioned.

    Tokenizes the head title and link text and matches whole words only.
    The previous substring test wrongly classified e.g. "TARGET" as GET,
    "OUTPUT"/"COMPUTE" as PUT, and "POSTING" as POST.
    Priority order (GET first) matches the original if/elif chain.
    """
    head_title = link.get("head_data", {}).get("title", "").upper()
    link_text = link.get("text", "").upper()
    # Whole-word tokens: runs of letters only
    tokens = set(re.findall(r"[A-Z]+", head_title + " " + link_text))
    for method in _HTTP_METHODS:
        if method in tokens:
            return method
    return "OTHER"


async def api_discovery_example():
    """
    API Discovery: find API endpoints and references.

    Crawls a site, keeps links matching API-ish URL patterns, and groups
    them by the HTTP method their title/text mentions.
    """
    print("\n\n🔧 API Discovery Example")
    print("=" * 50)

    config = CrawlerRunConfig(
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,
            include_patterns=["*/api/*", "*/reference/*", "*/endpoint/*"],
            exclude_patterns=["*/deprecated/*", "*/v1/*"],  # Skip old versions
            max_links=25,
            concurrency=10,
            timeout=8,
            verbose=False,
        ),
        score_links=True,
    )

    # Example with a documentation site that has API references
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://httpbin.org/", config=config)

        if not result.success:
            print(f"❌ API discovery failed: {result.error_message}")
            return

        print(f"✅ Discovered APIs at: {result.url}")

        api_links = result.links.get("internal", [])

        # Categorize by detected content (insertion order fixes display order)
        endpoints = {"GET": [], "POST": [], "PUT": [], "DELETE": [], "OTHER": []}
        for link in api_links:
            if link.get("head_data"):
                endpoints[_classify_endpoint(link)].append(link)

        # Display results
        total_found = sum(len(links) for links in endpoints.values())
        print(f"\n📡 Found {total_found} API-related links:")

        for method, links in endpoints.items():
            if links:
                print(f"\n{method} Endpoints ({len(links)}):")
                for link in links[:3]:  # Show first 3 of each type
                    title = link.get("head_data", {}).get("title", "No title")
                    score = link.get("intrinsic_score", 0)
                    print(f" • [{score:.1f}] {title[:50]}...")
                    print(f" {link['href']}")
|
||||
|
||||
async def link_quality_analysis():
    """
    Link Quality Analysis: analyze website structure and link quality.

    Crawls the Python docs, then reports average/bucketed intrinsic
    scores plus the three best and three worst links.
    """
    print("\n\n📊 Link Quality Analysis Example")
    print("=" * 50)

    config = CrawlerRunConfig(
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,
            max_links=30,  # Analyze more links for better statistics
            concurrency=15,
            timeout=6,
            verbose=False,
        ),
        score_links=True,
    )

    async with AsyncWebCrawler() as crawler:
        # Test with a content-rich site
        result = await crawler.arun("https://docs.python.org/3/", config=config)

        if not result.success:
            print(f"❌ Analysis failed: {result.error_message}")
            return

        print(f"✅ Analyzed: {result.url}")

        links = result.links.get("internal", [])

        # (link, intrinsic_score) pairs for links that were actually scored
        scored = [
            (link, link.get('intrinsic_score', 0))
            for link in links
            if link.get('intrinsic_score') is not None
        ]

        if not scored:
            print("❌ No scoring data available")
            return

        scores = [value for _, value in scored]
        avg_score = sum(scores) / len(scores)
        high_quality = sum(1 for s in scores if s >= 7.0)
        medium_quality = sum(1 for s in scores if 4.0 <= s < 7.0)
        low_quality = sum(1 for s in scores if s < 4.0)

        print("\n📈 Quality Analysis Results:")
        print(f" 📊 Average Score: {avg_score:.2f}/10.0")
        print(f" 🟢 High Quality (≥7.0): {high_quality} links")
        print(f" 🟡 Medium Quality (4.0-6.9): {medium_quality} links")
        print(f" 🔴 Low Quality (<4.0): {low_quality} links")

        # Best first; the last three entries are therefore the worst
        scored.sort(key=lambda pair: pair[1], reverse=True)

        def show(ranked):
            # Print a numbered snippet + URL for each (link, score) pair
            for position, (link, score) in enumerate(ranked, start=1):
                snippet = link.get('text', 'No text')[:40]
                print(f" {position}. [{score:.1f}] {snippet}...")
                print(f" {link['href']}")

        print("\n🏆 Top 3 Quality Links:")
        show(scored[:3])

        print("\n⚠️ Bottom 3 Quality Links:")
        show(scored[-3:])
|
||||
|
||||
async def pattern_filtering_example():
    """
    Pattern Filtering: demonstrate advanced filtering capabilities.

    Runs the same crawl three times with different include/exclude
    URL-pattern strategies and shows sample matches for each.
    """
    print("\n\n🎯 Pattern Filtering Example")
    print("=" * 50)

    # Three filtering strategies as (label, link-extraction config) pairs
    strategies = [
        (
            "Documentation Only",
            LinkExtractionConfig(
                include_internal=True,
                max_links=10,
                concurrency=5,
                verbose=False,
                include_patterns=["*/docs/*", "*/documentation/*"],
                exclude_patterns=["*/api/*"],
            ),
        ),
        (
            "API References Only",
            LinkExtractionConfig(
                include_internal=True,
                max_links=10,
                concurrency=5,
                verbose=False,
                include_patterns=["*/api/*", "*/reference/*"],
                exclude_patterns=["*/tutorial/*"],
            ),
        ),
        (
            "Exclude Admin Areas",
            LinkExtractionConfig(
                include_internal=True,
                max_links=10,
                concurrency=5,
                verbose=False,
                exclude_patterns=["*/admin/*", "*/login/*", "*/dashboard/*"],
            ),
        ),
    ]

    async with AsyncWebCrawler() as crawler:
        for label, link_config in strategies:
            print(f"\n🔍 Testing: {label}")

            run_config = CrawlerRunConfig(
                link_extraction_config=link_config,
                score_links=True,
            )
            result = await crawler.arun("https://docs.python.org/3/", config=run_config)

            if not result.success:
                print(f" ❌ Failed: {result.error_message}")
                continue

            matches = [
                link for link in result.links.get("internal", [])
                if link.get("head_data")
            ]
            print(f" 📊 Found {len(matches)} matching links")

            # Show sample matches
            for link in matches[:2]:
                title = link.get("head_data", {}).get("title", "No title")
                print(f" • {title[:50]}...")
                print(f" {link['href']}")
|
||||
|
||||
async def main():
    """
    Run all examples in sequence, reporting interruptions and errors.
    """
    print("🚀 Crawl4AI Link Head Extraction Examples")
    print("=" * 60)
    print("This will demonstrate various link analysis capabilities.\n")

    # Demos in presentation order
    demos = (
        basic_link_head_extraction,
        research_assistant_example,
        api_discovery_example,
        link_quality_analysis,
        pattern_filtering_example,
    )

    try:
        for demo in demos:
            await demo()

        print("\n" + "=" * 60)
        print("✨ All examples completed successfully!")
        print("\nNext steps:")
        print("1. Try modifying the queries and patterns above")
        print("2. Test with your own websites")
        print("3. Experiment with different score thresholds")
        print("4. Check out the full documentation for more options")

    except KeyboardInterrupt:
        print("\n⏹️ Examples interrupted by user")
    except Exception as e:
        print(f"\n💥 Error running examples: {str(e)}")
        import traceback
        traceback.print_exc()
|
||||
|
||||
# Script entry point: run every demo via a single asyncio event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -105,7 +105,366 @@ result.links = {
|
||||
|
||||
---
|
||||
|
||||
## 2. Domain Filtering
|
||||
## 2. Advanced Link Head Extraction & Scoring
|
||||
|
||||
Ever wanted to not just extract links, but also get the actual content (title, description, metadata) from those linked pages? And score them for relevance? This is exactly what Link Head Extraction does - it fetches the `<head>` section from each discovered link and scores them using multiple algorithms.
|
||||
|
||||
### 2.1 Why Link Head Extraction?
|
||||
|
||||
When you crawl a page, you get hundreds of links. But which ones are actually valuable? Link Head Extraction solves this by:
|
||||
|
||||
1. **Fetching head content** from each link (title, description, meta tags)
|
||||
2. **Scoring links intrinsically** based on URL quality, text relevance, and context
|
||||
3. **Scoring links contextually** using BM25 algorithm when you provide a search query
|
||||
4. **Combining scores intelligently** to give you a final relevance ranking
|
||||
|
||||
### 2.2 Complete Working Example
|
||||
|
||||
Here's a full example you can copy, paste, and run immediately:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkExtractionConfig
|
||||
|
||||
async def extract_link_heads_example():
|
||||
"""
|
||||
Complete example showing link head extraction with scoring.
|
||||
This will crawl a documentation site and extract head content from internal links.
|
||||
"""
|
||||
|
||||
# Configure link head extraction
|
||||
config = CrawlerRunConfig(
|
||||
# Enable link head extraction with detailed configuration
|
||||
link_extraction_config=LinkExtractionConfig(
|
||||
include_internal=True, # Extract from internal links
|
||||
include_external=False, # Skip external links for this example
|
||||
max_links=10, # Limit to 10 links for demo
|
||||
concurrency=5, # Process 5 links simultaneously
|
||||
timeout=10, # 10 second timeout per link
|
||||
query="API documentation guide", # Query for contextual scoring
|
||||
score_threshold=0.3, # Only include links scoring above 0.3
|
||||
verbose=True # Show detailed progress
|
||||
),
|
||||
# Enable intrinsic scoring (URL quality, text relevance)
|
||||
score_links=True,
|
||||
# Keep output clean
|
||||
only_text=True,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Crawl a documentation site (great for testing)
|
||||
result = await crawler.arun("https://docs.python.org/3/", config=config)
|
||||
|
||||
if result.success:
|
||||
print(f"✅ Successfully crawled: {result.url}")
|
||||
print(f"📄 Page title: {result.metadata.get('title', 'No title')}")
|
||||
|
||||
# Access links (now enhanced with head data and scores)
|
||||
internal_links = result.links.get("internal", [])
|
||||
external_links = result.links.get("external", [])
|
||||
|
||||
print(f"\n🔗 Found {len(internal_links)} internal links")
|
||||
print(f"🌍 Found {len(external_links)} external links")
|
||||
|
||||
# Count links with head data
|
||||
links_with_head = [link for link in internal_links
|
||||
if link.get("head_data") is not None]
|
||||
print(f"🧠 Links with head data extracted: {len(links_with_head)}")
|
||||
|
||||
# Show the top 3 scoring links
|
||||
print(f"\n🏆 Top 3 Links with Full Scoring:")
|
||||
for i, link in enumerate(links_with_head[:3]):
|
||||
print(f"\n{i+1}. {link['href']}")
|
||||
print(f" Link Text: '{link.get('text', 'No text')[:50]}...'")
|
||||
|
||||
# Show all three score types
|
||||
intrinsic = link.get('intrinsic_score')
|
||||
contextual = link.get('contextual_score')
|
||||
total = link.get('total_score')
|
||||
|
||||
if intrinsic is not None:
|
||||
print(f" 📊 Intrinsic Score: {intrinsic:.2f}/10.0 (URL quality & context)")
|
||||
if contextual is not None:
|
||||
print(f" 🎯 Contextual Score: {contextual:.3f} (BM25 relevance to query)")
|
||||
if total is not None:
|
||||
print(f" ⭐ Total Score: {total:.3f} (combined final score)")
|
||||
|
||||
# Show extracted head data
|
||||
head_data = link.get("head_data", {})
|
||||
if head_data:
|
||||
title = head_data.get("title", "No title")
|
||||
description = head_data.get("meta", {}).get("description", "No description")
|
||||
|
||||
print(f" 📰 Title: {title[:60]}...")
|
||||
if description:
|
||||
print(f" 📝 Description: {description[:80]}...")
|
||||
|
||||
# Show extraction status
|
||||
status = link.get("head_extraction_status", "unknown")
|
||||
print(f" ✅ Extraction Status: {status}")
|
||||
else:
|
||||
print(f"❌ Crawl failed: {result.error_message}")
|
||||
|
||||
# Run the example
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(extract_link_heads_example())
|
||||
```
|
||||
|
||||
**Expected Output:**
|
||||
```
|
||||
✅ Successfully crawled: https://docs.python.org/3/
|
||||
📄 Page title: 3.13.5 Documentation
|
||||
🔗 Found 53 internal links
|
||||
🌍 Found 1 external links
|
||||
🧠 Links with head data extracted: 10
|
||||
|
||||
🏆 Top 3 Links with Full Scoring:
|
||||
|
||||
1. https://docs.python.org/3.15/
|
||||
Link Text: 'Python 3.15 (in development)...'
|
||||
📊 Intrinsic Score: 4.17/10.0 (URL quality & context)
|
||||
🎯 Contextual Score: 1.000 (BM25 relevance to query)
|
||||
⭐ Total Score: 5.917 (combined final score)
|
||||
📰 Title: 3.15.0a0 Documentation...
|
||||
📝 Description: The official Python documentation...
|
||||
✅ Extraction Status: valid
|
||||
```
|
||||
|
||||
### 2.3 Configuration Deep Dive
|
||||
|
||||
The `LinkExtractionConfig` class supports these options:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import LinkExtractionConfig
|
||||
|
||||
link_extraction_config = LinkExtractionConfig(
|
||||
# BASIC SETTINGS
|
||||
verbose=True, # Show detailed logs (recommended for learning)
|
||||
|
||||
# LINK FILTERING
|
||||
include_internal=True, # Include same-domain links
|
||||
include_external=True, # Include different-domain links
|
||||
max_links=50, # Maximum links to process (prevents overload)
|
||||
|
||||
# PATTERN FILTERING
|
||||
include_patterns=[ # Only process links matching these patterns
|
||||
"*/docs/*",
|
||||
"*/api/*",
|
||||
"*/reference/*"
|
||||
],
|
||||
exclude_patterns=[ # Skip links matching these patterns
|
||||
"*/login*",
|
||||
"*/admin*"
|
||||
],
|
||||
|
||||
# PERFORMANCE SETTINGS
|
||||
concurrency=10, # How many links to process simultaneously
|
||||
timeout=5, # Seconds to wait per link
|
||||
|
||||
# RELEVANCE SCORING
|
||||
query="machine learning API", # Query for BM25 contextual scoring
|
||||
score_threshold=0.3, # Only include links above this score
|
||||
)
|
||||
```
|
||||
|
||||
### 2.4 Understanding the Three Score Types
|
||||
|
||||
Each extracted link gets three different scores:
|
||||
|
||||
#### 1. **Intrinsic Score (0-10)** - URL and Content Quality
|
||||
Based on URL structure, link text quality, and page context:
|
||||
|
||||
```python
|
||||
# High intrinsic score indicators:
|
||||
# ✅ Clean URL structure (docs.python.org/api/reference)
|
||||
# ✅ Meaningful link text ("API Reference Guide")
|
||||
# ✅ Relevant to page context
|
||||
# ✅ Not buried deep in navigation
|
||||
|
||||
# Low intrinsic score indicators:
|
||||
# ❌ Random URLs (site.com/x7f9g2h)
|
||||
# ❌ No link text or generic text ("Click here")
|
||||
# ❌ Unrelated to page content
|
||||
```
|
||||
|
||||
#### 2. **Contextual Score (0-1)** - BM25 Relevance to Query
|
||||
Only available when you provide a `query`. Uses BM25 algorithm against head content:
|
||||
|
||||
```python
|
||||
# Example: query = "machine learning tutorial"
|
||||
# High contextual score: Link to "Complete Machine Learning Guide"
|
||||
# Low contextual score: Link to "Privacy Policy"
|
||||
```
|
||||
|
||||
#### 3. **Total Score** - Smart Combination
|
||||
Intelligently combines intrinsic and contextual scores with fallbacks:
|
||||
|
||||
```python
|
||||
# When both scores available: (intrinsic * 0.3) + (contextual * 0.7)
|
||||
# When only intrinsic: uses intrinsic score
|
||||
# When only contextual: uses contextual score
|
||||
# When neither: not calculated
|
||||
```
|
||||
|
||||
### 2.5 Practical Use Cases
|
||||
|
||||
#### Use Case 1: Research Assistant
|
||||
Find the most relevant documentation pages:
|
||||
|
||||
```python
|
||||
async def research_assistant():
|
||||
config = CrawlerRunConfig(
|
||||
link_extraction_config=LinkExtractionConfig(
|
||||
include_internal=True,
|
||||
include_external=True,
|
||||
include_patterns=["*/docs/*", "*/tutorial/*", "*/guide/*"],
|
||||
query="machine learning neural networks",
|
||||
max_links=20,
|
||||
score_threshold=0.5, # Only high-relevance links
|
||||
verbose=True
|
||||
),
|
||||
score_links=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://scikit-learn.org/", config=config)
|
||||
|
||||
if result.success:
|
||||
# Get high-scoring links
|
||||
good_links = [link for link in result.links.get("internal", [])
|
||||
if link.get("total_score", 0) > 0.7]
|
||||
|
||||
print(f"🎯 Found {len(good_links)} highly relevant links:")
|
||||
for link in good_links[:5]:
|
||||
print(f"⭐ {link['total_score']:.3f} - {link['href']}")
|
||||
print(f" {link.get('head_data', {}).get('title', 'No title')}")
|
||||
```
|
||||
|
||||
#### Use Case 2: Content Discovery
|
||||
Find all API endpoints and references:
|
||||
|
||||
```python
|
||||
async def api_discovery():
|
||||
config = CrawlerRunConfig(
|
||||
link_extraction_config=LinkExtractionConfig(
|
||||
include_internal=True,
|
||||
include_patterns=["*/api/*", "*/reference/*"],
|
||||
exclude_patterns=["*/deprecated/*"],
|
||||
max_links=100,
|
||||
concurrency=15,
|
||||
verbose=False # Clean output
|
||||
),
|
||||
score_links=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://docs.example-api.com/", config=config)
|
||||
|
||||
if result.success:
|
||||
api_links = result.links.get("internal", [])
|
||||
|
||||
# Group by endpoint type
|
||||
endpoints = {}
|
||||
for link in api_links:
|
||||
if link.get("head_data"):
|
||||
title = link["head_data"].get("title", "")
|
||||
if "GET" in title:
|
||||
endpoints.setdefault("GET", []).append(link)
|
||||
elif "POST" in title:
|
||||
endpoints.setdefault("POST", []).append(link)
|
||||
|
||||
for method, links in endpoints.items():
|
||||
print(f"\n{method} Endpoints ({len(links)}):")
|
||||
for link in links[:3]:
|
||||
print(f" • {link['href']}")
|
||||
```
|
||||
|
||||
#### Use Case 3: Link Quality Analysis
|
||||
Analyze website structure and content quality:
|
||||
|
||||
```python
|
||||
async def quality_analysis():
|
||||
config = CrawlerRunConfig(
|
||||
link_extraction_config=LinkExtractionConfig(
|
||||
include_internal=True,
|
||||
max_links=200,
|
||||
concurrency=20,
|
||||
),
|
||||
score_links=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://your-website.com/", config=config)
|
||||
|
||||
if result.success:
|
||||
links = result.links.get("internal", [])
|
||||
|
||||
# Analyze intrinsic scores
|
||||
scores = [link.get('intrinsic_score', 0) for link in links]
|
||||
avg_score = sum(scores) / len(scores) if scores else 0
|
||||
|
||||
print(f"📊 Link Quality Analysis:")
|
||||
print(f" Average intrinsic score: {avg_score:.2f}/10.0")
|
||||
print(f" High quality links (>7.0): {len([s for s in scores if s > 7.0])}")
|
||||
print(f" Low quality links (<3.0): {len([s for s in scores if s < 3.0])}")
|
||||
|
||||
# Find problematic links
|
||||
bad_links = [link for link in links
|
||||
if link.get('intrinsic_score', 0) < 2.0]
|
||||
|
||||
if bad_links:
|
||||
print(f"\n⚠️ Links needing attention:")
|
||||
for link in bad_links[:5]:
|
||||
print(f" {link['href']} (score: {link.get('intrinsic_score', 0):.1f})")
|
||||
```
|
||||
|
||||
### 2.6 Performance Tips
|
||||
|
||||
1. **Start Small**: Begin with `max_links: 10` to understand the feature
|
||||
2. **Use Patterns**: Filter with `include_patterns` to focus on relevant sections
|
||||
3. **Adjust Concurrency**: Higher concurrency = faster but more resource usage
|
||||
4. **Set Timeouts**: Use `timeout: 5` to prevent hanging on slow sites
|
||||
5. **Use Score Thresholds**: Filter out low-quality links with `score_threshold`
|
||||
|
||||
### 2.7 Troubleshooting
|
||||
|
||||
**No head data extracted?**
|
||||
```python
|
||||
# Check your configuration:
|
||||
config = CrawlerRunConfig(
|
||||
link_extraction_config=LinkExtractionConfig(
|
||||
verbose=True # ← Enable to see what's happening
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
**Scores showing as None?**
|
||||
```python
|
||||
# Make sure scoring is enabled:
|
||||
config = CrawlerRunConfig(
|
||||
score_links=True, # ← Enable intrinsic scoring
|
||||
link_extraction_config=LinkExtractionConfig(
|
||||
query="your search terms" # ← For contextual scoring
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
**Process taking too long?**
|
||||
```python
|
||||
# Optimize performance:
|
||||
link_extraction_config = LinkExtractionConfig(
|
||||
max_links=20, # ← Reduce number
|
||||
concurrency=10, # ← Increase parallelism
|
||||
timeout=3, # ← Shorter timeout
|
||||
include_patterns=["*/important/*"] # ← Focus on key areas
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Domain Filtering
|
||||
|
||||
Some websites contain hundreds of third-party or affiliate links. You can filter out certain domains at **crawl time** by configuring the crawler. The most relevant parameters in `CrawlerRunConfig` are:
|
||||
|
||||
@@ -114,7 +473,7 @@ Some websites contain hundreds of third-party or affiliate links. You can filter
|
||||
- **`exclude_social_media_links`**: If `True`, automatically skip known social platforms.
|
||||
- **`exclude_domains`**: Provide a list of custom domains you want to exclude (e.g., `["spammyads.com", "tracker.net"]`).
|
||||
|
||||
### 2.1 Example: Excluding External & Social Media Links
|
||||
### 3.1 Example: Excluding External & Social Media Links
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
@@ -143,7 +502,7 @@ if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### 2.2 Example: Excluding Specific Domains
|
||||
### 3.2 Example: Excluding Specific Domains
|
||||
|
||||
If you want to let external links in, but specifically exclude a domain (e.g., `suspiciousads.com`), do this:
|
||||
|
||||
@@ -157,9 +516,9 @@ This approach is handy when you still want external links but need to block cert
|
||||
|
||||
---
|
||||
|
||||
## 3. Media Extraction
|
||||
## 4. Media Extraction
|
||||
|
||||
### 3.1 Accessing `result.media`
|
||||
### 4.1 Accessing `result.media`
|
||||
|
||||
By default, Crawl4AI collects images, audio, video URLs, and data tables it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`, `tables`).
|
||||
|
||||
@@ -237,7 +596,7 @@ Depending on your Crawl4AI version or scraping strategy, these dictionaries can
|
||||
|
||||
With these details, you can easily filter out or focus on certain images (for instance, ignoring images with very low scores or a different domain), or gather metadata for analytics.
|
||||
|
||||
### 3.2 Excluding External Images
|
||||
### 4.2 Excluding External Images
|
||||
|
||||
If you’re dealing with heavy pages or want to skip third-party images (advertisements, for example), you can turn on:
|
||||
|
||||
|
||||
Reference in New Issue
Block a user