Files
crawl4ai/tests/test_link_extractor.py
UncleCode 8a04351406 feat(crawl4ai): Update to version 0.7.1 with improvements and new tests
This commit includes several updates to the crawl4ai package, including changes to the browser manager and content scraping strategy. The version number has been updated to 0.7.1. Significant modifications have been made to the documentation, including updates to the release notes for version 0.7.0 and the addition of release notes for version 0.7.1. Examples and core documentation have also been updated to reflect the changes in this version. Additionally, a new simple API test has been added to the Docker tests.

These changes were made to improve the functionality of the crawl4ai package and to provide clearer, more up-to-date documentation for users. The new test will help ensure the API is working as expected.

BREAKING CHANGE: The updates to the browser manager and content scraping strategy may affect how these components interact with the rest of the package. Users should review the updated documentation for details on these changes.
2025-07-18 16:27:19 +08:00

263 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Test script for Link Extractor functionality
"""
from crawl4ai.models import Link
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai import LinkPreviewConfig
import asyncio
import sys
import os
# Add the crawl4ai directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'crawl4ai'))
async def test_link_extractor():
    """Crawl a few sample sites and print what the link extractor returned.

    Builds a ``CrawlerRunConfig`` with link preview (internal links only,
    query-based scoring) and intrinsic link scoring enabled, crawls each
    test URL with ``AsyncWebCrawler`` and prints a report per URL.

    Any exception raised while handling one URL is printed together with
    its traceback and the loop moves on to the next URL, so one failing
    site does not stop the whole run.
    """
    print("🔗 Testing Link Extractor Functionality")
    print("=" * 50)

    # Test configuration with link extraction AND scoring enabled
    config = CrawlerRunConfig(
        link_preview_config=LinkPreviewConfig(
            include_internal=True,
            include_external=False,  # Only internal links for this test
            # No include/exclude patterns for first test - let's see what we get
            query="API documentation reference guide",
            score_threshold=0.3,
            concurrency=5,
            timeout=10,
            max_links=5,  # Just test with 5 links first
            verbose=True,  # Show detailed progress
        ),
        score_links=True,  # Enable intrinsic link scoring
        only_text=True,
        verbose=True,
    )

    # Test URLs
    test_urls = [
        "https://docs.python.org/3/",  # Python docs - should have many internal links
        "https://httpbin.org/",  # Simple site for testing
    ]

    async with AsyncWebCrawler() as crawler:
        for url in test_urls:
            print(f"\n🌐 Testing URL: {url}")
            print("-" * 40)
            try:
                result = await crawler.arun(url, config=config)

                # Debug: check that the link extraction config is being passed
                print(f"🔍 Debug - Link extraction config: {config.link_preview_config.to_dict() if config.link_preview_config else None}")
                print(f"🔍 Debug - Score links: {config.score_links}")

                if result.success:
                    _report_successful_crawl(result)
                else:
                    print(f"❌ Crawl failed: {result.error_message}")
            except Exception as e:
                # Deliberately broad: this is a diagnostic script, so report
                # the failure in full and continue with the next URL.
                print(f"💥 Error testing {url}: {str(e)}")
                import traceback
                traceback.print_exc()


def _split_links(links):
    """Return ``(internal, external)`` link lists from a result's ``links``.

    ``links`` is either a dict whose ``'internal'`` / ``'external'`` entries
    are lists of keyword dicts (each is turned into a ``Link``), or an object
    exposing ``internal`` and ``external`` attributes, which are returned
    unchanged.
    """
    if isinstance(links, dict):
        internal = [Link(**item) for item in links.get('internal', [])]
        external = [Link(**item) for item in links.get('external', [])]
        return internal, external
    return links.internal, links.external


def _print_link_scores(link, prefix):
    """Print the intrinsic, contextual and total score of ``link``.

    Each output line starts with ``prefix``. A score attribute that is
    missing or ``None`` is reported as "Not available".
    """
    intrinsic = getattr(link, 'intrinsic_score', None)
    if intrinsic is None:
        print(f"{prefix}Intrinsic Score: Not available")
    elif intrinsic == float('inf'):
        # An infinite intrinsic score is reported as "scoring disabled".
        print(f"{prefix}Intrinsic Score: ∞ (scoring disabled)")
    else:
        print(f"{prefix}Intrinsic Score: {intrinsic:.2f}/10.0")

    # Contextual score (BM25)
    contextual = getattr(link, 'contextual_score', None)
    if contextual is None:
        print(f"{prefix}Contextual Score: Not available")
    else:
        print(f"{prefix}Contextual Score: {contextual:.3f}")

    total = getattr(link, 'total_score', None)
    if total is None:
        print(f"{prefix}Total Score: Not available")
    else:
        print(f"{prefix}Total Score: {total:.3f}")


def _print_head_data(head_data):
    """Print the title, meta description and link-type keys found in ``head_data``."""
    title = head_data.get('title', 'No title')
    if title:
        print(f" Title: {title[:60]}...")

    description = head_data.get('meta', {}).get('description')
    if description:
        print(f" Description: {description[:80]}...")

    # Show link metadata keys (at most three)
    link_data = head_data.get('link', {})
    if link_data:
        keys = list(link_data)[:3]
        print(f" Link types: {keys}")


def _report_successful_crawl(result):
    """Print link counts, sample scores, head data and failed extractions.

    ``result`` is a successful crawl result; its ``metadata`` and ``links``
    attributes are read.
    """
    print("✅ Crawl successful!")
    # metadata may be None (crawl4ai declares it Optional); fall back to an
    # empty dict so a missing title does not abort the rest of the report.
    metadata = result.metadata or {}
    print(f"📄 Page title: {metadata.get('title', 'No title')}")

    # Check links - handle both dict and Links object structure
    internal_links, external_links = _split_links(result.links)
    all_links = internal_links + external_links
    print(f"🔗 Found {len(internal_links)} internal links")
    print(f"🌍 Found {len(external_links)} external links")

    links_with_head = [
        link for link in all_links if getattr(link, 'head_data', None)
    ]
    print(f"🧠 Links with head data extracted: {len(links_with_head)}")

    # Show all score types for the first 3 links
    if all_links:
        print("\n🔢 Sample link scores (first 3 links):")
        for position, link in enumerate(all_links[:3], start=1):
            print(f"\n {position}. {link.href}")
            _print_link_scores(link, prefix=" ")
            print(f" Text: '{link.text[:50]}...' " if link.text else " Text: (no text)")

    # Show the first 3 links that carry head data
    if links_with_head:
        print("\n📊 Sample links with head data:")
        for position, link in enumerate(links_with_head[:3], start=1):
            print(f"\n {position}. {link.href}")
            print(f" Status: {link.head_extraction_status}")
            print(" 📊 Scoring Summary:")
            _print_link_scores(link, prefix=" • ")
            # head_data is known to be truthy here: links_with_head was
            # filtered on it above.
            _print_head_data(link.head_data)

    # Show failed extractions
    failed_links = [
        link for link in all_links
        if getattr(link, 'head_extraction_status', None) == 'failed'
    ]
    if failed_links:
        print(f"\n❌ Failed head extractions: {len(failed_links)}")
        for link in failed_links[:2]:  # Show first 2 failures
            print(f" - {link.href}")
            error = getattr(link, 'head_extraction_error', None)
            if error:
                print(f" Error: {error}")
def test_config_examples():
    """Print a catalogue of sample ``LinkPreviewConfig`` setups.

    For every sample the config's ``to_dict()`` contents are listed, followed
    by a usage snippet that shows the same settings as keyword arguments.
    """
    print("\n📚 Example Configurations")
    print("=" * 50)

    # (display name, configuration) pairs, shown in this order
    samples = (
        (
            "BM25 Scored Documentation Links",
            LinkPreviewConfig(
                include_internal=True,
                include_external=False,
                include_patterns=["*/docs/*", "*/api/*", "*/reference/*"],
                query="API documentation reference guide",
                score_threshold=0.3,
                max_links=30,
                verbose=True,
            ),
        ),
        (
            "Internal Links Only",
            LinkPreviewConfig(
                include_internal=True,
                include_external=False,
                max_links=50,
                verbose=True,
            ),
        ),
        (
            "External Links with Patterns",
            LinkPreviewConfig(
                include_internal=False,
                include_external=True,
                include_patterns=["*github.com*", "*stackoverflow.com*"],
                max_links=20,
                concurrency=10,
            ),
        ),
        (
            "High-Performance Mode",
            LinkPreviewConfig(
                include_internal=True,
                include_external=False,
                concurrency=20,
                timeout=3,
                max_links=100,
                verbose=False,
            ),
        ),
    )

    for title, preview_config in samples:
        settings = preview_config.to_dict()

        print(f"\n📝 {title}:")
        print(" Configuration:")
        for option, setting in settings.items():
            print(f" {option}: {setting}")

        print(" Usage:")
        print(" from crawl4ai import LinkPreviewConfig")
        print(" config = CrawlerRunConfig(")
        print(" link_preview_config=LinkPreviewConfig(")
        for option, setting in settings.items():
            # None-valued options are left out of the snippet entirely.
            if setting is None:
                continue
            # Strings are shown quoted; every other value (lists included,
            # empty or not) is shown via its default string form.
            shown = f"'{setting}'" if isinstance(setting, str) else f"{setting}"
            print(f" {option}={shown},")
        print(" )")
        print(" )")
def _run_all():
    """Print the example configurations, then run the live link-extractor test."""
    # Show configuration examples first
    test_config_examples()

    # Run the actual test; asyncio.run drives the coroutine to completion
    print("\n🚀 Running Link Extractor Tests...")
    asyncio.run(test_link_extractor())
    print("\n✨ Test completed!")


if __name__ == "__main__":
    _run_all()