refactor(link_extractor): rename link_extractor to link_preview and remove the old module

This change renames the link_extractor module to link_preview and removes the old module, streamlining the codebase. The removal of 395 lines of code reduces complexity and improves maintainability. Other files have been updated to reflect this change, ensuring consistency across the project.

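BREAKING CHANGE: The link_extractor module has been deleted and replaced with link_preview. Update imports accordingly: LinkExtractionConfig is now LinkPreviewConfig, and the CrawlerRunConfig argument link_extraction_config is now link_preview_config.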
This commit is contained in:
UncleCode
2025-06-27 21:54:22 +08:00
parent 5c9c305dbf
commit 539a324cf6
7 changed files with 71 additions and 71 deletions

View File

@@ -5,7 +5,7 @@ Test script for Link Extractor functionality
from crawl4ai.models import Link
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.async_configs import LinkExtractionConfig
from crawl4ai.async_configs import LinkPreviewConfig
import asyncio
import sys
import os
@@ -22,7 +22,7 @@ async def test_link_extractor():
# Test configuration with link extraction AND scoring enabled
config = CrawlerRunConfig(
link_extraction_config=LinkExtractionConfig(
link_preview_config=LinkPreviewConfig(
include_internal=True,
include_external=False, # Only internal links for this test
# No include/exclude patterns for first test - let's see what we get
@@ -53,7 +53,7 @@ async def test_link_extractor():
result = await crawler.arun(url, config=config)
# Debug: Check if link extraction config is being passed
print(f"🔍 Debug - Link extraction config: {config.link_extraction_config.to_dict() if config.link_extraction_config else None}")
print(f"🔍 Debug - Link extraction config: {config.link_preview_config.to_dict() if config.link_preview_config else None}")
print(f"🔍 Debug - Score links: {config.score_links}")
if result.success:
@@ -187,7 +187,7 @@ def test_config_examples():
examples = [
{
"name": "BM25 Scored Documentation Links",
"config": LinkExtractionConfig(
"config": LinkPreviewConfig(
include_internal=True,
include_external=False,
include_patterns=["*/docs/*", "*/api/*", "*/reference/*"],
@@ -199,7 +199,7 @@ def test_config_examples():
},
{
"name": "Internal Links Only",
"config": LinkExtractionConfig(
"config": LinkPreviewConfig(
include_internal=True,
include_external=False,
max_links=50,
@@ -208,7 +208,7 @@ def test_config_examples():
},
{
"name": "External Links with Patterns",
"config": LinkExtractionConfig(
"config": LinkPreviewConfig(
include_internal=False,
include_external=True,
include_patterns=["*github.com*", "*stackoverflow.com*"],
@@ -218,7 +218,7 @@ def test_config_examples():
},
{
"name": "High-Performance Mode",
"config": LinkExtractionConfig(
"config": LinkPreviewConfig(
include_internal=True,
include_external=False,
concurrency=20,
@@ -237,9 +237,9 @@ def test_config_examples():
print(f" {key}: {value}")
print(" Usage:")
print(" from crawl4ai.async_configs import LinkExtractionConfig")
print(" from crawl4ai.async_configs import LinkPreviewConfig")
print(" config = CrawlerRunConfig(")
print(" link_extraction_config=LinkExtractionConfig(")
print(" link_preview_config=LinkPreviewConfig(")
for key, value in config_dict.items():
if isinstance(value, str):
print(f" {key}='{value}',")