refactor(link_extractor): rename link_extractor module to link_preview
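This change renames the link_extractor module to link_preview, streamlining the codebase: the LinkExtractor class becomes LinkPreview, LinkExtractionConfig becomes LinkPreviewConfig, and the link_extraction_config parameter becomes link_preview_config. Deleting the old module removes 395 lines of code, which reduces complexity and improves maintainability. Other files have been updated to reflect the rename, ensuring consistency across the project. BREAKING CHANGE: The link_extractor module has been deleted and replaced with link_preview. Update imports, and any uses of the renamed classes and config parameter, accordingly.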
This commit is contained in:
@@ -37,7 +37,7 @@ from .content_filter_strategy import (
|
||||
)
|
||||
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
|
||||
from .components.crawler_monitor import CrawlerMonitor
|
||||
from .link_extractor import LinkExtractor
|
||||
from .link_preview import LinkPreview
|
||||
from .async_dispatcher import (
|
||||
MemoryAdaptiveDispatcher,
|
||||
SemaphoreDispatcher,
|
||||
@@ -142,7 +142,7 @@ __all__ = [
|
||||
"SemaphoreDispatcher",
|
||||
"RateLimiter",
|
||||
"CrawlerMonitor",
|
||||
"LinkExtractor",
|
||||
"LinkPreview",
|
||||
"DisplayMode",
|
||||
"MarkdownGenerationResult",
|
||||
"Crawl4aiDockerClient",
|
||||
|
||||
@@ -594,7 +594,7 @@ class BrowserConfig:
|
||||
return config
|
||||
return BrowserConfig.from_kwargs(config)
|
||||
|
||||
class LinkExtractionConfig:
|
||||
class LinkPreviewConfig:
|
||||
"""Configuration for link head extraction and scoring."""
|
||||
|
||||
def __init__(
|
||||
@@ -649,12 +649,12 @@ class LinkExtractionConfig:
|
||||
raise ValueError("At least one of include_internal or include_external must be True")
|
||||
|
||||
@staticmethod
|
||||
def from_dict(config_dict: Dict[str, Any]) -> "LinkExtractionConfig":
|
||||
"""Create LinkExtractionConfig from dictionary (for backward compatibility)."""
|
||||
def from_dict(config_dict: Dict[str, Any]) -> "LinkPreviewConfig":
|
||||
"""Create LinkPreviewConfig from dictionary (for backward compatibility)."""
|
||||
if not config_dict:
|
||||
return None
|
||||
|
||||
return LinkExtractionConfig(
|
||||
return LinkPreviewConfig(
|
||||
include_internal=config_dict.get("include_internal", True),
|
||||
include_external=config_dict.get("include_external", False),
|
||||
include_patterns=config_dict.get("include_patterns"),
|
||||
@@ -682,11 +682,11 @@ class LinkExtractionConfig:
|
||||
"verbose": self.verbose
|
||||
}
|
||||
|
||||
def clone(self, **kwargs) -> "LinkExtractionConfig":
|
||||
def clone(self, **kwargs) -> "LinkPreviewConfig":
|
||||
"""Create a copy with updated values."""
|
||||
config_dict = self.to_dict()
|
||||
config_dict.update(kwargs)
|
||||
return LinkExtractionConfig.from_dict(config_dict)
|
||||
return LinkPreviewConfig.from_dict(config_dict)
|
||||
|
||||
|
||||
class HTTPCrawlerConfig:
|
||||
@@ -925,7 +925,7 @@ class CrawlerRunConfig():
|
||||
exclude_internal_links (bool): If True, exclude internal links from the results.
|
||||
Default: False.
|
||||
score_links (bool): If True, calculate intrinsic quality scores for all links using URL structure,
|
||||
text quality, and contextual relevance metrics. Separate from link_extraction_config.
|
||||
text quality, and contextual relevance metrics. Separate from link_preview_config.
|
||||
Default: False.
|
||||
|
||||
# Debugging and Logging Parameters
|
||||
@@ -1055,7 +1055,7 @@ class CrawlerRunConfig():
|
||||
# Deep Crawl Parameters
|
||||
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
|
||||
# Link Extraction Parameters
|
||||
link_extraction_config: Union[LinkExtractionConfig, Dict[str, Any]] = None,
|
||||
link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None,
|
||||
# Experimental Parameters
|
||||
experimental: Dict[str, Any] = None,
|
||||
):
|
||||
@@ -1187,15 +1187,15 @@ class CrawlerRunConfig():
|
||||
self.deep_crawl_strategy = deep_crawl_strategy
|
||||
|
||||
# Link Extraction Parameters
|
||||
if link_extraction_config is None:
|
||||
self.link_extraction_config = None
|
||||
elif isinstance(link_extraction_config, LinkExtractionConfig):
|
||||
self.link_extraction_config = link_extraction_config
|
||||
elif isinstance(link_extraction_config, dict):
|
||||
if link_preview_config is None:
|
||||
self.link_preview_config = None
|
||||
elif isinstance(link_preview_config, LinkPreviewConfig):
|
||||
self.link_preview_config = link_preview_config
|
||||
elif isinstance(link_preview_config, dict):
|
||||
# Convert dict to config object for backward compatibility
|
||||
self.link_extraction_config = LinkExtractionConfig.from_dict(link_extraction_config)
|
||||
self.link_preview_config = LinkPreviewConfig.from_dict(link_preview_config)
|
||||
else:
|
||||
raise ValueError("link_extraction_config must be LinkExtractionConfig object or dict")
|
||||
raise ValueError("link_preview_config must be LinkPreviewConfig object or dict")
|
||||
|
||||
# Experimental Parameters
|
||||
self.experimental = experimental or {}
|
||||
@@ -1371,7 +1371,7 @@ class CrawlerRunConfig():
|
||||
# Deep Crawl Parameters
|
||||
deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
|
||||
# Link Extraction Parameters
|
||||
link_extraction_config=kwargs.get("link_extraction_config"),
|
||||
link_preview_config=kwargs.get("link_preview_config"),
|
||||
url=kwargs.get("url"),
|
||||
# Experimental Parameters
|
||||
experimental=kwargs.get("experimental"),
|
||||
@@ -1467,7 +1467,7 @@ class CrawlerRunConfig():
|
||||
"user_agent_mode": self.user_agent_mode,
|
||||
"user_agent_generator_config": self.user_agent_generator_config,
|
||||
"deep_crawl_strategy": self.deep_crawl_strategy,
|
||||
"link_extraction_config": self.link_extraction_config.to_dict() if self.link_extraction_config else None,
|
||||
"link_preview_config": self.link_preview_config.to_dict() if self.link_preview_config else None,
|
||||
"url": self.url,
|
||||
"experimental": self.experimental,
|
||||
}
|
||||
|
||||
@@ -948,14 +948,14 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
links["external"] = list(external_links_dict.values())
|
||||
|
||||
# Extract head content for links if configured
|
||||
link_extraction_config = kwargs.get("link_extraction_config")
|
||||
if link_extraction_config is not None:
|
||||
link_preview_config = kwargs.get("link_preview_config")
|
||||
if link_preview_config is not None:
|
||||
try:
|
||||
import asyncio
|
||||
from .link_extractor import LinkExtractor
|
||||
from .link_preview import LinkPreview
|
||||
from .models import Links, Link
|
||||
|
||||
verbose = link_extraction_config.verbose
|
||||
verbose = link_preview_config.verbose
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
||||
@@ -966,17 +966,17 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
||||
links_obj = Links(internal=internal_links, external=external_links)
|
||||
|
||||
# Create a config object for LinkExtractor
|
||||
# Create a config object for LinkPreview
|
||||
class TempCrawlerRunConfig:
|
||||
def __init__(self, link_config, score_links):
|
||||
self.link_extraction_config = link_config
|
||||
self.link_preview_config = link_config
|
||||
self.score_links = score_links
|
||||
|
||||
config = TempCrawlerRunConfig(link_extraction_config, kwargs.get("score_links", False))
|
||||
config = TempCrawlerRunConfig(link_preview_config, kwargs.get("score_links", False))
|
||||
|
||||
# Extract head content (run async operation in sync context)
|
||||
async def extract_links():
|
||||
async with LinkExtractor(self.logger) as extractor:
|
||||
async with LinkPreview(self.logger) as extractor:
|
||||
return await extractor.extract_link_heads(links_obj, config)
|
||||
|
||||
# Run the async operation
|
||||
@@ -1740,21 +1740,21 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
with_tail=False,
|
||||
).strip()
|
||||
|
||||
# Create links dictionary in the format expected by LinkExtractor
|
||||
# Create links dictionary in the format expected by LinkPreview
|
||||
links = {
|
||||
"internal": list(internal_links_dict.values()),
|
||||
"external": list(external_links_dict.values()),
|
||||
}
|
||||
|
||||
# Extract head content for links if configured
|
||||
link_extraction_config = kwargs.get("link_extraction_config")
|
||||
if link_extraction_config is not None:
|
||||
link_preview_config = kwargs.get("link_preview_config")
|
||||
if link_preview_config is not None:
|
||||
try:
|
||||
import asyncio
|
||||
from .link_extractor import LinkExtractor
|
||||
from .link_preview import LinkPreview
|
||||
from .models import Links, Link
|
||||
|
||||
verbose = link_extraction_config.verbose
|
||||
verbose = link_preview_config.verbose
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
||||
@@ -1765,17 +1765,17 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
||||
links_obj = Links(internal=internal_links, external=external_links)
|
||||
|
||||
# Create a config object for LinkExtractor
|
||||
# Create a config object for LinkPreview
|
||||
class TempCrawlerRunConfig:
|
||||
def __init__(self, link_config, score_links):
|
||||
self.link_extraction_config = link_config
|
||||
self.link_preview_config = link_config
|
||||
self.score_links = score_links
|
||||
|
||||
config = TempCrawlerRunConfig(link_extraction_config, kwargs.get("score_links", False))
|
||||
config = TempCrawlerRunConfig(link_preview_config, kwargs.get("score_links", False))
|
||||
|
||||
# Extract head content (run async operation in sync context)
|
||||
async def extract_links():
|
||||
async with LinkExtractor(self.logger) as extractor:
|
||||
async with LinkPreview(self.logger) as extractor:
|
||||
return await extractor.extract_link_heads(links_obj, config)
|
||||
|
||||
# Run the async operation
|
||||
|
||||
@@ -15,7 +15,7 @@ from .models import Links, Link
|
||||
from .utils import calculate_total_score
|
||||
|
||||
|
||||
class LinkExtractor:
|
||||
class LinkPreview:
|
||||
"""
|
||||
Extracts head content from links using URLSeeder's parallel processing infrastructure.
|
||||
|
||||
@@ -29,7 +29,7 @@ class LinkExtractor:
|
||||
|
||||
def __init__(self, logger: Optional[AsyncLogger] = None):
|
||||
"""
|
||||
Initialize the LinkExtractor.
|
||||
Initialize the LinkPreview.
|
||||
|
||||
Args:
|
||||
logger: Optional logger instance for recording events
|
||||
@@ -78,12 +78,12 @@ class LinkExtractor:
|
||||
|
||||
Args:
|
||||
links: Links object containing internal and external links
|
||||
config: CrawlerRunConfig with link_extraction_config settings
|
||||
config: CrawlerRunConfig with link_preview_config settings
|
||||
|
||||
Returns:
|
||||
Links object with head_data attached to filtered Link objects
|
||||
"""
|
||||
link_config = config.link_extraction_config
|
||||
link_config = config.link_preview_config
|
||||
|
||||
# Ensure seeder is initialized
|
||||
await self.start()
|
||||
@@ -331,7 +331,7 @@ class LinkExtractor:
|
||||
intrinsic_score=updated_link.intrinsic_score,
|
||||
contextual_score=updated_link.contextual_score,
|
||||
score_links_enabled=getattr(config, 'score_links', False),
|
||||
query_provided=bool(config.link_extraction_config.query)
|
||||
query_provided=bool(config.link_preview_config.query)
|
||||
)
|
||||
|
||||
updated_internal.append(updated_link)
|
||||
@@ -369,7 +369,7 @@ class LinkExtractor:
|
||||
intrinsic_score=updated_link.intrinsic_score,
|
||||
contextual_score=updated_link.contextual_score,
|
||||
score_links_enabled=getattr(config, 'score_links', False),
|
||||
query_provided=bool(config.link_extraction_config.query)
|
||||
query_provided=bool(config.link_preview_config.query)
|
||||
)
|
||||
|
||||
updated_external.append(updated_link)
|
||||
Reference in New Issue
Block a user