diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 43181d04..dea9ff5d 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -37,7 +37,7 @@ from .content_filter_strategy import ( ) from .models import CrawlResult, MarkdownGenerationResult, DisplayMode from .components.crawler_monitor import CrawlerMonitor -from .link_extractor import LinkExtractor +from .link_preview import LinkPreview from .async_dispatcher import ( MemoryAdaptiveDispatcher, SemaphoreDispatcher, @@ -142,7 +142,7 @@ __all__ = [ "SemaphoreDispatcher", "RateLimiter", "CrawlerMonitor", - "LinkExtractor", + "LinkPreview", "DisplayMode", "MarkdownGenerationResult", "Crawl4aiDockerClient", diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 9483ca8b..313e2e01 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -594,7 +594,7 @@ class BrowserConfig: return config return BrowserConfig.from_kwargs(config) -class LinkExtractionConfig: +class LinkPreviewConfig: """Configuration for link head extraction and scoring.""" def __init__( @@ -649,12 +649,12 @@ class LinkExtractionConfig: raise ValueError("At least one of include_internal or include_external must be True") @staticmethod - def from_dict(config_dict: Dict[str, Any]) -> "LinkExtractionConfig": - """Create LinkExtractionConfig from dictionary (for backward compatibility).""" + def from_dict(config_dict: Dict[str, Any]) -> "LinkPreviewConfig": + """Create LinkPreviewConfig from dictionary (for backward compatibility).""" if not config_dict: return None - return LinkExtractionConfig( + return LinkPreviewConfig( include_internal=config_dict.get("include_internal", True), include_external=config_dict.get("include_external", False), include_patterns=config_dict.get("include_patterns"), @@ -682,11 +682,11 @@ class LinkExtractionConfig: "verbose": self.verbose } - def clone(self, **kwargs) -> "LinkExtractionConfig": + def clone(self, **kwargs) -> "LinkPreviewConfig": """Create a copy with updated 
values.""" config_dict = self.to_dict() config_dict.update(kwargs) - return LinkExtractionConfig.from_dict(config_dict) + return LinkPreviewConfig.from_dict(config_dict) class HTTPCrawlerConfig: @@ -925,7 +925,7 @@ class CrawlerRunConfig(): exclude_internal_links (bool): If True, exclude internal links from the results. Default: False. score_links (bool): If True, calculate intrinsic quality scores for all links using URL structure, - text quality, and contextual relevance metrics. Separate from link_extraction_config. + text quality, and contextual relevance metrics. Separate from link_preview_config. Default: False. # Debugging and Logging Parameters @@ -1055,7 +1055,7 @@ class CrawlerRunConfig(): # Deep Crawl Parameters deep_crawl_strategy: Optional[DeepCrawlStrategy] = None, # Link Extraction Parameters - link_extraction_config: Union[LinkExtractionConfig, Dict[str, Any]] = None, + link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None, # Experimental Parameters experimental: Dict[str, Any] = None, ): @@ -1187,15 +1187,15 @@ class CrawlerRunConfig(): self.deep_crawl_strategy = deep_crawl_strategy # Link Extraction Parameters - if link_extraction_config is None: - self.link_extraction_config = None - elif isinstance(link_extraction_config, LinkExtractionConfig): - self.link_extraction_config = link_extraction_config - elif isinstance(link_extraction_config, dict): + if link_preview_config is None: + self.link_preview_config = None + elif isinstance(link_preview_config, LinkPreviewConfig): + self.link_preview_config = link_preview_config + elif isinstance(link_preview_config, dict): # Convert dict to config object for backward compatibility - self.link_extraction_config = LinkExtractionConfig.from_dict(link_extraction_config) + self.link_preview_config = LinkPreviewConfig.from_dict(link_preview_config) else: - raise ValueError("link_extraction_config must be LinkExtractionConfig object or dict") + raise ValueError("link_preview_config must be 
LinkPreviewConfig object or dict") # Experimental Parameters self.experimental = experimental or {} @@ -1371,7 +1371,7 @@ class CrawlerRunConfig(): # Deep Crawl Parameters deep_crawl_strategy=kwargs.get("deep_crawl_strategy"), # Link Extraction Parameters - link_extraction_config=kwargs.get("link_extraction_config"), + link_preview_config=kwargs.get("link_preview_config"), url=kwargs.get("url"), # Experimental Parameters experimental=kwargs.get("experimental"), @@ -1467,7 +1467,7 @@ class CrawlerRunConfig(): "user_agent_mode": self.user_agent_mode, "user_agent_generator_config": self.user_agent_generator_config, "deep_crawl_strategy": self.deep_crawl_strategy, - "link_extraction_config": self.link_extraction_config.to_dict() if self.link_extraction_config else None, + "link_preview_config": self.link_preview_config.to_dict() if self.link_preview_config else None, "url": self.url, "experimental": self.experimental, } diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 6e630d73..268b599a 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -948,14 +948,14 @@ class WebScrapingStrategy(ContentScrapingStrategy): links["external"] = list(external_links_dict.values()) # Extract head content for links if configured - link_extraction_config = kwargs.get("link_extraction_config") - if link_extraction_config is not None: + link_preview_config = kwargs.get("link_preview_config") + if link_preview_config is not None: try: import asyncio - from .link_extractor import LinkExtractor + from .link_preview import LinkPreview from .models import Links, Link - verbose = link_extraction_config.verbose + verbose = link_preview_config.verbose if verbose: self._log("info", "Starting link head extraction for {internal} internal and {external} external links", @@ -966,17 +966,17 @@ class WebScrapingStrategy(ContentScrapingStrategy): external_links = [Link(**link_data) for link_data in links["external"]] 
links_obj = Links(internal=internal_links, external=external_links) - # Create a config object for LinkExtractor + # Create a config object for LinkPreview class TempCrawlerRunConfig: def __init__(self, link_config, score_links): - self.link_extraction_config = link_config + self.link_preview_config = link_config self.score_links = score_links - config = TempCrawlerRunConfig(link_extraction_config, kwargs.get("score_links", False)) + config = TempCrawlerRunConfig(link_preview_config, kwargs.get("score_links", False)) # Extract head content (run async operation in sync context) async def extract_links(): - async with LinkExtractor(self.logger) as extractor: + async with LinkPreview(self.logger) as extractor: return await extractor.extract_link_heads(links_obj, config) # Run the async operation @@ -1740,21 +1740,21 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): with_tail=False, ).strip() - # Create links dictionary in the format expected by LinkExtractor + # Create links dictionary in the format expected by LinkPreview links = { "internal": list(internal_links_dict.values()), "external": list(external_links_dict.values()), } # Extract head content for links if configured - link_extraction_config = kwargs.get("link_extraction_config") - if link_extraction_config is not None: + link_preview_config = kwargs.get("link_preview_config") + if link_preview_config is not None: try: import asyncio - from .link_extractor import LinkExtractor + from .link_preview import LinkPreview from .models import Links, Link - verbose = link_extraction_config.verbose + verbose = link_preview_config.verbose if verbose: self._log("info", "Starting link head extraction for {internal} internal and {external} external links", @@ -1765,17 +1765,17 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): external_links = [Link(**link_data) for link_data in links["external"]] links_obj = Links(internal=internal_links, external=external_links) - # Create a config object for LinkExtractor + # 
Create a config object for LinkPreview class TempCrawlerRunConfig: def __init__(self, link_config, score_links): - self.link_extraction_config = link_config + self.link_preview_config = link_config self.score_links = score_links - config = TempCrawlerRunConfig(link_extraction_config, kwargs.get("score_links", False)) + config = TempCrawlerRunConfig(link_preview_config, kwargs.get("score_links", False)) # Extract head content (run async operation in sync context) async def extract_links(): - async with LinkExtractor(self.logger) as extractor: + async with LinkPreview(self.logger) as extractor: return await extractor.extract_link_heads(links_obj, config) # Run the async operation diff --git a/crawl4ai/link_extractor.py b/crawl4ai/link_preview.py similarity index 97% rename from crawl4ai/link_extractor.py rename to crawl4ai/link_preview.py index 63ebd5f9..13d32d58 100644 --- a/crawl4ai/link_extractor.py +++ b/crawl4ai/link_preview.py @@ -15,7 +15,7 @@ from .models import Links, Link from .utils import calculate_total_score -class LinkExtractor: +class LinkPreview: """ Extracts head content from links using URLSeeder's parallel processing infrastructure. @@ -29,7 +29,7 @@ class LinkExtractor: def __init__(self, logger: Optional[AsyncLogger] = None): """ - Initialize the LinkExtractor. + Initialize the LinkPreview. 
Args: logger: Optional logger instance for recording events @@ -78,12 +78,12 @@ class LinkExtractor: Args: links: Links object containing internal and external links - config: CrawlerRunConfig with link_extraction_config settings + config: CrawlerRunConfig with link_preview_config settings Returns: Links object with head_data attached to filtered Link objects """ - link_config = config.link_extraction_config + link_config = config.link_preview_config # Ensure seeder is initialized await self.start() @@ -331,7 +331,7 @@ class LinkExtractor: intrinsic_score=updated_link.intrinsic_score, contextual_score=updated_link.contextual_score, score_links_enabled=getattr(config, 'score_links', False), - query_provided=bool(config.link_extraction_config.query) + query_provided=bool(config.link_preview_config.query) ) updated_internal.append(updated_link) @@ -369,7 +369,7 @@ class LinkExtractor: intrinsic_score=updated_link.intrinsic_score, contextual_score=updated_link.contextual_score, score_links_enabled=getattr(config, 'score_links', False), - query_provided=bool(config.link_extraction_config.query) + query_provided=bool(config.link_preview_config.query) ) updated_external.append(updated_link) diff --git a/docs/examples/link_head_extraction_example.py b/docs/examples/link_head_extraction_example.py index 30e2b0ba..ef146d95 100644 --- a/docs/examples/link_head_extraction_example.py +++ b/docs/examples/link_head_extraction_example.py @@ -18,7 +18,7 @@ Usage: import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.async_configs import LinkExtractionConfig +from crawl4ai.async_configs import LinkPreviewConfig async def basic_link_head_extraction(): @@ -30,7 +30,7 @@ async def basic_link_head_extraction(): config = CrawlerRunConfig( # Enable link head extraction - link_extraction_config=LinkExtractionConfig( + link_preview_config=LinkPreviewConfig( include_internal=True, # Process internal links include_external=False, # Skip external links for this 
demo max_links=5, # Limit to 5 links @@ -94,7 +94,7 @@ async def research_assistant_example(): print("=" * 50) config = CrawlerRunConfig( - link_extraction_config=LinkExtractionConfig( + link_preview_config=LinkPreviewConfig( include_internal=True, include_external=True, include_patterns=["*/docs/*", "*/tutorial/*", "*/guide/*"], @@ -149,7 +149,7 @@ async def api_discovery_example(): print("=" * 50) config = CrawlerRunConfig( - link_extraction_config=LinkExtractionConfig( + link_preview_config=LinkPreviewConfig( include_internal=True, include_patterns=["*/api/*", "*/reference/*", "*/endpoint/*"], exclude_patterns=["*/deprecated/*", "*/v1/*"], # Skip old versions @@ -214,7 +214,7 @@ async def link_quality_analysis(): print("=" * 50) config = CrawlerRunConfig( - link_extraction_config=LinkExtractionConfig( + link_preview_config=LinkPreviewConfig( include_internal=True, max_links=30, # Analyze more links for better statistics concurrency=15, @@ -281,7 +281,7 @@ async def pattern_filtering_example(): filters = [ { "name": "Documentation Only", - "config": LinkExtractionConfig( + "config": LinkPreviewConfig( include_internal=True, max_links=10, concurrency=5, @@ -292,7 +292,7 @@ async def pattern_filtering_example(): }, { "name": "API References Only", - "config": LinkExtractionConfig( + "config": LinkPreviewConfig( include_internal=True, max_links=10, concurrency=5, @@ -303,7 +303,7 @@ async def pattern_filtering_example(): }, { "name": "Exclude Admin Areas", - "config": LinkExtractionConfig( + "config": LinkPreviewConfig( include_internal=True, max_links=10, concurrency=5, @@ -318,7 +318,7 @@ async def pattern_filtering_example(): print(f"\n🔍 Testing: {filter_example['name']}") config = CrawlerRunConfig( - link_extraction_config=filter_example['config'], + link_preview_config=filter_example['config'], score_links=True ) diff --git a/docs/md_v2/core/link-media.md b/docs/md_v2/core/link-media.md index a229f710..f6305ccc 100644 --- a/docs/md_v2/core/link-media.md +++ 
b/docs/md_v2/core/link-media.md @@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately: ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.async_configs import LinkExtractionConfig +from crawl4ai.async_configs import LinkPreviewConfig async def extract_link_heads_example(): """ @@ -136,7 +136,7 @@ async def extract_link_heads_example(): # Configure link head extraction config = CrawlerRunConfig( # Enable link head extraction with detailed configuration - link_extraction_config=LinkExtractionConfig( + link_preview_config=LinkPreviewConfig( include_internal=True, # Extract from internal links include_external=False, # Skip external links for this example max_links=10, # Limit to 10 links for demo @@ -234,12 +234,12 @@ if __name__ == "__main__": ### 2.3 Configuration Deep Dive -The `LinkExtractionConfig` class supports these options: +The `LinkPreviewConfig` class supports these options: ```python -from crawl4ai.async_configs import LinkExtractionConfig +from crawl4ai.async_configs import LinkPreviewConfig -link_extraction_config = LinkExtractionConfig( +link_preview_config = LinkPreviewConfig( # BASIC SETTINGS verbose=True, # Show detailed logs (recommended for learning) @@ -316,7 +316,7 @@ Find the most relevant documentation pages: ```python async def research_assistant(): config = CrawlerRunConfig( - link_extraction_config=LinkExtractionConfig( + link_preview_config=LinkPreviewConfig( include_internal=True, include_external=True, include_patterns=["*/docs/*", "*/tutorial/*", "*/guide/*"], @@ -348,7 +348,7 @@ Find all API endpoints and references: ```python async def api_discovery(): config = CrawlerRunConfig( - link_extraction_config=LinkExtractionConfig( + link_preview_config=LinkPreviewConfig( include_internal=True, include_patterns=["*/api/*", "*/reference/*"], exclude_patterns=["*/deprecated/*"], @@ -387,7 +387,7 @@ Analyze website structure and content quality: ```python async def 
quality_analysis(): config = CrawlerRunConfig( - link_extraction_config=LinkExtractionConfig( + link_preview_config=LinkPreviewConfig( include_internal=True, max_links=200, concurrency=20, @@ -434,7 +434,7 @@ async def quality_analysis(): ```python # Check your configuration: config = CrawlerRunConfig( - link_extraction_config=LinkExtractionConfig( + link_preview_config=LinkPreviewConfig( verbose=True # ← Enable to see what's happening ) ) @@ -445,7 +445,7 @@ config = CrawlerRunConfig( # Make sure scoring is enabled: config = CrawlerRunConfig( score_links=True, # ← Enable intrinsic scoring - link_extraction_config=LinkExtractionConfig( + link_preview_config=LinkPreviewConfig( query="your search terms" # ← For contextual scoring ) ) @@ -454,7 +454,7 @@ config = CrawlerRunConfig( **Process taking too long?** ```python # Optimize performance: -link_extraction_config = LinkExtractionConfig( +link_preview_config = LinkPreviewConfig( max_links=20, # ← Reduce number concurrency=10, # ← Increase parallelism timeout=3, # ← Shorter timeout diff --git a/tests/test_link_extractor.py b/tests/test_link_extractor.py index 1b5fa8a0..3f64d7a3 100644 --- a/tests/test_link_extractor.py +++ b/tests/test_link_extractor.py @@ -5,7 +5,7 @@ Test script for Link Extractor functionality from crawl4ai.models import Link from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.async_configs import LinkExtractionConfig +from crawl4ai.async_configs import LinkPreviewConfig import asyncio import sys import os @@ -22,7 +22,7 @@ async def test_link_extractor(): # Test configuration with link extraction AND scoring enabled config = CrawlerRunConfig( - link_extraction_config=LinkExtractionConfig( + link_preview_config=LinkPreviewConfig( include_internal=True, include_external=False, # Only internal links for this test # No include/exclude patterns for first test - let's see what we get @@ -53,7 +53,7 @@ async def test_link_extractor(): result = await crawler.arun(url, config=config) # 
Debug: Check if link extraction config is being passed - print(f"🔍 Debug - Link extraction config: {config.link_extraction_config.to_dict() if config.link_extraction_config else None}") + print(f"🔍 Debug - Link extraction config: {config.link_preview_config.to_dict() if config.link_preview_config else None}") print(f"🔍 Debug - Score links: {config.score_links}") if result.success: @@ -187,7 +187,7 @@ def test_config_examples(): examples = [ { "name": "BM25 Scored Documentation Links", - "config": LinkExtractionConfig( + "config": LinkPreviewConfig( include_internal=True, include_external=False, include_patterns=["*/docs/*", "*/api/*", "*/reference/*"], @@ -199,7 +199,7 @@ def test_config_examples(): }, { "name": "Internal Links Only", - "config": LinkExtractionConfig( + "config": LinkPreviewConfig( include_internal=True, include_external=False, max_links=50, @@ -208,7 +208,7 @@ def test_config_examples(): }, { "name": "External Links with Patterns", - "config": LinkExtractionConfig( + "config": LinkPreviewConfig( include_internal=False, include_external=True, include_patterns=["*github.com*", "*stackoverflow.com*"], @@ -218,7 +218,7 @@ def test_config_examples(): }, { "name": "High-Performance Mode", - "config": LinkExtractionConfig( + "config": LinkPreviewConfig( include_internal=True, include_external=False, concurrency=20, @@ -237,9 +237,9 @@ def test_config_examples(): print(f" {key}: {value}") print(" Usage:") - print(" from crawl4ai.async_configs import LinkExtractionConfig") + print(" from crawl4ai.async_configs import LinkPreviewConfig") print(" config = CrawlerRunConfig(") - print(" link_extraction_config=LinkExtractionConfig(") + print(" link_preview_config=LinkPreviewConfig(") for key, value in config_dict.items(): if isinstance(value, str): print(f" {key}='{value}',")