feat: Add advanced link head extraction with three-layer scoring system (#1)
Squashed commit from feature/link-extractor branch implementing comprehensive link analysis:
- Extract HTML head content from discovered links with parallel processing
- Three-layer scoring: Intrinsic (URL quality), Contextual (BM25), and Total scores
- New LinkExtractionConfig class for type-safe configuration
- Pattern-based filtering for internal/external links
- Comprehensive documentation and examples
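For orientation, a minimal usage sketch of the feature described above (an editor's illustration, not part of the diff). It assumes LinkExtractionConfig is imported from crawl4ai.async_configs as defined in this patch, that the crawler API (AsyncWebCrawler, arun) is used as in existing crawl4ai code, and that the link dicts on the result carry the new score fields; the URL is a placeholder.

    import asyncio
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
    from crawl4ai.async_configs import LinkExtractionConfig

    async def main():
        link_config = LinkExtractionConfig(
            include_internal=True,           # same-domain links only
            include_patterns=["*/docs/*"],   # glob-based filtering
            query="api reference",           # enables BM25 contextual scoring
            max_links=50,
        )
        run_config = CrawlerRunConfig(
            link_extraction_config=link_config,
            score_links=True,                # also compute intrinsic (URL quality) scores
        )
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://docs.example.com", config=run_config)
            for link in result.links["internal"][:5]:
                print(link.get("href"), link.get("total_score"))

    asyncio.run(main())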
@@ -37,6 +37,7 @@ from .content_filter_strategy import (
)
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
from .components.crawler_monitor import CrawlerMonitor
from .link_extractor import LinkExtractor
from .async_dispatcher import (
    MemoryAdaptiveDispatcher,
    SemaphoreDispatcher,
@@ -141,6 +142,7 @@ __all__ = [
    "SemaphoreDispatcher",
    "RateLimiter",
    "CrawlerMonitor",
    "LinkExtractor",
    "DisplayMode",
    "MarkdownGenerationResult",
    "Crawl4aiDockerClient",

@@ -17,7 +17,7 @@ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking

from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
from .deep_crawling import DeepCrawlStrategy

from .cache_context import CacheMode
@@ -594,6 +594,101 @@ class BrowserConfig:
            return config
        return BrowserConfig.from_kwargs(config)


class LinkExtractionConfig:
    """Configuration for link head extraction and scoring."""

    def __init__(
        self,
        include_internal: bool = True,
        include_external: bool = False,
        include_patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
        concurrency: int = 10,
        timeout: int = 5,
        max_links: int = 100,
        query: Optional[str] = None,
        score_threshold: Optional[float] = None,
        verbose: bool = False
    ):
        """
        Initialize link extraction configuration.

        Args:
            include_internal: Whether to include same-domain links
            include_external: Whether to include different-domain links
            include_patterns: List of glob patterns to include (e.g., ["*/docs/*", "*/api/*"])
            exclude_patterns: List of glob patterns to exclude (e.g., ["*/login*", "*/admin*"])
            concurrency: Number of links to process simultaneously
            timeout: Timeout in seconds for each link's head extraction
            max_links: Maximum number of links to process (prevents overload)
            query: Query string for BM25 contextual scoring (optional)
            score_threshold: Minimum relevance score to include links (0.0-1.0, optional)
            verbose: Show detailed progress during extraction
        """
        self.include_internal = include_internal
        self.include_external = include_external
        self.include_patterns = include_patterns
        self.exclude_patterns = exclude_patterns
        self.concurrency = concurrency
        self.timeout = timeout
        self.max_links = max_links
        self.query = query
        self.score_threshold = score_threshold
        self.verbose = verbose

        # Validation
        if concurrency <= 0:
            raise ValueError("concurrency must be positive")
        if timeout <= 0:
            raise ValueError("timeout must be positive")
        if max_links <= 0:
            raise ValueError("max_links must be positive")
        if score_threshold is not None and not (0.0 <= score_threshold <= 1.0):
            raise ValueError("score_threshold must be between 0.0 and 1.0")
        if not include_internal and not include_external:
            raise ValueError("At least one of include_internal or include_external must be True")

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "LinkExtractionConfig":
        """Create LinkExtractionConfig from dictionary (for backward compatibility)."""
        if not config_dict:
            return None

        return LinkExtractionConfig(
            include_internal=config_dict.get("include_internal", True),
            include_external=config_dict.get("include_external", False),
            include_patterns=config_dict.get("include_patterns"),
            exclude_patterns=config_dict.get("exclude_patterns"),
            concurrency=config_dict.get("concurrency", 10),
            timeout=config_dict.get("timeout", 5),
            max_links=config_dict.get("max_links", 100),
            query=config_dict.get("query"),
            score_threshold=config_dict.get("score_threshold"),
            verbose=config_dict.get("verbose", False)
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary format."""
        return {
            "include_internal": self.include_internal,
            "include_external": self.include_external,
            "include_patterns": self.include_patterns,
            "exclude_patterns": self.exclude_patterns,
            "concurrency": self.concurrency,
            "timeout": self.timeout,
            "max_links": self.max_links,
            "query": self.query,
            "score_threshold": self.score_threshold,
            "verbose": self.verbose
        }

    def clone(self, **kwargs) -> "LinkExtractionConfig":
        """Create a copy with updated values."""
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        return LinkExtractionConfig.from_dict(config_dict)
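As a quick illustration of the helpers above, a short editor's sketch (values are arbitrary, assuming LinkExtractionConfig is importable from crawl4ai.async_configs as in this patch):

    from crawl4ai.async_configs import LinkExtractionConfig

    base = LinkExtractionConfig(include_patterns=["*/docs/*"], query="installation")

    # Dict round-trip, as used for backward compatibility
    restored = LinkExtractionConfig.from_dict(base.to_dict())
    assert restored.include_patterns == ["*/docs/*"]

    # clone() copies the config and overrides selected fields
    stricter = base.clone(score_threshold=0.5, max_links=25)
    assert stricter.query == "installation" and stricter.max_links == 25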


class HTTPCrawlerConfig:
    """HTTP-specific crawler configuration"""

@@ -829,6 +924,9 @@ class CrawlerRunConfig():
            Default: [].
        exclude_internal_links (bool): If True, exclude internal links from the results.
            Default: False.
        score_links (bool): If True, calculate intrinsic quality scores for all links using URL structure,
            text quality, and contextual relevance metrics. Separate from link_extraction_config.
            Default: False.

        # Debugging and Logging Parameters
        verbose (bool): Enable verbose logging.
@@ -939,6 +1037,7 @@ class CrawlerRunConfig():
        exclude_social_media_links: bool = False,
        exclude_domains: list = None,
        exclude_internal_links: bool = False,
        score_links: bool = False,
        # Debugging and Logging Parameters
        verbose: bool = True,
        log_console: bool = False,
@@ -955,6 +1054,8 @@ class CrawlerRunConfig():
        user_agent_generator_config: dict = {},
        # Deep Crawl Parameters
        deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
        # Link Extraction Parameters
        link_extraction_config: Union[LinkExtractionConfig, Dict[str, Any]] = None,
        # Experimental Parameters
        experimental: Dict[str, Any] = None,
    ):
@@ -976,7 +1077,7 @@ class CrawlerRunConfig():
        self.remove_forms = remove_forms
        self.prettiify = prettiify
        self.parser_type = parser_type
        self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
        self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy()
        self.proxy_config = proxy_config
        self.proxy_rotation_strategy = proxy_rotation_strategy

@@ -1042,6 +1143,7 @@ class CrawlerRunConfig():
        self.exclude_social_media_links = exclude_social_media_links
        self.exclude_domains = exclude_domains or []
        self.exclude_internal_links = exclude_internal_links
        self.score_links = score_links

        # Debugging and Logging Parameters
        self.verbose = verbose
@@ -1084,6 +1186,17 @@ class CrawlerRunConfig():
        # Deep Crawl Parameters
        self.deep_crawl_strategy = deep_crawl_strategy

        # Link Extraction Parameters
        if link_extraction_config is None:
            self.link_extraction_config = None
        elif isinstance(link_extraction_config, LinkExtractionConfig):
            self.link_extraction_config = link_extraction_config
        elif isinstance(link_extraction_config, dict):
            # Convert dict to config object for backward compatibility
            self.link_extraction_config = LinkExtractionConfig.from_dict(link_extraction_config)
        else:
            raise ValueError("link_extraction_config must be LinkExtractionConfig object or dict")

        # Experimental Parameters
        self.experimental = experimental or {}

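Per the normalization above, both forms below should end up as a LinkExtractionConfig on the run config; this is an illustrative sketch, not part of the patch.

    from crawl4ai import CrawlerRunConfig
    from crawl4ai.async_configs import LinkExtractionConfig

    cfg_obj = CrawlerRunConfig(link_extraction_config=LinkExtractionConfig(concurrency=20))
    cfg_dict = CrawlerRunConfig(link_extraction_config={"concurrency": 20, "include_external": True})
    assert isinstance(cfg_dict.link_extraction_config, LinkExtractionConfig)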
@@ -1241,6 +1354,7 @@ class CrawlerRunConfig():
            exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
            exclude_domains=kwargs.get("exclude_domains", []),
            exclude_internal_links=kwargs.get("exclude_internal_links", False),
            score_links=kwargs.get("score_links", False),
            # Debugging and Logging Parameters
            verbose=kwargs.get("verbose", True),
            log_console=kwargs.get("log_console", False),
@@ -1256,6 +1370,8 @@ class CrawlerRunConfig():
            user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
            # Deep Crawl Parameters
            deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
            # Link Extraction Parameters
            link_extraction_config=kwargs.get("link_extraction_config"),
            url=kwargs.get("url"),
            # Experimental Parameters
            experimental=kwargs.get("experimental"),
@@ -1339,6 +1455,7 @@ class CrawlerRunConfig():
            "exclude_social_media_links": self.exclude_social_media_links,
            "exclude_domains": self.exclude_domains,
            "exclude_internal_links": self.exclude_internal_links,
            "score_links": self.score_links,
            "verbose": self.verbose,
            "log_console": self.log_console,
            "capture_network_requests": self.capture_network_requests,
@@ -1350,6 +1467,7 @@ class CrawlerRunConfig():
            "user_agent_mode": self.user_agent_mode,
            "user_agent_generator_config": self.user_agent_generator_config,
            "deep_crawl_strategy": self.deep_crawl_strategy,
            "link_extraction_config": self.link_extraction_config.to_dict() if self.link_extraction_config else None,
            "url": self.url,
            "experimental": self.experimental,
        }

@@ -109,12 +109,16 @@ def _parse_head(src: str) -> Dict[str, Any]:
        elif "charset" in el.attrib:
            info["charset"] = el.attrib["charset"].lower()
    for el in doc.xpath(".//link"):
        rel = " ".join(el.attrib.get("rel", [])).lower()
        if not rel:
        rel_attr = el.attrib.get("rel", "")
        if not rel_attr:
            continue
        # Handle multiple space-separated rel values
        rel_values = rel_attr.lower().split()
        entry = {a: el.attrib[a] for a in (
            "href", "as", "type", "hreflang") if a in el.attrib}
        info["link"].setdefault(rel, []).append(entry)
        # Add entry for each rel value
        for rel in rel_values:
            info["link"].setdefault(rel, []).append(entry)
    # Extract JSON-LD structured data
    for script in doc.xpath('.//script[@type="application/ld+json"]'):
        if script.text:
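To show what the multi-rel handling above changes, a standalone sketch (an editor's illustration, not the module's code) of how a space-separated rel attribute is now indexed once per value:

    info = {"link": {}}
    attrs = {"rel": "preload prefetch", "href": "/app.js", "as": "script"}

    # Same entry dict, registered under each rel value
    entry = {a: attrs[a] for a in ("href", "as", "type", "hreflang") if a in attrs}
    for rel in attrs["rel"].lower().split():
        info["link"].setdefault(rel, []).append(entry)

    # info["link"] == {"preload": [entry], "prefetch": [entry]}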
@@ -467,6 +471,200 @@ class AsyncUrlSeeder:
|
||||
"info", "Finished URL seeding for multiple domains.", tag="URL_SEED")
|
||||
return final_results
|
||||
|
||||
async def extract_head_for_urls(
|
||||
self,
|
||||
urls: List[str],
|
||||
config: Optional["SeedingConfig"] = None,
|
||||
concurrency: int = 10,
|
||||
timeout: int = 5
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract head content for a custom list of URLs using URLSeeder's parallel processing.
|
||||
|
||||
This method reuses URLSeeder's efficient parallel processing, caching, and head extraction
|
||||
logic to process a custom list of URLs rather than discovering URLs from sources.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
urls : List[str]
|
||||
List of URLs to extract head content from
|
||||
config : SeedingConfig, optional
|
||||
Configuration object. If None, uses default settings for head extraction
|
||||
concurrency : int, default=10
|
||||
Number of concurrent requests
|
||||
timeout : int, default=5
|
||||
Timeout for each request in seconds
|
||||
|
||||
Returns
|
||||
-------
|
||||
List[Dict[str, Any]]
|
||||
List of dictionaries containing url, status, head_data, and optional relevance_score
|
||||
"""
|
||||
# Create default config if none provided
|
||||
if config is None:
|
||||
# Import here to avoid circular imports
|
||||
from .async_configs import SeedingConfig
|
||||
config = SeedingConfig(
|
||||
extract_head=True,
|
||||
concurrency=concurrency,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
# Override concurrency and ensure head extraction is enabled
|
||||
config.concurrency = concurrency
|
||||
config.extract_head = True
|
||||
|
||||
self._log("info", "Starting head extraction for {count} custom URLs",
|
||||
params={"count": len(urls)}, tag="URL_SEED")
|
||||
|
||||
# Setup rate limiting if specified in config
|
||||
if config.hits_per_sec:
|
||||
if config.hits_per_sec <= 0:
|
||||
self._log("warning", "hits_per_sec must be positive. Disabling rate limiting.", tag="URL_SEED")
|
||||
self._rate_sem = None
|
||||
else:
|
||||
self._rate_sem = asyncio.Semaphore(config.hits_per_sec)
|
||||
else:
|
||||
self._rate_sem = None
|
||||
|
||||
# Use bounded queue to prevent memory issues with large URL lists
|
||||
queue_size = min(10000, max(1000, concurrency * 100))
|
||||
queue = asyncio.Queue(maxsize=queue_size)
|
||||
producer_done = asyncio.Event()
|
||||
stop_event = asyncio.Event()
|
||||
seen: set[str] = set()
|
||||
|
||||
# Results collection
|
||||
results: List[Dict[str, Any]] = []
|
||||
|
||||
async def producer():
|
||||
"""Producer to feed URLs into the queue."""
|
||||
try:
|
||||
for url in urls:
|
||||
if url in seen:
|
||||
self._log("debug", "Skipping duplicate URL: {url}",
|
||||
params={"url": url}, tag="URL_SEED")
|
||||
continue
|
||||
if stop_event.is_set():
|
||||
break
|
||||
seen.add(url)
|
||||
await queue.put(url)
|
||||
finally:
|
||||
producer_done.set()
|
||||
|
||||
async def worker(res_list: List[Dict[str, Any]]):
|
||||
"""Worker to process URLs from the queue."""
|
||||
while True:
|
||||
try:
|
||||
# Wait for URL or producer completion
|
||||
url = await asyncio.wait_for(queue.get(), timeout=1.0)
|
||||
except asyncio.TimeoutError:
|
||||
if producer_done.is_set() and queue.empty():
|
||||
break
|
||||
continue
|
||||
|
||||
try:
|
||||
# Use existing _validate method which handles head extraction, caching, etc.
|
||||
await self._validate(
|
||||
url, res_list,
|
||||
live=False, # We're not doing live checks, just head extraction
|
||||
extract=True, # Always extract head content
|
||||
timeout=timeout,
|
||||
verbose=config.verbose or False,
|
||||
query=config.query,
|
||||
score_threshold=config.score_threshold,
|
||||
scoring_method=config.scoring_method or "bm25",
|
||||
filter_nonsense=config.filter_nonsense_urls
|
||||
)
|
||||
except Exception as e:
|
||||
self._log("error", "Failed to process URL {url}: {error}",
|
||||
params={"url": url, "error": str(e)}, tag="URL_SEED")
|
||||
# Add failed entry to results
|
||||
res_list.append({
|
||||
"url": url,
|
||||
"status": "failed",
|
||||
"head_data": {},
|
||||
"error": str(e)
|
||||
})
|
||||
finally:
|
||||
queue.task_done()
|
||||
|
||||
# Start producer
|
||||
producer_task = asyncio.create_task(producer())
|
||||
|
||||
# Start workers
|
||||
worker_tasks = []
|
||||
for _ in range(concurrency):
|
||||
worker_task = asyncio.create_task(worker(results))
|
||||
worker_tasks.append(worker_task)
|
||||
|
||||
# Wait for producer to finish
|
||||
await producer_task
|
||||
|
||||
# Wait for all items to be processed
|
||||
await queue.join()
|
||||
|
||||
# Cancel workers
|
||||
for task in worker_tasks:
|
||||
task.cancel()
|
||||
|
||||
# Wait for workers to finish canceling
|
||||
await asyncio.gather(*worker_tasks, return_exceptions=True)
|
||||
|
||||
# Apply BM25 scoring if query is provided
|
||||
if config.query and config.scoring_method == "bm25":
|
||||
results = await self._apply_bm25_scoring(results, config)
|
||||
|
||||
# Apply score threshold filtering
|
||||
if config.score_threshold is not None:
|
||||
results = [r for r in results if r.get("relevance_score", 0) >= config.score_threshold]
|
||||
|
||||
# Sort by relevance score if available
|
||||
if any("relevance_score" in r for r in results):
|
||||
results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
|
||||
|
||||
self._log("info", "Completed head extraction for {count} URLs, {success} successful",
|
||||
params={
|
||||
"count": len(urls),
|
||||
"success": len([r for r in results if r.get("status") == "valid"])
|
||||
}, tag="URL_SEED")
|
||||
|
||||
return results
|
||||
|
||||
async def _apply_bm25_scoring(self, results: List[Dict[str, Any]], config: "SeedingConfig") -> List[Dict[str, Any]]:
|
||||
"""Apply BM25 scoring to results that have head_data."""
|
||||
if not HAS_BM25:
|
||||
self._log("warning", "BM25 scoring requested but rank_bm25 not available", tag="URL_SEED")
|
||||
return results
|
||||
|
||||
# Extract text contexts from head data
|
||||
text_contexts = []
|
||||
valid_results = []
|
||||
|
||||
for result in results:
|
||||
if result.get("status") == "valid" and result.get("head_data"):
|
||||
text_context = self._extract_text_context(result["head_data"])
|
||||
if text_context:
|
||||
text_contexts.append(text_context)
|
||||
valid_results.append(result)
|
||||
else:
|
||||
# Use URL-based scoring as fallback
|
||||
score = self._calculate_url_relevance_score(config.query, result["url"])
|
||||
result["relevance_score"] = float(score)
|
||||
elif result.get("status") == "valid":
|
||||
# No head data but valid URL - use URL-based scoring
|
||||
score = self._calculate_url_relevance_score(config.query, result["url"])
|
||||
result["relevance_score"] = float(score)
|
||||
|
||||
# Calculate BM25 scores for results with text context
|
||||
if text_contexts and valid_results:
|
||||
scores = await asyncio.to_thread(self._calculate_bm25_score, config.query, text_contexts)
|
||||
for i, result in enumerate(valid_results):
|
||||
if i < len(scores):
|
||||
result["relevance_score"] = float(scores[i])
|
||||
|
||||
return results
|
||||
|
||||
async def _resolve_head(self, url: str) -> Optional[str]:
|
||||
"""
|
||||
HEAD-probe a URL.
|
||||
|
||||
@@ -23,6 +23,8 @@ from .utils import (
    is_external_url,
    get_base_domain,
    extract_metadata_using_lxml,
    extract_page_context,
    calculate_link_intrinsic_score,
)
from lxml import etree
from lxml import html as lhtml
@@ -944,6 +946,72 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
# Update the links dictionary with unique links
|
||||
links["internal"] = list(internal_links_dict.values())
|
||||
links["external"] = list(external_links_dict.values())
|
||||
|
||||
# Extract head content for links if configured
|
||||
link_extraction_config = kwargs.get("link_extraction_config")
|
||||
if link_extraction_config is not None:
|
||||
try:
|
||||
import asyncio
|
||||
from .link_extractor import LinkExtractor
|
||||
from .models import Links, Link
|
||||
|
||||
verbose = link_extraction_config.verbose
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
||||
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
|
||||
|
||||
# Convert dict links to Link objects
|
||||
internal_links = [Link(**link_data) for link_data in links["internal"]]
|
||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
||||
links_obj = Links(internal=internal_links, external=external_links)
|
||||
|
||||
# Create a config object for LinkExtractor
|
||||
class TempCrawlerRunConfig:
|
||||
def __init__(self, link_config, score_links):
|
||||
self.link_extraction_config = link_config
|
||||
self.score_links = score_links
|
||||
|
||||
config = TempCrawlerRunConfig(link_extraction_config, kwargs.get("score_links", False))
|
||||
|
||||
# Extract head content (run async operation in sync context)
|
||||
async def extract_links():
|
||||
async with LinkExtractor(self.logger) as extractor:
|
||||
return await extractor.extract_link_heads(links_obj, config)
|
||||
|
||||
# Run the async operation
|
||||
try:
|
||||
# Check if we're already in an async context
|
||||
loop = asyncio.get_running_loop()
|
||||
# If we're in an async context, we need to run in a thread
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future = executor.submit(asyncio.run, extract_links())
|
||||
updated_links = future.result()
|
||||
except RuntimeError:
|
||||
# No running loop, we can use asyncio.run directly
|
||||
updated_links = asyncio.run(extract_links())
|
||||
|
||||
# Convert back to dict format
|
||||
links["internal"] = [link.dict() for link in updated_links.internal]
|
||||
links["external"] = [link.dict() for link in updated_links.external]
|
||||
|
||||
if verbose:
|
||||
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
|
||||
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
|
||||
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
|
||||
params={
|
||||
"internal_success": successful_internal,
|
||||
"internal_total": len(updated_links.internal),
|
||||
"external_success": successful_external,
|
||||
"external_total": len(updated_links.external)
|
||||
}, tag="LINK_EXTRACT")
|
||||
else:
|
||||
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
|
||||
|
||||
except Exception as e:
|
||||
self._log("error", f"Link head extraction failed: {str(e)}", tag="LINK_EXTRACT")
|
||||
# Continue with original links if extraction fails
|
||||
|
||||
# # Process images using ThreadPoolExecutor
|
||||
imgs = body.find_all("img")
|
||||
@@ -1037,6 +1105,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
        media: Dict[str, List],
        internal_links_dict: Dict[str, Any],
        external_links_dict: Dict[str, Any],
        page_context: dict = None,
        **kwargs,
    ) -> bool:
        base_domain = kwargs.get("base_domain", get_base_domain(url))
@@ -1056,6 +1125,25 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                "title": link.get("title", "").strip(),
                "base_domain": base_domain,
            }

            # Add intrinsic scoring if enabled
            if kwargs.get("score_links", False) and page_context is not None:
                try:
                    intrinsic_score = calculate_link_intrinsic_score(
                        link_text=link_data["text"],
                        url=normalized_href,
                        title_attr=link_data["title"],
                        class_attr=link.get("class", ""),
                        rel_attr=link.get("rel", ""),
                        page_context=page_context
                    )
                    link_data["intrinsic_score"] = intrinsic_score
                except Exception:
                    # Fail gracefully - assign default score
                    link_data["intrinsic_score"] = float('inf')
            else:
                # No scoring enabled - assign infinity (all links equal priority)
                link_data["intrinsic_score"] = float('inf')

            is_external = is_external_url(normalized_href, base_domain)
            if is_external:
@@ -1491,6 +1579,33 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):

        base_domain = get_base_domain(url)

        # Extract page context for link scoring (if enabled) - do this BEFORE any removals
        page_context = None
        if kwargs.get("score_links", False):
            try:
                # Extract title
                title_elements = doc.xpath('//title')
                page_title = title_elements[0].text_content() if title_elements else ""

                # Extract headlines
                headlines = []
                for tag in ['h1', 'h2', 'h3']:
                    elements = doc.xpath(f'//{tag}')
                    for el in elements:
                        text = el.text_content().strip()
                        if text:
                            headlines.append(text)
                headlines_text = ' '.join(headlines)

                # Extract meta description
                meta_desc_elements = doc.xpath('//meta[@name="description"]/@content')
                meta_description = meta_desc_elements[0] if meta_desc_elements else ""

                # Create page context
                page_context = extract_page_context(page_title, headlines_text, meta_description, url)
            except Exception:
                page_context = {}  # Fail gracefully

        # Early removal of all images if exclude_all_images is set
        # This is more efficient in lxml as we remove elements before any processing
        if kwargs.get("exclude_all_images", False):
@@ -1579,6 +1694,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            media,
            internal_links_dict,
            external_links_dict,
            page_context=page_context,
            base_domain=base_domain,
            **kwargs,
        )
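For reference, a hedged sketch of what the page-context step above feeds into extract_page_context (the helper added to utils.py later in this diff); the page data here is made up:

    from crawl4ai.utils import extract_page_context

    context = extract_page_context(
        page_title="Crawl4AI Documentation",
        headlines_text="Getting Started Installation Advanced Crawling",
        meta_description="Open-source web crawler for LLM-ready data.",
        base_url="https://docs.crawl4ai.com/core/installation/",
    )
    print(context["domain"])        # "docs.crawl4ai.com"
    print(context["is_docs_site"])  # True, because the domain contains "docs."
    print(sorted(context["terms"])[:5])  # lower-cased terms longer than 2 characters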
@@ -1623,14 +1739,84 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
method="html",
|
||||
with_tail=False,
|
||||
).strip()
|
||||
|
||||
# Create links dictionary in the format expected by LinkExtractor
|
||||
links = {
|
||||
"internal": list(internal_links_dict.values()),
|
||||
"external": list(external_links_dict.values()),
|
||||
}
|
||||
|
||||
# Extract head content for links if configured
|
||||
link_extraction_config = kwargs.get("link_extraction_config")
|
||||
if link_extraction_config is not None:
|
||||
try:
|
||||
import asyncio
|
||||
from .link_extractor import LinkExtractor
|
||||
from .models import Links, Link
|
||||
|
||||
verbose = link_extraction_config.verbose
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
||||
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
|
||||
|
||||
# Convert dict links to Link objects
|
||||
internal_links = [Link(**link_data) for link_data in links["internal"]]
|
||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
||||
links_obj = Links(internal=internal_links, external=external_links)
|
||||
|
||||
# Create a config object for LinkExtractor
|
||||
class TempCrawlerRunConfig:
|
||||
def __init__(self, link_config, score_links):
|
||||
self.link_extraction_config = link_config
|
||||
self.score_links = score_links
|
||||
|
||||
config = TempCrawlerRunConfig(link_extraction_config, kwargs.get("score_links", False))
|
||||
|
||||
# Extract head content (run async operation in sync context)
|
||||
async def extract_links():
|
||||
async with LinkExtractor(self.logger) as extractor:
|
||||
return await extractor.extract_link_heads(links_obj, config)
|
||||
|
||||
# Run the async operation
|
||||
try:
|
||||
# Check if we're already in an async context
|
||||
loop = asyncio.get_running_loop()
|
||||
# If we're in an async context, we need to run in a thread
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future = executor.submit(asyncio.run, extract_links())
|
||||
updated_links = future.result()
|
||||
except RuntimeError:
|
||||
# No running loop, we can use asyncio.run directly
|
||||
updated_links = asyncio.run(extract_links())
|
||||
|
||||
# Convert back to dict format
|
||||
links["internal"] = [link.dict() for link in updated_links.internal]
|
||||
links["external"] = [link.dict() for link in updated_links.external]
|
||||
|
||||
if verbose:
|
||||
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
|
||||
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
|
||||
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
|
||||
params={
|
||||
"internal_success": successful_internal,
|
||||
"internal_total": len(updated_links.internal),
|
||||
"external_success": successful_external,
|
||||
"external_total": len(updated_links.external)
|
||||
}, tag="LINK_EXTRACT")
|
||||
else:
|
||||
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
|
||||
|
||||
except Exception as e:
|
||||
self._log("error", f"Error during link head extraction: {str(e)}", tag="LINK_EXTRACT")
|
||||
# Continue with original links if head extraction fails
|
||||
|
||||
return {
|
||||
"cleaned_html": cleaned_html,
|
||||
"success": success,
|
||||
"media": media,
|
||||
"links": {
|
||||
"internal": list(internal_links_dict.values()),
|
||||
"external": list(external_links_dict.values()),
|
||||
},
|
||||
"links": links,
|
||||
"metadata": meta,
|
||||
}
|
||||
|
||||
|
||||
crawl4ai/link_extractor.py (new file, 395 lines)
@@ -0,0 +1,395 @@
|
||||
"""
|
||||
Link Extractor for Crawl4AI
|
||||
|
||||
Extracts head content from links discovered during crawling using URLSeeder's
|
||||
efficient parallel processing and caching infrastructure.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import fnmatch
|
||||
from typing import Dict, List, Optional, Any
|
||||
from .async_logger import AsyncLogger
|
||||
from .async_url_seeder import AsyncUrlSeeder
|
||||
from .async_configs import SeedingConfig, CrawlerRunConfig
|
||||
from .models import Links, Link
|
||||
from .utils import calculate_total_score
|
||||
|
||||
|
||||
class LinkExtractor:
|
||||
"""
|
||||
Extracts head content from links using URLSeeder's parallel processing infrastructure.
|
||||
|
||||
This class provides intelligent link filtering and head content extraction with:
|
||||
- Pattern-based inclusion/exclusion filtering
|
||||
- Parallel processing with configurable concurrency
|
||||
- Caching for performance
|
||||
- BM25 relevance scoring
|
||||
- Memory-safe processing for large link sets
|
||||
"""
|
||||
|
||||
def __init__(self, logger: Optional[AsyncLogger] = None):
|
||||
"""
|
||||
Initialize the LinkExtractor.
|
||||
|
||||
Args:
|
||||
logger: Optional logger instance for recording events
|
||||
"""
|
||||
self.logger = logger
|
||||
self.seeder: Optional[AsyncUrlSeeder] = None
|
||||
self._owns_seeder = False
|
||||
|
||||
async def __aenter__(self):
|
||||
"""Async context manager entry."""
|
||||
await self.start()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Async context manager exit."""
|
||||
await self.close()
|
||||
|
||||
async def start(self):
|
||||
"""Initialize the URLSeeder instance."""
|
||||
if not self.seeder:
|
||||
self.seeder = AsyncUrlSeeder(logger=self.logger)
|
||||
await self.seeder.__aenter__()
|
||||
self._owns_seeder = True
|
||||
|
||||
async def close(self):
|
||||
"""Clean up resources."""
|
||||
if self.seeder and self._owns_seeder:
|
||||
await self.seeder.__aexit__(None, None, None)
|
||||
self.seeder = None
|
||||
self._owns_seeder = False
|
||||
|
||||
def _log(self, level: str, message: str, tag: str = "LINK_EXTRACT", **kwargs):
|
||||
"""Helper method to safely log messages."""
|
||||
if self.logger:
|
||||
log_method = getattr(self.logger, level, None)
|
||||
if log_method:
|
||||
log_method(message=message, tag=tag, params=kwargs.get('params', {}))
|
||||
|
||||
async def extract_link_heads(
|
||||
self,
|
||||
links: Links,
|
||||
config: CrawlerRunConfig
|
||||
) -> Links:
|
||||
"""
|
||||
Extract head content for filtered links and attach to Link objects.
|
||||
|
||||
Args:
|
||||
links: Links object containing internal and external links
|
||||
config: CrawlerRunConfig with link_extraction_config settings
|
||||
|
||||
Returns:
|
||||
Links object with head_data attached to filtered Link objects
|
||||
"""
|
||||
link_config = config.link_extraction_config
|
||||
|
||||
# Ensure seeder is initialized
|
||||
await self.start()
|
||||
|
||||
# Filter links based on configuration
|
||||
filtered_urls = self._filter_links(links, link_config)
|
||||
|
||||
if not filtered_urls:
|
||||
self._log("info", "No links matched filtering criteria")
|
||||
return links
|
||||
|
||||
self._log("info", "Extracting head content for {count} filtered links",
|
||||
params={"count": len(filtered_urls)})
|
||||
|
||||
# Extract head content using URLSeeder
|
||||
head_results = await self._extract_heads_parallel(filtered_urls, link_config)
|
||||
|
||||
# Merge results back into Link objects
|
||||
updated_links = self._merge_head_data(links, head_results, config)
|
||||
|
||||
self._log("info", "Completed head extraction for links, {success} successful",
|
||||
params={"success": len([r for r in head_results if r.get("status") == "valid"])})
|
||||
|
||||
return updated_links
|
||||
|
||||
def _filter_links(self, links: Links, link_config: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Filter links based on configuration parameters.
|
||||
|
||||
Args:
|
||||
links: Links object containing internal and external links
|
||||
link_config: Configuration dictionary for link extraction
|
||||
|
||||
Returns:
|
||||
List of filtered URL strings
|
||||
"""
|
||||
filtered_urls = []
|
||||
|
||||
# Include internal links if configured
|
||||
if link_config.include_internal:
|
||||
filtered_urls.extend([link.href for link in links.internal if link.href])
|
||||
self._log("debug", "Added {count} internal links",
|
||||
params={"count": len(links.internal)})
|
||||
|
||||
# Include external links if configured
|
||||
if link_config.include_external:
|
||||
filtered_urls.extend([link.href for link in links.external if link.href])
|
||||
self._log("debug", "Added {count} external links",
|
||||
params={"count": len(links.external)})
|
||||
|
||||
# Apply include patterns
|
||||
include_patterns = link_config.include_patterns
|
||||
if include_patterns:
|
||||
filtered_urls = [
|
||||
url for url in filtered_urls
|
||||
if any(fnmatch.fnmatch(url, pattern) for pattern in include_patterns)
|
||||
]
|
||||
self._log("debug", "After include patterns: {count} links remain",
|
||||
params={"count": len(filtered_urls)})
|
||||
|
||||
# Apply exclude patterns
|
||||
exclude_patterns = link_config.exclude_patterns
|
||||
if exclude_patterns:
|
||||
filtered_urls = [
|
||||
url for url in filtered_urls
|
||||
if not any(fnmatch.fnmatch(url, pattern) for pattern in exclude_patterns)
|
||||
]
|
||||
self._log("debug", "After exclude patterns: {count} links remain",
|
||||
params={"count": len(filtered_urls)})
|
||||
|
||||
# Limit number of links
|
||||
max_links = link_config.max_links
|
||||
if max_links > 0 and len(filtered_urls) > max_links:
|
||||
filtered_urls = filtered_urls[:max_links]
|
||||
self._log("debug", "Limited to {max_links} links",
|
||||
params={"max_links": max_links})
|
||||
|
||||
# Remove duplicates while preserving order
|
||||
seen = set()
|
||||
unique_urls = []
|
||||
for url in filtered_urls:
|
||||
if url not in seen:
|
||||
seen.add(url)
|
||||
unique_urls.append(url)
|
||||
|
||||
self._log("debug", "Final filtered URLs: {count} unique links",
|
||||
params={"count": len(unique_urls)})
|
||||
|
||||
return unique_urls
|
||||
|
||||
async def _extract_heads_parallel(
|
||||
self,
|
||||
urls: List[str],
|
||||
link_config: Dict[str, Any]
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract head content for URLs using URLSeeder's parallel processing.
|
||||
|
||||
Args:
|
||||
urls: List of URLs to process
|
||||
link_config: Configuration dictionary for link extraction
|
||||
|
||||
Returns:
|
||||
List of dictionaries with url, status, head_data, and optional relevance_score
|
||||
"""
|
||||
verbose = link_config.verbose
|
||||
concurrency = link_config.concurrency
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting batch processing: {total} links with {concurrency} concurrent workers",
|
||||
params={"total": len(urls), "concurrency": concurrency})
|
||||
|
||||
# Create SeedingConfig for URLSeeder
|
||||
seeding_config = SeedingConfig(
|
||||
extract_head=True,
|
||||
concurrency=concurrency,
|
||||
hits_per_sec=getattr(link_config, 'hits_per_sec', None),
|
||||
query=link_config.query,
|
||||
score_threshold=link_config.score_threshold,
|
||||
scoring_method="bm25" if link_config.query else None,
|
||||
verbose=verbose
|
||||
)
|
||||
|
||||
# Use URLSeeder's extract_head_for_urls method with progress tracking
|
||||
if verbose:
|
||||
# Create a wrapper to track progress
|
||||
results = await self._extract_with_progress(urls, seeding_config, link_config)
|
||||
else:
|
||||
results = await self.seeder.extract_head_for_urls(
|
||||
urls=urls,
|
||||
config=seeding_config,
|
||||
concurrency=concurrency,
|
||||
timeout=link_config.timeout
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
async def _extract_with_progress(
|
||||
self,
|
||||
urls: List[str],
|
||||
seeding_config: SeedingConfig,
|
||||
link_config: Dict[str, Any]
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Extract head content with progress reporting."""
|
||||
|
||||
total_urls = len(urls)
|
||||
concurrency = link_config.concurrency
|
||||
batch_size = max(1, total_urls // 10) # Report progress every 10%
|
||||
|
||||
# Process URLs and track progress
|
||||
completed = 0
|
||||
successful = 0
|
||||
failed = 0
|
||||
|
||||
# Create a custom progress tracking version
|
||||
# We'll modify URLSeeder's method to include progress callbacks
|
||||
|
||||
# For now, let's use the existing method and report at the end
|
||||
# In a production version, we would modify URLSeeder to accept progress callbacks
|
||||
|
||||
self._log("info", "Processing links in batches...")
|
||||
|
||||
# Use existing method
|
||||
results = await self.seeder.extract_head_for_urls(
|
||||
urls=urls,
|
||||
config=seeding_config,
|
||||
concurrency=concurrency,
|
||||
timeout=link_config.timeout
|
||||
)
|
||||
|
||||
# Count results
|
||||
for result in results:
|
||||
completed += 1
|
||||
if result.get("status") == "valid":
|
||||
successful += 1
|
||||
else:
|
||||
failed += 1
|
||||
|
||||
# Final progress report
|
||||
self._log("info", "Batch processing completed: {completed}/{total} processed, {successful} successful, {failed} failed",
|
||||
params={
|
||||
"completed": completed,
|
||||
"total": total_urls,
|
||||
"successful": successful,
|
||||
"failed": failed
|
||||
})
|
||||
|
||||
return results
|
||||
|
||||
def _merge_head_data(
|
||||
self,
|
||||
original_links: Links,
|
||||
head_results: List[Dict[str, Any]],
|
||||
config: CrawlerRunConfig
|
||||
) -> Links:
|
||||
"""
|
||||
Merge head extraction results back into Link objects.
|
||||
|
||||
Args:
|
||||
original_links: Original Links object
|
||||
head_results: Results from head extraction
|
||||
|
||||
Returns:
|
||||
Links object with head_data attached to matching links
|
||||
"""
|
||||
# Create URL to head_data mapping
|
||||
url_to_head_data = {}
|
||||
for result in head_results:
|
||||
url = result.get("url")
|
||||
if url:
|
||||
url_to_head_data[url] = {
|
||||
"head_data": result.get("head_data", {}),
|
||||
"status": result.get("status", "unknown"),
|
||||
"error": result.get("error"),
|
||||
"relevance_score": result.get("relevance_score")
|
||||
}
|
||||
|
||||
# Update internal links
|
||||
updated_internal = []
|
||||
for link in original_links.internal:
|
||||
if link.href in url_to_head_data:
|
||||
head_info = url_to_head_data[link.href]
|
||||
# Create new Link object with head data and scoring
|
||||
contextual_score = head_info.get("relevance_score")
|
||||
|
||||
updated_link = Link(
|
||||
href=link.href,
|
||||
text=link.text,
|
||||
title=link.title,
|
||||
base_domain=link.base_domain,
|
||||
head_data=head_info["head_data"],
|
||||
head_extraction_status=head_info["status"],
|
||||
head_extraction_error=head_info.get("error"),
|
||||
intrinsic_score=getattr(link, 'intrinsic_score', None),
|
||||
contextual_score=contextual_score
|
||||
)
|
||||
|
||||
# Add relevance score to head_data for backward compatibility
|
||||
if contextual_score is not None:
|
||||
updated_link.head_data = updated_link.head_data or {}
|
||||
updated_link.head_data["relevance_score"] = contextual_score
|
||||
|
||||
# Calculate total score combining intrinsic and contextual scores
|
||||
updated_link.total_score = calculate_total_score(
|
||||
intrinsic_score=updated_link.intrinsic_score,
|
||||
contextual_score=updated_link.contextual_score,
|
||||
score_links_enabled=getattr(config, 'score_links', False),
|
||||
query_provided=bool(config.link_extraction_config.query)
|
||||
)
|
||||
|
||||
updated_internal.append(updated_link)
|
||||
else:
|
||||
# Keep original link unchanged
|
||||
updated_internal.append(link)
|
||||
|
||||
# Update external links
|
||||
updated_external = []
|
||||
for link in original_links.external:
|
||||
if link.href in url_to_head_data:
|
||||
head_info = url_to_head_data[link.href]
|
||||
# Create new Link object with head data and scoring
|
||||
contextual_score = head_info.get("relevance_score")
|
||||
|
||||
updated_link = Link(
|
||||
href=link.href,
|
||||
text=link.text,
|
||||
title=link.title,
|
||||
base_domain=link.base_domain,
|
||||
head_data=head_info["head_data"],
|
||||
head_extraction_status=head_info["status"],
|
||||
head_extraction_error=head_info.get("error"),
|
||||
intrinsic_score=getattr(link, 'intrinsic_score', None),
|
||||
contextual_score=contextual_score
|
||||
)
|
||||
|
||||
# Add relevance score to head_data for backward compatibility
|
||||
if contextual_score is not None:
|
||||
updated_link.head_data = updated_link.head_data or {}
|
||||
updated_link.head_data["relevance_score"] = contextual_score
|
||||
|
||||
# Calculate total score combining intrinsic and contextual scores
|
||||
updated_link.total_score = calculate_total_score(
|
||||
intrinsic_score=updated_link.intrinsic_score,
|
||||
contextual_score=updated_link.contextual_score,
|
||||
score_links_enabled=getattr(config, 'score_links', False),
|
||||
query_provided=bool(config.link_extraction_config.query)
|
||||
)
|
||||
|
||||
updated_external.append(updated_link)
|
||||
else:
|
||||
# Keep original link unchanged
|
||||
updated_external.append(link)
|
||||
|
||||
# Sort links by relevance score if available
|
||||
if any(hasattr(link, 'head_data') and link.head_data and 'relevance_score' in link.head_data
|
||||
for link in updated_internal + updated_external):
|
||||
|
||||
def get_relevance_score(link):
|
||||
if hasattr(link, 'head_data') and link.head_data and 'relevance_score' in link.head_data:
|
||||
return link.head_data['relevance_score']
|
||||
return 0.0
|
||||
|
||||
updated_internal.sort(key=get_relevance_score, reverse=True)
|
||||
updated_external.sort(key=get_relevance_score, reverse=True)
|
||||
|
||||
return Links(
|
||||
internal=updated_internal,
|
||||
external=updated_external
|
||||
)
|
||||
@@ -345,6 +345,12 @@ class Link(BaseModel):
    text: Optional[str] = ""
    title: Optional[str] = ""
    base_domain: Optional[str] = ""
    head_data: Optional[Dict[str, Any]] = None  # Head metadata extracted from link target
    head_extraction_status: Optional[str] = None  # "success", "failed", "skipped"
    head_extraction_error: Optional[str] = None  # Error message if extraction failed
    intrinsic_score: Optional[float] = None  # Quality score based on URL structure, text, and context
    contextual_score: Optional[float] = None  # BM25 relevance score based on query and head content
    total_score: Optional[float] = None  # Combined score from intrinsic and contextual scores


class Media(BaseModel):
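A short, illustrative sketch of the new fields (the values are made up; in practice they are filled in by the link extraction and scoring steps of this patch):

    from crawl4ai.models import Link

    link = Link(
        href="https://docs.example.com/api/",
        text="API reference",
        intrinsic_score=6.5,        # URL/text quality, 0-10
        contextual_score=0.42,      # BM25 relevance against the query, roughly 0-1
        total_score=6.9,            # weighted combination, see calculate_total_score
    )
    print(link.dict()["total_score"])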
@@ -2939,3 +2939,212 @@ pip install -q nest_asyncio google-colab
echo "✅ Setup complete!"
''')


# Link Quality Scoring Functions
def extract_page_context(page_title: str, headlines_text: str, meta_description: str, base_url: str) -> dict:
    """
    Extract page context for link scoring - called ONCE per page for performance.
    Parser-agnostic function that takes pre-extracted data.

    Args:
        page_title: Title of the page
        headlines_text: Combined text from h1, h2, h3 elements
        meta_description: Meta description content
        base_url: Base URL of the page

    Returns:
        Dictionary containing page context data for fast link scoring
    """
    context = {
        'terms': set(),
        'headlines': headlines_text or '',
        'meta_description': meta_description or '',
        'domain': '',
        'is_docs_site': False
    }

    try:
        from urllib.parse import urlparse
        parsed = urlparse(base_url)
        context['domain'] = parsed.netloc.lower()

        # Check if this is a documentation/reference site
        context['is_docs_site'] = any(indicator in context['domain']
                                      for indicator in ['docs.', 'api.', 'developer.', 'reference.'])

        # Create term set for fast intersection (performance optimization)
        all_text = ((page_title or '') + ' ' + context['headlines'] + ' ' + context['meta_description']).lower()
        # Simple tokenization - fast and sufficient for scoring
        context['terms'] = set(word.strip('.,!?;:"()[]{}')
                               for word in all_text.split()
                               if len(word.strip('.,!?;:"()[]{}')) > 2)

    except Exception:
        # Fail gracefully - return empty context
        pass

    return context


def calculate_link_intrinsic_score(
    link_text: str,
    url: str,
    title_attr: str,
    class_attr: str,
    rel_attr: str,
    page_context: dict
) -> float:
    """
    Ultra-fast link quality scoring using only provided data (no DOM access needed).
    Parser-agnostic function.

    Args:
        link_text: Text content of the link
        url: Link URL
        title_attr: Title attribute of the link
        class_attr: Class attribute of the link
        rel_attr: Rel attribute of the link
        page_context: Pre-computed page context from extract_page_context()

    Returns:
        Quality score (0.0 - 10.0), higher is better
    """
    score = 0.0

    try:
        # 1. ATTRIBUTE QUALITY (string analysis - very fast)
        if title_attr and len(title_attr.strip()) > 3:
            score += 1.0

        class_str = (class_attr or '').lower()
        # Navigation/important classes boost score
        if any(nav_class in class_str for nav_class in ['nav', 'menu', 'primary', 'main', 'important']):
            score += 1.5
        # Marketing/ad classes reduce score
        if any(bad_class in class_str for bad_class in ['ad', 'sponsor', 'track', 'promo', 'banner']):
            score -= 1.0

        rel_str = (rel_attr or '').lower()
        # Semantic rel values
        if any(good_rel in rel_str for good_rel in ['canonical', 'next', 'prev', 'chapter']):
            score += 1.0
        if any(bad_rel in rel_str for bad_rel in ['nofollow', 'sponsored', 'ugc']):
            score -= 0.5

        # 2. URL STRUCTURE QUALITY (string operations - very fast)
        url_lower = url.lower()

        # High-value path patterns
        if any(good_path in url_lower for good_path in ['/docs/', '/api/', '/guide/', '/tutorial/', '/reference/', '/manual/']):
            score += 2.0
        elif any(medium_path in url_lower for medium_path in ['/blog/', '/article/', '/post/', '/news/']):
            score += 1.0

        # Penalize certain patterns
        if any(bad_path in url_lower for bad_path in ['/admin/', '/login/', '/cart/', '/checkout/', '/track/', '/click/']):
            score -= 1.5

        # URL depth (shallow URLs often more important)
        url_depth = url.count('/') - 2  # Subtract protocol and domain
        if url_depth <= 2:
            score += 1.0
        elif url_depth > 5:
            score -= 0.5

        # HTTPS bonus
        if url.startswith('https://'):
            score += 0.5

        # 3. TEXT QUALITY (string analysis - very fast)
        if link_text:
            text_clean = link_text.strip()
            if len(text_clean) > 3:
                score += 1.0

            # Multi-word links are usually more descriptive
            word_count = len(text_clean.split())
            if word_count >= 2:
                score += 0.5
            if word_count >= 4:
                score += 0.5

            # Avoid generic link text
            generic_texts = ['click here', 'read more', 'more info', 'link', 'here']
            if text_clean.lower() in generic_texts:
                score -= 1.0

        # 4. CONTEXTUAL RELEVANCE (pre-computed page terms - very fast)
        if page_context.get('terms') and link_text:
            link_words = set(word.strip('.,!?;:"()[]{}').lower()
                             for word in link_text.split()
                             if len(word.strip('.,!?;:"()[]{}')) > 2)

            if link_words:
                # Calculate word overlap ratio
                overlap = len(link_words & page_context['terms'])
                if overlap > 0:
                    relevance_ratio = overlap / min(len(link_words), 10)  # Cap to avoid over-weighting
                    score += relevance_ratio * 2.0  # Up to 2 points for relevance

        # 5. DOMAIN CONTEXT BONUSES (very fast string checks)
        if page_context.get('is_docs_site', False):
            # Documentation sites: prioritize internal navigation
            if link_text and any(doc_keyword in link_text.lower()
                                 for doc_keyword in ['api', 'reference', 'guide', 'tutorial', 'example']):
                score += 1.0

    except Exception:
        # Fail gracefully - return minimal score
        score = 0.5

    # Ensure score is within reasonable bounds
    return max(0.0, min(score, 10.0))
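A hedged usage sketch of the intrinsic scorer above, with made-up attributes and assuming both helpers are exported from crawl4ai.utils as in this diff. The exact value depends on the heuristics, but a descriptive, shallow, documentation-style link like this one should land near the top of the 0-10 range (title attribute, nav class, /api/ path, https, multi-word text, and term overlap with the page all add points).

    from crawl4ai.utils import extract_page_context, calculate_link_intrinsic_score

    ctx = extract_page_context(
        page_title="Crawl4AI Documentation",
        headlines_text="API Reference Crawler Configuration",
        meta_description="Docs for the crawl4ai library.",
        base_url="https://docs.crawl4ai.com/",
    )
    score = calculate_link_intrinsic_score(
        link_text="Crawler API Reference",
        url="https://docs.crawl4ai.com/api/crawler/",
        title_attr="Full crawler API reference",
        class_attr="nav-link primary",
        rel_attr="",
        page_context=ctx,
    )
    print(round(score, 2))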


def calculate_total_score(
    intrinsic_score: Optional[float] = None,
    contextual_score: Optional[float] = None,
    score_links_enabled: bool = False,
    query_provided: bool = False
) -> float:
    """
    Calculate combined total score from intrinsic and contextual scores with smart fallbacks.

    Args:
        intrinsic_score: Quality score based on URL structure, text, and context (0-10)
        contextual_score: BM25 relevance score based on query and head content (0-1 typically)
        score_links_enabled: Whether link scoring is enabled
        query_provided: Whether a query was provided for contextual scoring

    Returns:
        Combined total score (0-10 scale)

    Scoring Logic:
        - No scoring: return 5.0 (neutral score)
        - Only intrinsic: return normalized intrinsic score
        - Only contextual: return contextual score scaled to 10
        - Both: weighted combination (70% intrinsic, 30% contextual scaled)
    """
    # Case 1: No scoring enabled at all
    if not score_links_enabled:
        return 5.0  # Neutral score - all links treated equally

    # Normalize scores to handle None values
    intrinsic = intrinsic_score if intrinsic_score is not None else 0.0
    contextual = contextual_score if contextual_score is not None else 0.0

    # Case 2: Only intrinsic scoring (no query provided or no head extraction)
    if not query_provided or contextual_score is None:
        # Use intrinsic score directly (already 0-10 scale)
        return max(0.0, min(intrinsic, 10.0))

    # Case 3: Both intrinsic and contextual scores available
    # Scale contextual score (typically 0-1) to 0-10 range
    contextual_scaled = min(contextual * 10.0, 10.0)

    # Weighted combination: 70% intrinsic (structure/content quality) + 30% contextual (query relevance)
    # This gives more weight to link quality while still considering relevance
    total = (intrinsic * 0.7) + (contextual_scaled * 0.3)

    return max(0.0, min(total, 10.0))
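A small worked example of the weighting described above (arbitrary values, assuming calculate_total_score is importable from crawl4ai.utils as in this diff):

    from crawl4ai.utils import calculate_total_score

    # Both layers available: 70% intrinsic + 30% contextual (scaled from ~0-1 to 0-10)
    total = calculate_total_score(
        intrinsic_score=6.0,
        contextual_score=0.8,
        score_links_enabled=True,
        query_provided=True,
    )
    # 6.0 * 0.7 + (0.8 * 10) * 0.3 = 4.2 + 2.4 = 6.6
    assert abs(total - 6.6) < 1e-9

    # Scoring disabled: every link gets the neutral 5.0
    assert calculate_total_score(score_links_enabled=False) == 5.0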