feat: Add advanced link head extraction with three-layer scoring system (#1)

Squashed commit from feature/link-extractor branch implementing comprehensive link analysis:

- Extract HTML head content from discovered links with parallel processing
- Three-layer scoring: Intrinsic (URL quality), Contextual (BM25), and Total scores
- New LinkExtractionConfig class for type-safe configuration
- Pattern-based filtering for internal/external links
- Comprehensive documentation and examples
This commit is contained in:
UncleCode
2025-06-27 20:06:04 +08:00
parent e528086341
commit 5c9c305dbf
10 changed files with 2126 additions and 15 deletions

View File

@@ -37,6 +37,7 @@ from .content_filter_strategy import (
)
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
from .components.crawler_monitor import CrawlerMonitor
from .link_extractor import LinkExtractor
from .async_dispatcher import (
MemoryAdaptiveDispatcher,
SemaphoreDispatcher,
@@ -141,6 +142,7 @@ __all__ = [
"SemaphoreDispatcher",
"RateLimiter",
"CrawlerMonitor",
"LinkExtractor",
"DisplayMode",
"MarkdownGenerationResult",
"Crawl4aiDockerClient",

View File

@@ -17,7 +17,7 @@ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
from .deep_crawling import DeepCrawlStrategy
from .cache_context import CacheMode
@@ -594,6 +594,101 @@ class BrowserConfig:
return config
return BrowserConfig.from_kwargs(config)
class LinkExtractionConfig:
    """Configuration for link head extraction and scoring.

    Controls which discovered links get their HTML head fetched and scored:
    inclusion of internal/external links, glob-based include/exclude filters,
    parallelism limits, and optional BM25 relevance scoring against a query.
    """

    def __init__(
        self,
        include_internal: bool = True,
        include_external: bool = False,
        include_patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
        concurrency: int = 10,
        timeout: int = 5,
        max_links: int = 100,
        query: Optional[str] = None,
        score_threshold: Optional[float] = None,
        verbose: bool = False
    ):
        """
        Initialize link extraction configuration.

        Args:
            include_internal: Whether to include same-domain links
            include_external: Whether to include different-domain links
            include_patterns: List of glob patterns to include (e.g., ["*/docs/*", "*/api/*"])
            exclude_patterns: List of glob patterns to exclude (e.g., ["*/login*", "*/admin*"])
            concurrency: Number of links to process simultaneously
            timeout: Timeout in seconds for each link's head extraction
            max_links: Maximum number of links to process (prevents overload)
            query: Query string for BM25 contextual scoring (optional)
            score_threshold: Minimum relevance score to include links (0.0-1.0, optional)
            verbose: Show detailed progress during extraction

        Raises:
            ValueError: If any numeric limit is non-positive, score_threshold is
                outside [0.0, 1.0], or both link categories are disabled.
        """
        # Validate BEFORE assigning, so a rejected call never leaves a
        # partially-initialized instance behind.
        if concurrency <= 0:
            raise ValueError("concurrency must be positive")
        if timeout <= 0:
            raise ValueError("timeout must be positive")
        if max_links <= 0:
            raise ValueError("max_links must be positive")
        if score_threshold is not None and not (0.0 <= score_threshold <= 1.0):
            raise ValueError("score_threshold must be between 0.0 and 1.0")
        if not include_internal and not include_external:
            raise ValueError("At least one of include_internal or include_external must be True")

        self.include_internal = include_internal
        self.include_external = include_external
        self.include_patterns = include_patterns
        self.exclude_patterns = exclude_patterns
        self.concurrency = concurrency
        self.timeout = timeout
        self.max_links = max_links
        self.query = query
        self.score_threshold = score_threshold
        self.verbose = verbose

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> Optional["LinkExtractionConfig"]:
        """Create LinkExtractionConfig from dictionary (for backward compatibility).

        Returns None for an empty/falsy dict, meaning "no link extraction
        configured" — hence the Optional return type.
        """
        if not config_dict:
            return None
        return LinkExtractionConfig(
            include_internal=config_dict.get("include_internal", True),
            include_external=config_dict.get("include_external", False),
            include_patterns=config_dict.get("include_patterns"),
            exclude_patterns=config_dict.get("exclude_patterns"),
            concurrency=config_dict.get("concurrency", 10),
            timeout=config_dict.get("timeout", 5),
            max_links=config_dict.get("max_links", 100),
            query=config_dict.get("query"),
            score_threshold=config_dict.get("score_threshold"),
            verbose=config_dict.get("verbose", False)
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary format (inverse of from_dict for non-None configs)."""
        return {
            "include_internal": self.include_internal,
            "include_external": self.include_external,
            "include_patterns": self.include_patterns,
            "exclude_patterns": self.exclude_patterns,
            "concurrency": self.concurrency,
            "timeout": self.timeout,
            "max_links": self.max_links,
            "query": self.query,
            "score_threshold": self.score_threshold,
            "verbose": self.verbose
        }

    def clone(self, **kwargs) -> "LinkExtractionConfig":
        """Create a copy with updated values.

        to_dict() always yields a non-empty dict, so from_dict never returns
        None here.
        """
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        return LinkExtractionConfig.from_dict(config_dict)
class HTTPCrawlerConfig:
"""HTTP-specific crawler configuration"""
@@ -829,6 +924,9 @@ class CrawlerRunConfig():
Default: [].
exclude_internal_links (bool): If True, exclude internal links from the results.
Default: False.
score_links (bool): If True, calculate intrinsic quality scores for all links using URL structure,
text quality, and contextual relevance metrics. Separate from link_extraction_config.
Default: False.
# Debugging and Logging Parameters
verbose (bool): Enable verbose logging.
@@ -939,6 +1037,7 @@ class CrawlerRunConfig():
exclude_social_media_links: bool = False,
exclude_domains: list = None,
exclude_internal_links: bool = False,
score_links: bool = False,
# Debugging and Logging Parameters
verbose: bool = True,
log_console: bool = False,
@@ -955,6 +1054,8 @@ class CrawlerRunConfig():
user_agent_generator_config: dict = {},
# Deep Crawl Parameters
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
# Link Extraction Parameters
link_extraction_config: Union[LinkExtractionConfig, Dict[str, Any]] = None,
# Experimental Parameters
experimental: Dict[str, Any] = None,
):
@@ -976,7 +1077,7 @@ class CrawlerRunConfig():
self.remove_forms = remove_forms
self.prettiify = prettiify
self.parser_type = parser_type
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy()
self.proxy_config = proxy_config
self.proxy_rotation_strategy = proxy_rotation_strategy
@@ -1042,6 +1143,7 @@ class CrawlerRunConfig():
self.exclude_social_media_links = exclude_social_media_links
self.exclude_domains = exclude_domains or []
self.exclude_internal_links = exclude_internal_links
self.score_links = score_links
# Debugging and Logging Parameters
self.verbose = verbose
@@ -1084,6 +1186,17 @@ class CrawlerRunConfig():
# Deep Crawl Parameters
self.deep_crawl_strategy = deep_crawl_strategy
# Link Extraction Parameters
if link_extraction_config is None:
self.link_extraction_config = None
elif isinstance(link_extraction_config, LinkExtractionConfig):
self.link_extraction_config = link_extraction_config
elif isinstance(link_extraction_config, dict):
# Convert dict to config object for backward compatibility
self.link_extraction_config = LinkExtractionConfig.from_dict(link_extraction_config)
else:
raise ValueError("link_extraction_config must be LinkExtractionConfig object or dict")
# Experimental Parameters
self.experimental = experimental or {}
@@ -1241,6 +1354,7 @@ class CrawlerRunConfig():
exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
exclude_domains=kwargs.get("exclude_domains", []),
exclude_internal_links=kwargs.get("exclude_internal_links", False),
score_links=kwargs.get("score_links", False),
# Debugging and Logging Parameters
verbose=kwargs.get("verbose", True),
log_console=kwargs.get("log_console", False),
@@ -1256,6 +1370,8 @@ class CrawlerRunConfig():
user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
# Deep Crawl Parameters
deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
# Link Extraction Parameters
link_extraction_config=kwargs.get("link_extraction_config"),
url=kwargs.get("url"),
# Experimental Parameters
experimental=kwargs.get("experimental"),
@@ -1339,6 +1455,7 @@ class CrawlerRunConfig():
"exclude_social_media_links": self.exclude_social_media_links,
"exclude_domains": self.exclude_domains,
"exclude_internal_links": self.exclude_internal_links,
"score_links": self.score_links,
"verbose": self.verbose,
"log_console": self.log_console,
"capture_network_requests": self.capture_network_requests,
@@ -1350,6 +1467,7 @@ class CrawlerRunConfig():
"user_agent_mode": self.user_agent_mode,
"user_agent_generator_config": self.user_agent_generator_config,
"deep_crawl_strategy": self.deep_crawl_strategy,
"link_extraction_config": self.link_extraction_config.to_dict() if self.link_extraction_config else None,
"url": self.url,
"experimental": self.experimental,
}

View File

@@ -109,12 +109,16 @@ def _parse_head(src: str) -> Dict[str, Any]:
elif "charset" in el.attrib:
info["charset"] = el.attrib["charset"].lower()
for el in doc.xpath(".//link"):
rel = " ".join(el.attrib.get("rel", [])).lower()
if not rel:
rel_attr = el.attrib.get("rel", "")
if not rel_attr:
continue
# Handle multiple space-separated rel values
rel_values = rel_attr.lower().split()
entry = {a: el.attrib[a] for a in (
"href", "as", "type", "hreflang") if a in el.attrib}
info["link"].setdefault(rel, []).append(entry)
# Add entry for each rel value
for rel in rel_values:
info["link"].setdefault(rel, []).append(entry)
# Extract JSON-LD structured data
for script in doc.xpath('.//script[@type="application/ld+json"]'):
if script.text:
@@ -467,6 +471,200 @@ class AsyncUrlSeeder:
"info", "Finished URL seeding for multiple domains.", tag="URL_SEED")
return final_results
async def extract_head_for_urls(
    self,
    urls: List[str],
    config: Optional["SeedingConfig"] = None,
    concurrency: int = 10,
    timeout: int = 5
) -> List[Dict[str, Any]]:
    """
    Extract head content for a custom list of URLs using URLSeeder's parallel processing.

    This method reuses URLSeeder's efficient parallel processing, caching, and head
    extraction logic (via self._validate) to process a caller-supplied list of URLs
    rather than discovering URLs from sources.  Duplicate input URLs are skipped.

    Parameters
    ----------
    urls : List[str]
        List of URLs to extract head content from
    config : SeedingConfig, optional
        Configuration object. If None, uses default settings for head extraction.
        Note: concurrency and extract_head are always overridden from this
        method's arguments, even on a caller-supplied config (it is mutated).
    concurrency : int, default=10
        Number of concurrent worker tasks
    timeout : int, default=5
        Timeout for each request in seconds

    Returns
    -------
    List[Dict[str, Any]]
        One dict per processed URL containing url, status, head_data, and an
        optional relevance_score.  Sorted by relevance_score (descending) when
        any entry carries a score.
    """
    # Create default config if none provided
    if config is None:
        # Import here to avoid circular imports
        from .async_configs import SeedingConfig
        config = SeedingConfig(
            extract_head=True,
            concurrency=concurrency,
            verbose=False
        )
    # Override concurrency and ensure head extraction is enabled
    config.concurrency = concurrency
    config.extract_head = True

    self._log("info", "Starting head extraction for {count} custom URLs",
              params={"count": len(urls)}, tag="URL_SEED")

    # Setup rate limiting if specified in config.  A semaphore sized to
    # hits_per_sec is used as the limiter; negative values are truthy, so
    # the inner guard catches and disables them with a warning.
    if config.hits_per_sec:
        if config.hits_per_sec <= 0:
            self._log("warning", "hits_per_sec must be positive. Disabling rate limiting.", tag="URL_SEED")
            self._rate_sem = None
        else:
            self._rate_sem = asyncio.Semaphore(config.hits_per_sec)
    else:
        self._rate_sem = None

    # Use bounded queue to prevent memory issues with large URL lists:
    # between 1,000 and 10,000 slots, scaled by concurrency.
    queue_size = min(10000, max(1000, concurrency * 100))
    queue: asyncio.Queue = asyncio.Queue(maxsize=queue_size)
    producer_done = asyncio.Event()
    # NOTE(review): stop_event is checked but never set in this method —
    # presumably reserved for external cancellation; confirm.
    stop_event = asyncio.Event()
    seen: set[str] = set()

    # Results collection, appended to by workers.  Single event loop, so no
    # lock is needed around the shared list.
    results: List[Dict[str, Any]] = []

    async def producer():
        """Producer to feed unique URLs into the queue; signals completion via event."""
        try:
            for url in urls:
                if url in seen:
                    self._log("debug", "Skipping duplicate URL: {url}",
                              params={"url": url}, tag="URL_SEED")
                    continue
                if stop_event.is_set():
                    break
                seen.add(url)
                await queue.put(url)
        finally:
            # Always signal, even on error, so workers can drain and exit.
            producer_done.set()

    async def worker(res_list: List[Dict[str, Any]]):
        """Worker to process URLs from the queue until it drains."""
        while True:
            try:
                # Poll with a short timeout so producer completion is noticed.
                url = await asyncio.wait_for(queue.get(), timeout=1.0)
            except asyncio.TimeoutError:
                if producer_done.is_set() and queue.empty():
                    break
                continue
            try:
                # Use existing _validate method which handles head extraction,
                # caching, etc., and appends its result dict to res_list.
                await self._validate(
                    url, res_list,
                    live=False,  # We're not doing live checks, just head extraction
                    extract=True,  # Always extract head content
                    timeout=timeout,
                    verbose=config.verbose or False,
                    query=config.query,
                    score_threshold=config.score_threshold,
                    scoring_method=config.scoring_method or "bm25",
                    filter_nonsense=config.filter_nonsense_urls
                )
            except Exception as e:
                self._log("error", "Failed to process URL {url}: {error}",
                          params={"url": url, "error": str(e)}, tag="URL_SEED")
                # Record the failure so every input URL is accounted for.
                res_list.append({
                    "url": url,
                    "status": "failed",
                    "head_data": {},
                    "error": str(e)
                })
            finally:
                queue.task_done()

    # Start producer
    producer_task = asyncio.create_task(producer())
    # Start workers
    worker_tasks = []
    for _ in range(concurrency):
        worker_task = asyncio.create_task(worker(results))
        worker_tasks.append(worker_task)

    # Wait for producer to finish enqueueing, then for every queued item to
    # be marked done (via task_done in each worker).
    await producer_task
    await queue.join()

    # Workers loop forever; cancel them once the queue has fully drained.
    for task in worker_tasks:
        task.cancel()
    # Wait for workers to finish canceling
    await asyncio.gather(*worker_tasks, return_exceptions=True)

    # NOTE(review): _validate may already attach relevance scores when a query
    # is passed; this re-scores from head_data — confirm double scoring is intended.
    if config.query and config.scoring_method == "bm25":
        results = await self._apply_bm25_scoring(results, config)

    # Apply score threshold filtering (entries without a score count as 0).
    if config.score_threshold is not None:
        results = [r for r in results if r.get("relevance_score", 0) >= config.score_threshold]

    # Sort highest-relevance first when any entry was scored.
    if any("relevance_score" in r for r in results):
        results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)

    self._log("info", "Completed head extraction for {count} URLs, {success} successful",
              params={
                  "count": len(urls),
                  "success": len([r for r in results if r.get("status") == "valid"])
              }, tag="URL_SEED")
    return results
async def _apply_bm25_scoring(self, results: List[Dict[str, Any]], config: "SeedingConfig") -> List[Dict[str, Any]]:
    """Attach relevance scores to head-extraction results.

    Valid results whose head_data yields usable text are BM25-scored in one
    batch against config.query; valid results without usable text fall back
    to a URL-based relevance heuristic.  Non-valid results are untouched.
    """
    if not HAS_BM25:
        self._log("warning", "BM25 scoring requested but rank_bm25 not available", tag="URL_SEED")
        return results

    # Partition valid results: those with extractable text are scored together
    # via BM25; the rest get the URL-based fallback score immediately.
    corpus: List[str] = []
    text_scored: List[Dict[str, Any]] = []
    for entry in results:
        if entry.get("status") != "valid":
            continue
        head = entry.get("head_data")
        context = self._extract_text_context(head) if head else None
        if context:
            corpus.append(context)
            text_scored.append(entry)
        else:
            fallback = self._calculate_url_relevance_score(config.query, entry["url"])
            entry["relevance_score"] = float(fallback)

    # Run the (CPU-bound) BM25 computation off the event loop.
    if corpus and text_scored:
        bm25_scores = await asyncio.to_thread(self._calculate_bm25_score, config.query, corpus)
        for entry, score in zip(text_scored, bm25_scores):
            entry["relevance_score"] = float(score)
    return results
async def _resolve_head(self, url: str) -> Optional[str]:
"""
HEAD-probe a URL.

View File

@@ -23,6 +23,8 @@ from .utils import (
is_external_url,
get_base_domain,
extract_metadata_using_lxml,
extract_page_context,
calculate_link_intrinsic_score,
)
from lxml import etree
from lxml import html as lhtml
@@ -944,6 +946,72 @@ class WebScrapingStrategy(ContentScrapingStrategy):
# Update the links dictionary with unique links
links["internal"] = list(internal_links_dict.values())
links["external"] = list(external_links_dict.values())
# Extract head content for links if configured
link_extraction_config = kwargs.get("link_extraction_config")
if link_extraction_config is not None:
try:
import asyncio
from .link_extractor import LinkExtractor
from .models import Links, Link
verbose = link_extraction_config.verbose
if verbose:
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
# Convert dict links to Link objects
internal_links = [Link(**link_data) for link_data in links["internal"]]
external_links = [Link(**link_data) for link_data in links["external"]]
links_obj = Links(internal=internal_links, external=external_links)
# Create a config object for LinkExtractor
class TempCrawlerRunConfig:
def __init__(self, link_config, score_links):
self.link_extraction_config = link_config
self.score_links = score_links
config = TempCrawlerRunConfig(link_extraction_config, kwargs.get("score_links", False))
# Extract head content (run async operation in sync context)
async def extract_links():
async with LinkExtractor(self.logger) as extractor:
return await extractor.extract_link_heads(links_obj, config)
# Run the async operation
try:
# Check if we're already in an async context
loop = asyncio.get_running_loop()
# If we're in an async context, we need to run in a thread
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor() as executor:
future = executor.submit(asyncio.run, extract_links())
updated_links = future.result()
except RuntimeError:
# No running loop, we can use asyncio.run directly
updated_links = asyncio.run(extract_links())
# Convert back to dict format
links["internal"] = [link.dict() for link in updated_links.internal]
links["external"] = [link.dict() for link in updated_links.external]
if verbose:
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
params={
"internal_success": successful_internal,
"internal_total": len(updated_links.internal),
"external_success": successful_external,
"external_total": len(updated_links.external)
}, tag="LINK_EXTRACT")
else:
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
except Exception as e:
self._log("error", f"Link head extraction failed: {str(e)}", tag="LINK_EXTRACT")
# Continue with original links if extraction fails
# # Process images using ThreadPoolExecutor
imgs = body.find_all("img")
@@ -1037,6 +1105,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
media: Dict[str, List],
internal_links_dict: Dict[str, Any],
external_links_dict: Dict[str, Any],
page_context: dict = None,
**kwargs,
) -> bool:
base_domain = kwargs.get("base_domain", get_base_domain(url))
@@ -1056,6 +1125,25 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
"title": link.get("title", "").strip(),
"base_domain": base_domain,
}
# Add intrinsic scoring if enabled
if kwargs.get("score_links", False) and page_context is not None:
try:
intrinsic_score = calculate_link_intrinsic_score(
link_text=link_data["text"],
url=normalized_href,
title_attr=link_data["title"],
class_attr=link.get("class", ""),
rel_attr=link.get("rel", ""),
page_context=page_context
)
link_data["intrinsic_score"] = intrinsic_score
except Exception:
# Fail gracefully - assign default score
link_data["intrinsic_score"] = float('inf')
else:
# No scoring enabled - assign infinity (all links equal priority)
link_data["intrinsic_score"] = float('inf')
is_external = is_external_url(normalized_href, base_domain)
if is_external:
@@ -1491,6 +1579,33 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
base_domain = get_base_domain(url)
# Extract page context for link scoring (if enabled) - do this BEFORE any removals
page_context = None
if kwargs.get("score_links", False):
try:
# Extract title
title_elements = doc.xpath('//title')
page_title = title_elements[0].text_content() if title_elements else ""
# Extract headlines
headlines = []
for tag in ['h1', 'h2', 'h3']:
elements = doc.xpath(f'//{tag}')
for el in elements:
text = el.text_content().strip()
if text:
headlines.append(text)
headlines_text = ' '.join(headlines)
# Extract meta description
meta_desc_elements = doc.xpath('//meta[@name="description"]/@content')
meta_description = meta_desc_elements[0] if meta_desc_elements else ""
# Create page context
page_context = extract_page_context(page_title, headlines_text, meta_description, url)
except Exception:
page_context = {} # Fail gracefully
# Early removal of all images if exclude_all_images is set
# This is more efficient in lxml as we remove elements before any processing
if kwargs.get("exclude_all_images", False):
@@ -1579,6 +1694,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
media,
internal_links_dict,
external_links_dict,
page_context=page_context,
base_domain=base_domain,
**kwargs,
)
@@ -1623,14 +1739,84 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
method="html",
with_tail=False,
).strip()
# Create links dictionary in the format expected by LinkExtractor
links = {
"internal": list(internal_links_dict.values()),
"external": list(external_links_dict.values()),
}
# Extract head content for links if configured
link_extraction_config = kwargs.get("link_extraction_config")
if link_extraction_config is not None:
try:
import asyncio
from .link_extractor import LinkExtractor
from .models import Links, Link
verbose = link_extraction_config.verbose
if verbose:
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
# Convert dict links to Link objects
internal_links = [Link(**link_data) for link_data in links["internal"]]
external_links = [Link(**link_data) for link_data in links["external"]]
links_obj = Links(internal=internal_links, external=external_links)
# Create a config object for LinkExtractor
class TempCrawlerRunConfig:
def __init__(self, link_config, score_links):
self.link_extraction_config = link_config
self.score_links = score_links
config = TempCrawlerRunConfig(link_extraction_config, kwargs.get("score_links", False))
# Extract head content (run async operation in sync context)
async def extract_links():
async with LinkExtractor(self.logger) as extractor:
return await extractor.extract_link_heads(links_obj, config)
# Run the async operation
try:
# Check if we're already in an async context
loop = asyncio.get_running_loop()
# If we're in an async context, we need to run in a thread
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor() as executor:
future = executor.submit(asyncio.run, extract_links())
updated_links = future.result()
except RuntimeError:
# No running loop, we can use asyncio.run directly
updated_links = asyncio.run(extract_links())
# Convert back to dict format
links["internal"] = [link.dict() for link in updated_links.internal]
links["external"] = [link.dict() for link in updated_links.external]
if verbose:
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
params={
"internal_success": successful_internal,
"internal_total": len(updated_links.internal),
"external_success": successful_external,
"external_total": len(updated_links.external)
}, tag="LINK_EXTRACT")
else:
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
except Exception as e:
self._log("error", f"Error during link head extraction: {str(e)}", tag="LINK_EXTRACT")
# Continue with original links if head extraction fails
return {
"cleaned_html": cleaned_html,
"success": success,
"media": media,
"links": {
"internal": list(internal_links_dict.values()),
"external": list(external_links_dict.values()),
},
"links": links,
"metadata": meta,
}

395
crawl4ai/link_extractor.py Normal file
View File

@@ -0,0 +1,395 @@
"""
Link Extractor for Crawl4AI
Extracts head content from links discovered during crawling using URLSeeder's
efficient parallel processing and caching infrastructure.
"""
import asyncio
import fnmatch
from typing import Dict, List, Optional, Any
from .async_logger import AsyncLogger
from .async_url_seeder import AsyncUrlSeeder
from .async_configs import SeedingConfig, CrawlerRunConfig
from .models import Links, Link
from .utils import calculate_total_score
class LinkExtractor:
"""
Extracts head content from links using URLSeeder's parallel processing infrastructure.
This class provides intelligent link filtering and head content extraction with:
- Pattern-based inclusion/exclusion filtering
- Parallel processing with configurable concurrency
- Caching for performance
- BM25 relevance scoring
- Memory-safe processing for large link sets
"""
def __init__(self, logger: Optional[AsyncLogger] = None):
    """Create an extractor that lazily builds its own AsyncUrlSeeder.

    Args:
        logger: Optional logger instance for recording events
    """
    self.logger = logger
    # The seeder is created on start(); _owns_seeder records whether this
    # instance is responsible for tearing it down in close().
    self._owns_seeder = False
    self.seeder: Optional[AsyncUrlSeeder] = None
async def __aenter__(self):
    """Enter the async context: spin up the underlying seeder."""
    await self.start()
    return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Exit the async context: release any seeder this instance owns."""
    await self.close()
async def start(self):
    """Initialize the URLSeeder instance (idempotent)."""
    if self.seeder:
        return  # Already started — a second call is a no-op.
    self.seeder = AsyncUrlSeeder(logger=self.logger)
    await self.seeder.__aenter__()
    self._owns_seeder = True
async def close(self):
    """Tear down the seeder, but only if this instance created it."""
    if not (self.seeder and self._owns_seeder):
        return
    await self.seeder.__aexit__(None, None, None)
    self.seeder = None
    self._owns_seeder = False
def _log(self, level: str, message: str, tag: str = "LINK_EXTRACT", **kwargs):
    """Safely forward a message to the configured logger, if any.

    Silently does nothing when no logger is set or the logger lacks the
    requested level method.
    """
    if not self.logger:
        return
    emit = getattr(self.logger, level, None)
    if emit is None:
        return
    emit(message=message, tag=tag, params=kwargs.get('params', {}))
async def extract_link_heads(
    self,
    links: Links,
    config: CrawlerRunConfig
) -> Links:
    """
    Extract head content for filtered links and attach it to Link objects.

    Args:
        links: Links object containing internal and external links
        config: CrawlerRunConfig with link_extraction_config settings

    Returns:
        Links object with head_data attached to filtered Link objects
    """
    link_config = config.link_extraction_config

    # Bring up the seeder lazily, in case the caller skipped the context manager.
    await self.start()

    candidates = self._filter_links(links, link_config)
    if not candidates:
        self._log("info", "No links matched filtering criteria")
        return links

    self._log("info", "Extracting head content for {count} filtered links",
              params={"count": len(candidates)})

    head_results = await self._extract_heads_parallel(candidates, link_config)

    # Fold the extraction results back into fresh Link objects.
    merged = self._merge_head_data(links, head_results, config)

    ok = sum(1 for r in head_results if r.get("status") == "valid")
    self._log("info", "Completed head extraction for links, {success} successful",
              params={"success": ok})
    return merged
def _filter_links(self, links: Links, link_config: "LinkExtractionConfig") -> List[str]:
    """
    Filter links based on configuration parameters.

    Selection order: collect internal/external hrefs, apply include then
    exclude glob patterns, de-duplicate (order-preserving), and finally
    truncate to max_links.  De-duplication happens BEFORE truncation so
    duplicate URLs cannot consume max_links slots (the original order —
    truncate then de-dupe — could return fewer than max_links unique URLs
    even when more were available).

    Args:
        links: Links object containing internal and external links
        link_config: LinkExtractionConfig driving the filtering (accessed
            by attribute, not as a dict)

    Returns:
        Ordered list of unique, filtered URL strings
    """
    candidates: List[str] = []

    # Include internal links if configured
    if link_config.include_internal:
        candidates.extend(link.href for link in links.internal if link.href)
        self._log("debug", "Added {count} internal links",
                  params={"count": len(links.internal)})

    # Include external links if configured
    if link_config.include_external:
        candidates.extend(link.href for link in links.external if link.href)
        self._log("debug", "Added {count} external links",
                  params={"count": len(links.external)})

    # Apply include patterns (a URL must match at least one)
    include_patterns = link_config.include_patterns
    if include_patterns:
        candidates = [
            url for url in candidates
            if any(fnmatch.fnmatch(url, pattern) for pattern in include_patterns)
        ]
        self._log("debug", "After include patterns: {count} links remain",
                  params={"count": len(candidates)})

    # Apply exclude patterns (a URL matching any is dropped)
    exclude_patterns = link_config.exclude_patterns
    if exclude_patterns:
        candidates = [
            url for url in candidates
            if not any(fnmatch.fnmatch(url, pattern) for pattern in exclude_patterns)
        ]
        self._log("debug", "After exclude patterns: {count} links remain",
                  params={"count": len(candidates)})

    # Order-preserving de-duplication (dict keys keep insertion order).
    unique_urls = list(dict.fromkeys(candidates))

    # Truncate AFTER de-duplication so the cap counts unique URLs.
    max_links = link_config.max_links
    if max_links > 0 and len(unique_urls) > max_links:
        unique_urls = unique_urls[:max_links]
        self._log("debug", "Limited to {max_links} links",
                  params={"max_links": max_links})

    self._log("debug", "Final filtered URLs: {count} unique links",
              params={"count": len(unique_urls)})
    return unique_urls
async def _extract_heads_parallel(
    self,
    urls: List[str],
    link_config: Dict[str, Any]
) -> List[Dict[str, Any]]:
    """
    Extract head content for URLs using URLSeeder's parallel processing.

    Args:
        urls: List of URLs to process
        link_config: Link-extraction settings object (accessed by attribute
            despite the historical Dict annotation)

    Returns:
        List of dictionaries with url, status, head_data, and optional relevance_score
    """
    verbose = link_config.verbose
    concurrency = link_config.concurrency

    if verbose:
        self._log("info", "Starting batch processing: {total} links with {concurrency} concurrent workers",
                  params={"total": len(urls), "concurrency": concurrency})

    # Translate the link-extraction settings into a SeedingConfig the
    # underlying URLSeeder understands.  BM25 scoring only makes sense
    # when a query was supplied.
    seeding_config = SeedingConfig(
        extract_head=True,
        concurrency=concurrency,
        hits_per_sec=getattr(link_config, 'hits_per_sec', None),
        query=link_config.query,
        score_threshold=link_config.score_threshold,
        scoring_method="bm25" if link_config.query else None,
        verbose=verbose
    )

    # Quiet mode goes straight to the seeder; verbose mode routes through
    # the progress-reporting wrapper.
    if not verbose:
        return await self.seeder.extract_head_for_urls(
            urls=urls,
            config=seeding_config,
            concurrency=concurrency,
            timeout=link_config.timeout
        )
    return await self._extract_with_progress(urls, seeding_config, link_config)
async def _extract_with_progress(
    self,
    urls: List[str],
    seeding_config: SeedingConfig,
    link_config: Dict[str, Any]
) -> List[Dict[str, Any]]:
    """Extract head content and log a summary progress report.

    NOTE: true incremental progress would require URLSeeder to accept a
    progress callback; until then this logs once before the batch and once
    after with final counts.  (Removed from the original: an unused
    batch_size local and a manual counting loop replaced by comprehensions.)

    Args:
        urls: List of URLs to process
        seeding_config: SeedingConfig passed through to the seeder
        link_config: Link-extraction settings object (concurrency/timeout
            read by attribute)

    Returns:
        List of per-URL result dictionaries from the seeder
    """
    total_urls = len(urls)

    self._log("info", "Processing links in batches...")

    # Delegate the actual work to the seeder's parallel pipeline.
    results = await self.seeder.extract_head_for_urls(
        urls=urls,
        config=seeding_config,
        concurrency=link_config.concurrency,
        timeout=link_config.timeout
    )

    # Summarize outcomes for the final report.
    successful = sum(1 for r in results if r.get("status") == "valid")
    failed = len(results) - successful

    self._log("info", "Batch processing completed: {completed}/{total} processed, {successful} successful, {failed} failed",
              params={
                  "completed": len(results),
                  "total": total_urls,
                  "successful": successful,
                  "failed": failed
              })
    return results
def _merge_head_data(
    self,
    original_links: Links,
    head_results: List[Dict[str, Any]],
    config: CrawlerRunConfig
) -> Links:
    """
    Merge head extraction results back into Link objects.

    Args:
        original_links: Links object discovered during the crawl.
        head_results: Per-URL results from head extraction; each entry may
            carry url, head_data, status, error and relevance_score keys.
        config: Active run config; supplies `score_links` and the
            `link_extraction_config.query` used for total-score calculation.

    Returns:
        Links object with head_data and the three scores (intrinsic,
        contextual, total) attached to matching links. Both lists are
        sorted by relevance score (descending) when any link has one.
    """
    # Index extraction results by URL for O(1) lookup while walking links.
    url_to_head_data = {}
    for result in head_results:
        url = result.get("url")
        if url:
            url_to_head_data[url] = {
                "head_data": result.get("head_data", {}),
                "status": result.get("status", "unknown"),
                "error": result.get("error"),
                "relevance_score": result.get("relevance_score")
            }

    def enrich(link):
        """Return the link enriched with head data/scores, or unchanged when no result exists."""
        head_info = url_to_head_data.get(link.href)
        if head_info is None:
            return link
        contextual_score = head_info.get("relevance_score")
        updated_link = Link(
            href=link.href,
            text=link.text,
            title=link.title,
            base_domain=link.base_domain,
            head_data=head_info["head_data"],
            head_extraction_status=head_info["status"],
            head_extraction_error=head_info.get("error"),
            intrinsic_score=getattr(link, 'intrinsic_score', None),
            contextual_score=contextual_score
        )
        # Mirror the relevance score into head_data for backward compatibility.
        if contextual_score is not None:
            updated_link.head_data = updated_link.head_data or {}
            updated_link.head_data["relevance_score"] = contextual_score
        # Combine intrinsic (URL/text quality) and contextual (BM25) signals.
        updated_link.total_score = calculate_total_score(
            intrinsic_score=updated_link.intrinsic_score,
            contextual_score=updated_link.contextual_score,
            score_links_enabled=getattr(config, 'score_links', False),
            query_provided=bool(config.link_extraction_config.query)
        )
        return updated_link

    # Same enrichment path for both link groups (previously duplicated inline).
    updated_internal = [enrich(link) for link in original_links.internal]
    updated_external = [enrich(link) for link in original_links.external]

    def get_relevance_score(link):
        if hasattr(link, 'head_data') and link.head_data and 'relevance_score' in link.head_data:
            return link.head_data['relevance_score']
        return 0.0

    # Sort by relevance only when at least one link actually carries a score,
    # so unscored runs keep their original document order.
    has_relevance = any(
        hasattr(link, 'head_data') and link.head_data and 'relevance_score' in link.head_data
        for link in updated_internal + updated_external
    )
    if has_relevance:
        updated_internal.sort(key=get_relevance_score, reverse=True)
        updated_external.sort(key=get_relevance_score, reverse=True)
    return Links(
        internal=updated_internal,
        external=updated_external
    )

View File

@@ -345,6 +345,12 @@ class Link(BaseModel):
text: Optional[str] = ""
title: Optional[str] = ""
base_domain: Optional[str] = ""
head_data: Optional[Dict[str, Any]] = None # Head metadata extracted from link target
head_extraction_status: Optional[str] = None # "success", "failed", "skipped"
head_extraction_error: Optional[str] = None # Error message if extraction failed
intrinsic_score: Optional[float] = None # Quality score based on URL structure, text, and context
contextual_score: Optional[float] = None # BM25 relevance score based on query and head content
total_score: Optional[float] = None # Combined score from intrinsic and contextual scores
class Media(BaseModel):

View File

@@ -2939,3 +2939,212 @@ pip install -q nest_asyncio google-colab
echo "✅ Setup complete!"
''')
# Link Quality Scoring Functions
def extract_page_context(page_title: str, headlines_text: str, meta_description: str, base_url: str) -> dict:
"""
Extract page context for link scoring - called ONCE per page for performance.
Parser-agnostic function that takes pre-extracted data.
Args:
page_title: Title of the page
headlines_text: Combined text from h1, h2, h3 elements
meta_description: Meta description content
base_url: Base URL of the page
Returns:
Dictionary containing page context data for fast link scoring
"""
context = {
'terms': set(),
'headlines': headlines_text or '',
'meta_description': meta_description or '',
'domain': '',
'is_docs_site': False
}
try:
from urllib.parse import urlparse
parsed = urlparse(base_url)
context['domain'] = parsed.netloc.lower()
# Check if this is a documentation/reference site
context['is_docs_site'] = any(indicator in context['domain']
for indicator in ['docs.', 'api.', 'developer.', 'reference.'])
# Create term set for fast intersection (performance optimization)
all_text = ((page_title or '') + ' ' + context['headlines'] + ' ' + context['meta_description']).lower()
# Simple tokenization - fast and sufficient for scoring
context['terms'] = set(word.strip('.,!?;:"()[]{}')
for word in all_text.split()
if len(word.strip('.,!?;:"()[]{}')) > 2)
except Exception:
# Fail gracefully - return empty context
pass
return context
def calculate_link_intrinsic_score(
    link_text: str,
    url: str,
    title_attr: str,
    class_attr: str,
    rel_attr: str,
    page_context: dict
) -> float:
    """
    Ultra-fast link quality scoring using only provided data (no DOM access needed).
    Parser-agnostic function.

    Args:
        link_text: Text content of the link
        url: Link URL
        title_attr: Title attribute of the link
        class_attr: Class attribute of the link
        rel_attr: Rel attribute of the link
        page_context: Pre-computed page context from extract_page_context()

    Returns:
        Quality score (0.0 - 10.0), higher is better
    """
    punctuation = '.,!?;:"()[]{}'
    total = 0.0
    try:
        # --- 1. Attribute quality (pure string checks) ---
        if title_attr and len(title_attr.strip()) > 3:
            total += 1.0
        classes = (class_attr or '').lower()
        # Navigation/important classes raise the score; ad/tracking classes lower it.
        if any(marker in classes for marker in ('nav', 'menu', 'primary', 'main', 'important')):
            total += 1.5
        if any(marker in classes for marker in ('ad', 'sponsor', 'track', 'promo', 'banner')):
            total -= 1.0
        rels = (rel_attr or '').lower()
        if any(marker in rels for marker in ('canonical', 'next', 'prev', 'chapter')):
            total += 1.0
        if any(marker in rels for marker in ('nofollow', 'sponsored', 'ugc')):
            total -= 0.5
        # --- 2. URL structure quality ---
        lowered_url = url.lower()
        if any(fragment in lowered_url for fragment in ('/docs/', '/api/', '/guide/', '/tutorial/', '/reference/', '/manual/')):
            total += 2.0
        elif any(fragment in lowered_url for fragment in ('/blog/', '/article/', '/post/', '/news/')):
            total += 1.0
        if any(fragment in lowered_url for fragment in ('/admin/', '/login/', '/cart/', '/checkout/', '/track/', '/click/')):
            total -= 1.5
        # Shallow URLs tend to be more important; subtract protocol + domain slashes.
        depth = url.count('/') - 2
        if depth <= 2:
            total += 1.0
        elif depth > 5:
            total -= 0.5
        if url.startswith('https://'):
            total += 0.5
        # --- 3. Link text quality ---
        if link_text:
            cleaned = link_text.strip()
            if len(cleaned) > 3:
                total += 1.0
            # Multi-word anchors are usually more descriptive.
            word_total = len(cleaned.split())
            if word_total >= 2:
                total += 0.5
            if word_total >= 4:
                total += 0.5
            # Penalize generic anchor text.
            if cleaned.lower() in ('click here', 'read more', 'more info', 'link', 'here'):
                total -= 1.0
        # --- 4. Contextual relevance against pre-computed page terms ---
        page_terms = page_context.get('terms')
        if page_terms and link_text:
            anchor_terms = {
                word.strip(punctuation).lower()
                for word in link_text.split()
                if len(word.strip(punctuation)) > 2
            }
            if anchor_terms:
                shared = len(anchor_terms & page_terms)
                if shared > 0:
                    # Cap the denominator so long anchors are not over-weighted.
                    ratio = shared / min(len(anchor_terms), 10)
                    total += ratio * 2.0  # Up to 2 points for relevance
        # --- 5. Docs-site bonus for doc-flavored anchors ---
        if page_context.get('is_docs_site', False):
            if link_text and any(keyword in link_text.lower()
                                 for keyword in ('api', 'reference', 'guide', 'tutorial', 'example')):
                total += 1.0
    except Exception:
        # Fail gracefully - return a minimal score instead of raising.
        total = 0.5
    # Clamp to the documented 0-10 range.
    return max(0.0, min(total, 10.0))
def calculate_total_score(
    intrinsic_score: Optional[float] = None,
    contextual_score: Optional[float] = None,
    score_links_enabled: bool = False,
    query_provided: bool = False
) -> float:
    """
    Combine intrinsic and contextual link scores into one 0-10 total.

    Args:
        intrinsic_score: Quality score based on URL structure, text, and context (0-10)
        contextual_score: BM25 relevance score based on query and head content (0-1 typically)
        score_links_enabled: Whether link scoring is enabled
        query_provided: Whether a query was provided for contextual scoring

    Returns:
        Combined total score (0-10 scale)

    Behavior:
        - Scoring disabled: every link gets the neutral score 5.0.
        - No query, or no contextual score: the intrinsic score is returned
          directly, clamped to [0, 10] (a missing intrinsic score counts as 0).
        - Both signals present: weighted blend of 70% intrinsic quality and
          30% contextual relevance (contextual is first scaled up to 0-10).
    """
    if not score_links_enabled:
        # Neutral score - all links treated equally when scoring is off.
        return 5.0
    quality = 0.0 if intrinsic_score is None else intrinsic_score
    if not query_provided or contextual_score is None:
        # Intrinsic-only path; the value is already on the 0-10 scale.
        return max(0.0, min(quality, 10.0))
    # Scale contextual (typically 0-1) onto the same 0-10 range, then blend.
    # Quality gets the larger weight; relevance refines the ranking.
    relevance = min(contextual_score * 10.0, 10.0)
    blended = (quality * 0.7) + (relevance * 0.3)
    return max(0.0, min(blended, 10.0))

View File

@@ -0,0 +1,376 @@
#!/usr/bin/env python3
"""
Link Head Extraction & Scoring Example
This example demonstrates Crawl4AI's advanced link analysis capabilities:
1. Basic link head extraction
2. Three-layer scoring system (intrinsic, contextual, total)
3. Pattern-based filtering
4. Multiple practical use cases
Requirements:
- crawl4ai installed
- Internet connection
Usage:
python link_head_extraction_example.py
"""
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.async_configs import LinkExtractionConfig
async def basic_link_head_extraction():
    """
    Basic example: Extract head content from internal links with scoring.

    Crawls https://docs.python.org/3/, extracts <head> metadata for up to
    five internal links, and prints the intrinsic/contextual/total scores
    plus title and description for the first three enriched links.

    Side effects: network access and console output only; returns None.
    """
    print("🔗 Basic Link Head Extraction Example")
    print("=" * 50)
    config = CrawlerRunConfig(
        # Enable link head extraction
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,  # Process internal links
            include_external=False,  # Skip external links for this demo
            max_links=5,  # Limit to 5 links
            concurrency=3,  # Process 3 links simultaneously
            timeout=10,  # 10 second timeout per link
            query="API documentation guide",  # Query for relevance scoring
            verbose=True  # Show detailed progress
        ),
        # Enable intrinsic link scoring
        score_links=True,
        only_text=True
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://docs.python.org/3/", config=config)
        if result.success:
            print(f"\n✅ Successfully crawled: {result.url}")
            internal_links = result.links.get("internal", [])
            # head_data is only present on links that were actually processed
            # (filtered in, within max_links, and extraction succeeded).
            links_with_head = [link for link in internal_links
                               if link.get("head_data") is not None]
            print(f"🧠 Links with head data: {len(links_with_head)}")
            # Show detailed results
            for i, link in enumerate(links_with_head[:3]):
                print(f"\n📄 Link {i+1}: {link['href']}")
                print(f" Text: '{link.get('text', 'No text')[:50]}...'")
                # Show all three score types
                intrinsic = link.get('intrinsic_score')
                contextual = link.get('contextual_score')
                total = link.get('total_score')
                print(f" 📊 Scores:")
                if intrinsic is not None:
                    print(f" • Intrinsic: {intrinsic:.2f}/10.0")
                if contextual is not None:
                    print(f" • Contextual: {contextual:.3f}")
                if total is not None:
                    print(f" • Total: {total:.3f}")
                # Show head data
                head_data = link.get("head_data", {})
                if head_data:
                    title = head_data.get("title", "No title")
                    description = head_data.get("meta", {}).get("description", "")
                    print(f" 📰 Title: {title[:60]}...")
                    if description:
                        print(f" 📝 Description: {description[:80]}...")
        else:
            print(f"❌ Crawl failed: {result.error_message}")
async def research_assistant_example():
    """
    Research Assistant: Find highly relevant documentation pages.

    Crawls the scikit-learn docs with a machine-learning query, keeps links
    whose total_score exceeds 0.6, and prints the top five with a per-link
    quality/relevance breakdown.

    Side effects: network access and console output only; returns None.
    """
    print("\n\n🔍 Research Assistant Example")
    print("=" * 50)
    config = CrawlerRunConfig(
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,
            include_external=True,
            include_patterns=["*/docs/*", "*/tutorial/*", "*/guide/*"],
            exclude_patterns=["*/login*", "*/admin*"],
            query="machine learning neural networks deep learning",
            max_links=15,
            score_threshold=0.4,  # Only include high-relevance links
            concurrency=8,
            verbose=False  # Clean output for this example
        ),
        score_links=True
    )
    # Test with scikit-learn documentation
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://scikit-learn.org/stable/", config=config)
        if result.success:
            print(f"✅ Analyzed: {result.url}")
            all_links = result.links.get("internal", []) + result.links.get("external", [])
            # Filter for high-scoring links
            high_scoring_links = [link for link in all_links
                                  if link.get("total_score", 0) > 0.6]
            # Sort by total score (highest first)
            high_scoring_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
            print(f"\n🎯 Found {len(high_scoring_links)} highly relevant links:")
            print(" (Showing top 5 by relevance score)")
            for i, link in enumerate(high_scoring_links[:5]):
                score = link.get("total_score", 0)
                title = link.get("head_data", {}).get("title", "No title")
                print(f"\n{i+1}. ⭐ {score:.3f} - {title[:70]}...")
                print(f" 🔗 {link['href']}")
                # Show score breakdown
                intrinsic = link.get('intrinsic_score', 0)
                contextual = link.get('contextual_score', 0)
                print(f" 📊 Quality: {intrinsic:.1f}/10 | Relevance: {contextual:.3f}")
        else:
            print(f"❌ Research failed: {result.error_message}")
async def api_discovery_example():
    """
    API Discovery: Find API endpoints and references.

    Crawls https://httpbin.org/, buckets extracted links into
    GET/POST/PUT/DELETE/OTHER by simple substring matching on head titles
    and link text, and prints up to three sample links per bucket.

    Side effects: network access and console output only; returns None.
    """
    print("\n\n🔧 API Discovery Example")
    print("=" * 50)
    config = CrawlerRunConfig(
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,
            include_patterns=["*/api/*", "*/reference/*", "*/endpoint/*"],
            exclude_patterns=["*/deprecated/*", "*/v1/*"],  # Skip old versions
            max_links=25,
            concurrency=10,
            timeout=8,
            verbose=False
        ),
        score_links=True
    )
    # Example with a documentation site that has API references
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://httpbin.org/", config=config)
        if result.success:
            print(f"✅ Discovered APIs at: {result.url}")
            api_links = result.links.get("internal", [])
            # Categorize by detected content
            endpoints = {"GET": [], "POST": [], "PUT": [], "DELETE": [], "OTHER": []}
            for link in api_links:
                if link.get("head_data"):
                    title = link.get("head_data", {}).get("title", "").upper()
                    text = link.get("text", "").upper()
                    # Simple categorization based on content; first match wins,
                    # anything unmatched falls through to OTHER.
                    if "GET" in title or "GET" in text:
                        endpoints["GET"].append(link)
                    elif "POST" in title or "POST" in text:
                        endpoints["POST"].append(link)
                    elif "PUT" in title or "PUT" in text:
                        endpoints["PUT"].append(link)
                    elif "DELETE" in title or "DELETE" in text:
                        endpoints["DELETE"].append(link)
                    else:
                        endpoints["OTHER"].append(link)
            # Display results
            total_found = sum(len(links) for links in endpoints.values())
            print(f"\n📡 Found {total_found} API-related links:")
            for method, links in endpoints.items():
                if links:
                    print(f"\n{method} Endpoints ({len(links)}):")
                    for link in links[:3]:  # Show first 3 of each type
                        title = link.get("head_data", {}).get("title", "No title")
                        score = link.get("intrinsic_score", 0)
                        print(f" • [{score:.1f}] {title[:50]}...")
                        print(f" {link['href']}")
        else:
            print(f"❌ API discovery failed: {result.error_message}")
async def link_quality_analysis():
    """
    Link Quality Analysis: Analyze website structure and link quality.

    Crawls https://docs.python.org/3/, aggregates intrinsic link scores into
    high (>=7.0), medium (4.0-6.9) and low (<4.0) buckets, and prints the
    three best and three worst scoring links.

    Side effects: network access and console output only; returns None.
    """
    print("\n\n📊 Link Quality Analysis Example")
    print("=" * 50)
    config = CrawlerRunConfig(
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,
            max_links=30,  # Analyze more links for better statistics
            concurrency=15,
            timeout=6,
            verbose=False
        ),
        score_links=True
    )
    async with AsyncWebCrawler() as crawler:
        # Test with a content-rich site
        result = await crawler.arun("https://docs.python.org/3/", config=config)
        if result.success:
            print(f"✅ Analyzed: {result.url}")
            links = result.links.get("internal", [])
            # Extract intrinsic scores for analysis (skip unscored links)
            scores = [link.get('intrinsic_score', 0) for link in links if link.get('intrinsic_score') is not None]
            if scores:
                avg_score = sum(scores) / len(scores)
                high_quality = len([s for s in scores if s >= 7.0])
                medium_quality = len([s for s in scores if 4.0 <= s < 7.0])
                low_quality = len([s for s in scores if s < 4.0])
                print(f"\n📈 Quality Analysis Results:")
                print(f" 📊 Average Score: {avg_score:.2f}/10.0")
                print(f" 🟢 High Quality (≥7.0): {high_quality} links")
                print(f" 🟡 Medium Quality (4.0-6.9): {medium_quality} links")
                print(f" 🔴 Low Quality (<4.0): {low_quality} links")
                # Show best and worst links
                scored_links = [(link, link.get('intrinsic_score', 0)) for link in links
                                if link.get('intrinsic_score') is not None]
                scored_links.sort(key=lambda x: x[1], reverse=True)
                print(f"\n🏆 Top 3 Quality Links:")
                for i, (link, score) in enumerate(scored_links[:3]):
                    text = link.get('text', 'No text')[:40]
                    print(f" {i+1}. [{score:.1f}] {text}...")
                    print(f" {link['href']}")
                print(f"\n⚠️ Bottom 3 Quality Links:")
                for i, (link, score) in enumerate(scored_links[-3:]):
                    text = link.get('text', 'No text')[:40]
                    print(f" {i+1}. [{score:.1f}] {text}...")
                    print(f" {link['href']}")
            else:
                print("❌ No scoring data available")
        else:
            print(f"❌ Analysis failed: {result.error_message}")
async def pattern_filtering_example():
    """
    Pattern Filtering: Demonstrate advanced filtering capabilities.

    Runs three include/exclude glob-pattern configurations against
    https://docs.python.org/3/ and prints how many links matched each
    filter, with up to two sample matches per filter.

    Side effects: network access and console output only; returns None.
    """
    print("\n\n🎯 Pattern Filtering Example")
    print("=" * 50)
    # Example with multiple filtering strategies; each entry pairs a display
    # name with the LinkExtractionConfig that implements the strategy.
    filters = [
        {
            "name": "Documentation Only",
            "config": LinkExtractionConfig(
                include_internal=True,
                max_links=10,
                concurrency=5,
                verbose=False,
                include_patterns=["*/docs/*", "*/documentation/*"],
                exclude_patterns=["*/api/*"]
            )
        },
        {
            "name": "API References Only",
            "config": LinkExtractionConfig(
                include_internal=True,
                max_links=10,
                concurrency=5,
                verbose=False,
                include_patterns=["*/api/*", "*/reference/*"],
                exclude_patterns=["*/tutorial/*"]
            )
        },
        {
            "name": "Exclude Admin Areas",
            "config": LinkExtractionConfig(
                include_internal=True,
                max_links=10,
                concurrency=5,
                verbose=False,
                exclude_patterns=["*/admin/*", "*/login/*", "*/dashboard/*"]
            )
        }
    ]
    async with AsyncWebCrawler() as crawler:
        # Reuse one crawler instance across all three filter runs.
        for filter_example in filters:
            print(f"\n🔍 Testing: {filter_example['name']}")
            config = CrawlerRunConfig(
                link_extraction_config=filter_example['config'],
                score_links=True
            )
            result = await crawler.arun("https://docs.python.org/3/", config=config)
            if result.success:
                links = result.links.get("internal", [])
                links_with_head = [link for link in links if link.get("head_data")]
                print(f" 📊 Found {len(links_with_head)} matching links")
                if links_with_head:
                    # Show sample matches
                    for link in links_with_head[:2]:
                        title = link.get("head_data", {}).get("title", "No title")
                        print(f"{title[:50]}...")
                        print(f" {link['href']}")
            else:
                print(f" ❌ Failed: {result.error_message}")
async def main():
    """Run every example in sequence, reporting interrupts and failures."""
    print("🚀 Crawl4AI Link Head Extraction Examples")
    print("=" * 60)
    print("This will demonstrate various link analysis capabilities.\n")
    # Examples run in this fixed order; each prints its own section.
    demos = (
        basic_link_head_extraction,
        research_assistant_example,
        api_discovery_example,
        link_quality_analysis,
        pattern_filtering_example,
    )
    try:
        for demo in demos:
            await demo()
        print("\n" + "=" * 60)
        print("✨ All examples completed successfully!")
        print("\nNext steps:")
        print("1. Try modifying the queries and patterns above")
        print("2. Test with your own websites")
        print("3. Experiment with different score thresholds")
        print("4. Check out the full documentation for more options")
    except KeyboardInterrupt:
        print("\n⏹️ Examples interrupted by user")
    except Exception as e:
        print(f"\n💥 Error running examples: {str(e)}")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -105,7 +105,366 @@ result.links = {
---
## 2. Domain Filtering
## 2. Advanced Link Head Extraction & Scoring
Ever wanted to not just extract links, but also get the actual content (title, description, metadata) from those linked pages? And score them for relevance? This is exactly what Link Head Extraction does - it fetches the `<head>` section from each discovered link and scores them using multiple algorithms.
### 2.1 Why Link Head Extraction?
When you crawl a page, you get hundreds of links. But which ones are actually valuable? Link Head Extraction solves this by:
1. **Fetching head content** from each link (title, description, meta tags)
2. **Scoring links intrinsically** based on URL quality, text relevance, and context
3. **Scoring links contextually** using BM25 algorithm when you provide a search query
4. **Combining scores intelligently** to give you a final relevance ranking
### 2.2 Complete Working Example
Here's a full example you can copy, paste, and run immediately:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.async_configs import LinkExtractionConfig
async def extract_link_heads_example():
"""
Complete example showing link head extraction with scoring.
This will crawl a documentation site and extract head content from internal links.
"""
# Configure link head extraction
config = CrawlerRunConfig(
# Enable link head extraction with detailed configuration
link_extraction_config=LinkExtractionConfig(
include_internal=True, # Extract from internal links
include_external=False, # Skip external links for this example
max_links=10, # Limit to 10 links for demo
concurrency=5, # Process 5 links simultaneously
timeout=10, # 10 second timeout per link
query="API documentation guide", # Query for contextual scoring
score_threshold=0.3, # Only include links scoring above 0.3
verbose=True # Show detailed progress
),
# Enable intrinsic scoring (URL quality, text relevance)
score_links=True,
# Keep output clean
only_text=True,
verbose=True
)
async with AsyncWebCrawler() as crawler:
# Crawl a documentation site (great for testing)
result = await crawler.arun("https://docs.python.org/3/", config=config)
if result.success:
print(f"✅ Successfully crawled: {result.url}")
print(f"📄 Page title: {result.metadata.get('title', 'No title')}")
# Access links (now enhanced with head data and scores)
internal_links = result.links.get("internal", [])
external_links = result.links.get("external", [])
print(f"\n🔗 Found {len(internal_links)} internal links")
print(f"🌍 Found {len(external_links)} external links")
# Count links with head data
links_with_head = [link for link in internal_links
if link.get("head_data") is not None]
print(f"🧠 Links with head data extracted: {len(links_with_head)}")
# Show the top 3 scoring links
print(f"\n🏆 Top 3 Links with Full Scoring:")
for i, link in enumerate(links_with_head[:3]):
print(f"\n{i+1}. {link['href']}")
print(f" Link Text: '{link.get('text', 'No text')[:50]}...'")
# Show all three score types
intrinsic = link.get('intrinsic_score')
contextual = link.get('contextual_score')
total = link.get('total_score')
if intrinsic is not None:
print(f" 📊 Intrinsic Score: {intrinsic:.2f}/10.0 (URL quality & context)")
if contextual is not None:
print(f" 🎯 Contextual Score: {contextual:.3f} (BM25 relevance to query)")
if total is not None:
print(f" ⭐ Total Score: {total:.3f} (combined final score)")
# Show extracted head data
head_data = link.get("head_data", {})
if head_data:
title = head_data.get("title", "No title")
description = head_data.get("meta", {}).get("description", "No description")
print(f" 📰 Title: {title[:60]}...")
if description:
print(f" 📝 Description: {description[:80]}...")
# Show extraction status
status = link.get("head_extraction_status", "unknown")
print(f" ✅ Extraction Status: {status}")
else:
print(f"❌ Crawl failed: {result.error_message}")
# Run the example
if __name__ == "__main__":
asyncio.run(extract_link_heads_example())
```
**Expected Output:**
```
✅ Successfully crawled: https://docs.python.org/3/
📄 Page title: 3.13.5 Documentation
🔗 Found 53 internal links
🌍 Found 1 external links
🧠 Links with head data extracted: 10
🏆 Top 3 Links with Full Scoring:
1. https://docs.python.org/3.15/
Link Text: 'Python 3.15 (in development)...'
📊 Intrinsic Score: 4.17/10.0 (URL quality & context)
🎯 Contextual Score: 1.000 (BM25 relevance to query)
⭐ Total Score: 5.917 (combined final score)
📰 Title: 3.15.0a0 Documentation...
📝 Description: The official Python documentation...
✅ Extraction Status: valid
```
### 2.3 Configuration Deep Dive
The `LinkExtractionConfig` class supports these options:
```python
from crawl4ai.async_configs import LinkExtractionConfig
link_extraction_config = LinkExtractionConfig(
# BASIC SETTINGS
verbose=True, # Show detailed logs (recommended for learning)
# LINK FILTERING
include_internal=True, # Include same-domain links
include_external=True, # Include different-domain links
max_links=50, # Maximum links to process (prevents overload)
# PATTERN FILTERING
include_patterns=[ # Only process links matching these patterns
"*/docs/*",
"*/api/*",
"*/reference/*"
],
exclude_patterns=[ # Skip links matching these patterns
"*/login*",
"*/admin*"
],
# PERFORMANCE SETTINGS
concurrency=10, # How many links to process simultaneously
timeout=5, # Seconds to wait per link
# RELEVANCE SCORING
query="machine learning API", # Query for BM25 contextual scoring
score_threshold=0.3, # Only include links above this score
)
```
### 2.4 Understanding the Three Score Types
Each extracted link gets three different scores:
#### 1. **Intrinsic Score (0-10)** - URL and Content Quality
Based on URL structure, link text quality, and page context:
```python
# High intrinsic score indicators:
# ✅ Clean URL structure (docs.python.org/api/reference)
# ✅ Meaningful link text ("API Reference Guide")
# ✅ Relevant to page context
# ✅ Not buried deep in navigation
# Low intrinsic score indicators:
# ❌ Random URLs (site.com/x7f9g2h)
# ❌ No link text or generic text ("Click here")
# ❌ Unrelated to page content
```
#### 2. **Contextual Score (0-1)** - BM25 Relevance to Query
Only available when you provide a `query`. Uses BM25 algorithm against head content:
```python
# Example: query = "machine learning tutorial"
# High contextual score: Link to "Complete Machine Learning Guide"
# Low contextual score: Link to "Privacy Policy"
```
#### 3. **Total Score** - Smart Combination
Intelligently combines intrinsic and contextual scores with fallbacks:
```python
# When scoring is disabled: every link gets the neutral score 5.0
# When no query (or no contextual score): uses the intrinsic score directly (0-10)
# When both scores available: (intrinsic * 0.7) + (contextual_scaled_to_0_10 * 0.3)
```
### 2.5 Practical Use Cases
#### Use Case 1: Research Assistant
Find the most relevant documentation pages:
```python
async def research_assistant():
config = CrawlerRunConfig(
link_extraction_config=LinkExtractionConfig(
include_internal=True,
include_external=True,
include_patterns=["*/docs/*", "*/tutorial/*", "*/guide/*"],
query="machine learning neural networks",
max_links=20,
score_threshold=0.5, # Only high-relevance links
verbose=True
),
score_links=True
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://scikit-learn.org/", config=config)
if result.success:
# Get high-scoring links
good_links = [link for link in result.links.get("internal", [])
if link.get("total_score", 0) > 0.7]
print(f"🎯 Found {len(good_links)} highly relevant links:")
for link in good_links[:5]:
print(f"{link['total_score']:.3f} - {link['href']}")
print(f" {link.get('head_data', {}).get('title', 'No title')}")
```
#### Use Case 2: Content Discovery
Find all API endpoints and references:
```python
async def api_discovery():
config = CrawlerRunConfig(
link_extraction_config=LinkExtractionConfig(
include_internal=True,
include_patterns=["*/api/*", "*/reference/*"],
exclude_patterns=["*/deprecated/*"],
max_links=100,
concurrency=15,
verbose=False # Clean output
),
score_links=True
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://docs.example-api.com/", config=config)
if result.success:
api_links = result.links.get("internal", [])
# Group by endpoint type
endpoints = {}
for link in api_links:
if link.get("head_data"):
title = link["head_data"].get("title", "")
if "GET" in title:
endpoints.setdefault("GET", []).append(link)
elif "POST" in title:
endpoints.setdefault("POST", []).append(link)
for method, links in endpoints.items():
print(f"\n{method} Endpoints ({len(links)}):")
for link in links[:3]:
print(f"{link['href']}")
```
#### Use Case 3: Link Quality Analysis
Analyze website structure and content quality:
```python
async def quality_analysis():
config = CrawlerRunConfig(
link_extraction_config=LinkExtractionConfig(
include_internal=True,
max_links=200,
concurrency=20,
),
score_links=True
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://your-website.com/", config=config)
if result.success:
links = result.links.get("internal", [])
# Analyze intrinsic scores
scores = [link.get('intrinsic_score', 0) for link in links]
avg_score = sum(scores) / len(scores) if scores else 0
print(f"📊 Link Quality Analysis:")
print(f" Average intrinsic score: {avg_score:.2f}/10.0")
print(f" High quality links (>7.0): {len([s for s in scores if s > 7.0])}")
print(f" Low quality links (<3.0): {len([s for s in scores if s < 3.0])}")
# Find problematic links
bad_links = [link for link in links
if link.get('intrinsic_score', 0) < 2.0]
if bad_links:
print(f"\n⚠️ Links needing attention:")
for link in bad_links[:5]:
print(f" {link['href']} (score: {link.get('intrinsic_score', 0):.1f})")
```
### 2.6 Performance Tips
1. **Start Small**: Begin with `max_links: 10` to understand the feature
2. **Use Patterns**: Filter with `include_patterns` to focus on relevant sections
3. **Adjust Concurrency**: Higher concurrency = faster but more resource usage
4. **Set Timeouts**: Use `timeout: 5` to prevent hanging on slow sites
5. **Use Score Thresholds**: Filter out low-quality links with `score_threshold`
### 2.7 Troubleshooting
**No head data extracted?**
```python
# Check your configuration:
config = CrawlerRunConfig(
link_extraction_config=LinkExtractionConfig(
verbose=True # ← Enable to see what's happening
)
)
```
**Scores showing as None?**
```python
# Make sure scoring is enabled:
config = CrawlerRunConfig(
score_links=True, # ← Enable intrinsic scoring
link_extraction_config=LinkExtractionConfig(
query="your search terms" # ← For contextual scoring
)
)
```
**Process taking too long?**
```python
# Optimize performance:
link_extraction_config = LinkExtractionConfig(
max_links=20, # ← Reduce number
concurrency=10, # ← Increase parallelism
timeout=3, # ← Shorter timeout
include_patterns=["*/important/*"] # ← Focus on key areas
)
```
---
## 3. Domain Filtering
Some websites contain hundreds of third-party or affiliate links. You can filter out certain domains at **crawl time** by configuring the crawler. The most relevant parameters in `CrawlerRunConfig` are:
@@ -114,7 +473,7 @@ Some websites contain hundreds of third-party or affiliate links. You can filter
- **`exclude_social_media_links`**: If `True`, automatically skip known social platforms.
- **`exclude_domains`**: Provide a list of custom domains you want to exclude (e.g., `["spammyads.com", "tracker.net"]`).
### 2.1 Example: Excluding External & Social Media Links
### 3.1 Example: Excluding External & Social Media Links
```python
import asyncio
@@ -143,7 +502,7 @@ if __name__ == "__main__":
asyncio.run(main())
```
### 2.2 Example: Excluding Specific Domains
### 3.2 Example: Excluding Specific Domains
If you want to let external links in, but specifically exclude a domain (e.g., `suspiciousads.com`), do this:
@@ -157,9 +516,9 @@ This approach is handy when you still want external links but need to block cert
---
## 3. Media Extraction
## 4. Media Extraction
### 3.1 Accessing `result.media`
### 4.1 Accessing `result.media`
By default, Crawl4AI collects images, audio, video URLs, and data tables it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`, `tables`).
@@ -237,7 +596,7 @@ Depending on your Crawl4AI version or scraping strategy, these dictionaries can
With these details, you can easily filter out or focus on certain images (for instance, ignoring images with very low scores or a different domain), or gather metadata for analytics.
### 3.2 Excluding External Images
### 4.2 Excluding External Images
If youre dealing with heavy pages or want to skip third-party images (advertisements, for example), you can turn on:

View File

@@ -0,0 +1,262 @@
#!/usr/bin/env python3
"""
Test script for Link Extractor functionality
"""
import asyncio
import os
import sys

# Add the crawl4ai directory to the path BEFORE importing from the package;
# in the original this insert ran after the imports, so it had no effect on them.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'crawl4ai'))

from crawl4ai.models import Link
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.async_configs import LinkExtractionConfig
async def test_link_extractor():
    """Test the link extractor functionality.

    Crawls a couple of live sites with link head extraction and intrinsic
    scoring enabled, then prints per-link score summaries, extracted head
    data, and any failed extractions.

    NOTE(review): this hits real network endpoints, so output depends on
    site availability and current page content.
    """
    print("🔗 Testing Link Extractor Functionality")
    print("=" * 50)
    # Test configuration with link extraction AND scoring enabled
    config = CrawlerRunConfig(
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,
            include_external=False,  # Only internal links for this test
            # No include/exclude patterns for first test - let's see what we get
            query="API documentation reference guide",
            score_threshold=0.3,
            concurrency=5,
            timeout=10,
            max_links=5,  # Just test with 5 links first
            verbose=True  # Show detailed progress
        ),
        score_links=True,  # Enable intrinsic link scoring
        only_text=True,
        verbose=True
    )
    # Test URLs
    test_urls = [
        "https://docs.python.org/3/",  # Python docs - should have many internal links
        "https://httpbin.org/",  # Simple site for testing
    ]
    async with AsyncWebCrawler() as crawler:
        for url in test_urls:
            print(f"\n🌐 Testing URL: {url}")
            print("-" * 40)
            try:
                result = await crawler.arun(url, config=config)
                # Debug: Check if link extraction config is being passed
                print(f"🔍 Debug - Link extraction config: {config.link_extraction_config.to_dict() if config.link_extraction_config else None}")
                print(f"🔍 Debug - Score links: {config.score_links}")
                if result.success:
                    print(f"✅ Crawl successful!")
                    print(
                        f"📄 Page title: {result.metadata.get('title', 'No title')}")
                    # Check links - handle both dict and Links object structure
                    if isinstance(result.links, dict):
                        internal_links = [
                            Link(**link) for link in result.links.get('internal', [])]
                        external_links = [
                            Link(**link) for link in result.links.get('external', [])]
                    else:
                        internal_links = result.links.internal
                        external_links = result.links.external
                    print(f"🔗 Found {len(internal_links)} internal links")
                    print(f"🌍 Found {len(external_links)} external links")
                    # Show links with head data
                    links_with_head = [link for link in internal_links + external_links
                                       if hasattr(link, 'head_data') and link.head_data]
                    print(
                        f"🧠 Links with head data extracted: {len(links_with_head)}")
                    # Show all score types for all links (first 3)
                    all_links = internal_links + external_links
                    if all_links:
                        print(f"\n🔢 Sample link scores (first 3 links):")
                        for i, link in enumerate(all_links[:3]):
                            print(f"\n {i+1}. {link.href}")
                            # Show intrinsic score
                            # inf appears to be the sentinel for "scoring disabled" — see message below
                            if hasattr(link, 'intrinsic_score') and link.intrinsic_score is not None:
                                if link.intrinsic_score == float('inf'):
                                    print(f" Intrinsic Score: ∞ (scoring disabled)")
                                else:
                                    print(f" Intrinsic Score: {link.intrinsic_score:.2f}/10.0")
                            else:
                                print(f" Intrinsic Score: Not available")
                            # Show contextual score (BM25)
                            if hasattr(link, 'contextual_score') and link.contextual_score is not None:
                                print(f" Contextual Score: {link.contextual_score:.3f}")
                            else:
                                print(f" Contextual Score: Not available")
                            # Show total score
                            if hasattr(link, 'total_score') and link.total_score is not None:
                                print(f" Total Score: {link.total_score:.3f}")
                            else:
                                print(f" Total Score: Not available")
                            print(f" Text: '{link.text[:50]}...' " if link.text else " Text: (no text)")
                    if links_with_head:
                        print("\n📊 Sample links with head data:")
                        # Show top 3
                        for i, link in enumerate(links_with_head[:3]):
                            print(f"\n {i+1}. {link.href}")
                            print(
                                f" Status: {link.head_extraction_status}")
                            # Show all three score types
                            print(f" 📊 Scoring Summary:")
                            if hasattr(link, 'intrinsic_score') and link.intrinsic_score is not None:
                                if link.intrinsic_score == float('inf'):
                                    print(f" • Intrinsic Score: ∞ (scoring disabled)")
                                else:
                                    print(f" • Intrinsic Score: {link.intrinsic_score:.2f}/10.0")
                            else:
                                print(f" • Intrinsic Score: Not available")
                            if hasattr(link, 'contextual_score') and link.contextual_score is not None:
                                print(f" • Contextual Score: {link.contextual_score:.3f}")
                            else:
                                print(f" • Contextual Score: Not available")
                            if hasattr(link, 'total_score') and link.total_score is not None:
                                print(f" • Total Score: {link.total_score:.3f}")
                            else:
                                print(f" • Total Score: Not available")
                            if link.head_data:
                                title = link.head_data.get('title', 'No title')
                                if title:
                                    print(f" Title: {title[:60]}...")
                                meta = link.head_data.get('meta', {})
                                if 'description' in meta and meta['description']:
                                    desc = meta['description']
                                    print(f" Description: {desc[:80]}...")
                                # Show link metadata keys (should now be properly formatted)
                                link_data = link.head_data.get('link', {})
                                if link_data:
                                    keys = list(link_data.keys())[:3]
                                    print(f" Link types: {keys}")
                    # Show failed extractions
                    failed_links = [link for link in internal_links + external_links
                                    if hasattr(link, 'head_extraction_status') and
                                    link.head_extraction_status == 'failed']
                    if failed_links:
                        print(
                            f"\n❌ Failed head extractions: {len(failed_links)}")
                        for link in failed_links[:2]:  # Show first 2 failures
                            print(f" - {link.href}")
                            if hasattr(link, 'head_extraction_error') and link.head_extraction_error:
                                print(
                                    f" Error: {link.head_extraction_error}")
                else:
                    print(f"❌ Crawl failed: {result.error_message}")
            except Exception as e:
                # Catch-all so one failing URL doesn't abort the remaining tests
                print(f"💥 Error testing {url}: {str(e)}")
                import traceback
                traceback.print_exc()
def test_config_examples():
    """Show example configurations"""
    print("\n📚 Example Configurations")
    print("=" * 50)
    # Each entry pairs a scenario title with a ready-made configuration object.
    examples = [
        (
            "BM25 Scored Documentation Links",
            LinkExtractionConfig(
                include_internal=True,
                include_external=False,
                include_patterns=["*/docs/*", "*/api/*", "*/reference/*"],
                query="API documentation reference guide",
                score_threshold=0.3,
                max_links=30,
                verbose=True
            ),
        ),
        (
            "Internal Links Only",
            LinkExtractionConfig(
                include_internal=True,
                include_external=False,
                max_links=50,
                verbose=True
            ),
        ),
        (
            "External Links with Patterns",
            LinkExtractionConfig(
                include_internal=False,
                include_external=True,
                include_patterns=["*github.com*", "*stackoverflow.com*"],
                max_links=20,
                concurrency=10
            ),
        ),
        (
            "High-Performance Mode",
            LinkExtractionConfig(
                include_internal=True,
                include_external=False,
                concurrency=20,
                timeout=3,
                max_links=100,
                verbose=False
            ),
        ),
    ]
    for title, cfg in examples:
        print(f"\n📝 {title}:")
        print(" Configuration:")
        settings = cfg.to_dict()
        for key, value in settings.items():
            print(f" {key}: {value}")
        print(" Usage:")
        print(" from crawl4ai.async_configs import LinkExtractionConfig")
        print(" config = CrawlerRunConfig(")
        print(" link_extraction_config=LinkExtractionConfig(")
        for key, value in settings.items():
            # Skip unset options; quote strings so the snippet stays valid Python.
            # (Lists and other values render identically with or without the
            # original's separate list branch, so one format covers both.)
            if value is None:
                continue
            rendered = f"'{value}'" if isinstance(value, str) else value
            print(f" {key}={rendered},")
        print(" )")
        print(" )")
if __name__ == "__main__":
# Show configuration examples first
test_config_examples()
# Run the actual test
print("\n🚀 Running Link Extractor Tests...")
asyncio.run(test_link_extractor())
print("\n✨ Test completed!")