feat: Add advanced link head extraction with three-layer scoring system (#1)
Squashed commit from feature/link-extractor branch implementing comprehensive link analysis: - Extract HTML head content from discovered links with parallel processing - Three-layer scoring: Intrinsic (URL quality), Contextual (BM25), and Total scores - New LinkExtractionConfig class for type-safe configuration - Pattern-based filtering for internal/external links - Comprehensive documentation and examples
This commit is contained in:
@@ -37,6 +37,7 @@ from .content_filter_strategy import (
|
||||
)
|
||||
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
|
||||
from .components.crawler_monitor import CrawlerMonitor
|
||||
from .link_extractor import LinkExtractor
|
||||
from .async_dispatcher import (
|
||||
MemoryAdaptiveDispatcher,
|
||||
SemaphoreDispatcher,
|
||||
@@ -141,6 +142,7 @@ __all__ = [
|
||||
"SemaphoreDispatcher",
|
||||
"RateLimiter",
|
||||
"CrawlerMonitor",
|
||||
"LinkExtractor",
|
||||
"DisplayMode",
|
||||
"MarkdownGenerationResult",
|
||||
"Crawl4aiDockerClient",
|
||||
|
||||
@@ -17,7 +17,7 @@ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from .deep_crawling import DeepCrawlStrategy
|
||||
|
||||
from .cache_context import CacheMode
|
||||
@@ -594,6 +594,101 @@ class BrowserConfig:
|
||||
return config
|
||||
return BrowserConfig.from_kwargs(config)
|
||||
|
||||
class LinkExtractionConfig:
    """Configuration for link head extraction and scoring."""

    def __init__(
        self,
        include_internal: bool = True,
        include_external: bool = False,
        include_patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
        concurrency: int = 10,
        timeout: int = 5,
        max_links: int = 100,
        query: Optional[str] = None,
        score_threshold: Optional[float] = None,
        verbose: bool = False
    ):
        """
        Initialize link extraction configuration.

        Args:
            include_internal: Whether to include same-domain links
            include_external: Whether to include different-domain links
            include_patterns: List of glob patterns to include (e.g., ["*/docs/*", "*/api/*"])
            exclude_patterns: List of glob patterns to exclude (e.g., ["*/login*", "*/admin*"])
            concurrency: Number of links to process simultaneously
            timeout: Timeout in seconds for each link's head extraction
            max_links: Maximum number of links to process (prevents overload)
            query: Query string for BM25 contextual scoring (optional)
            score_threshold: Minimum relevance score to include links (0.0-1.0, optional)
            verbose: Show detailed progress during extraction

        Raises:
            ValueError: If concurrency, timeout, or max_links is not positive,
                if score_threshold is outside [0.0, 1.0], or if both
                include_internal and include_external are False.
        """
        # Validate BEFORE assigning so an invalid config never leaves a
        # half-initialized object behind (fail fast).
        if concurrency <= 0:
            raise ValueError("concurrency must be positive")
        if timeout <= 0:
            raise ValueError("timeout must be positive")
        if max_links <= 0:
            raise ValueError("max_links must be positive")
        if score_threshold is not None and not (0.0 <= score_threshold <= 1.0):
            raise ValueError("score_threshold must be between 0.0 and 1.0")
        if not include_internal and not include_external:
            raise ValueError("At least one of include_internal or include_external must be True")

        self.include_internal = include_internal
        self.include_external = include_external
        self.include_patterns = include_patterns
        self.exclude_patterns = exclude_patterns
        self.concurrency = concurrency
        self.timeout = timeout
        self.max_links = max_links
        self.query = query
        self.score_threshold = score_threshold
        self.verbose = verbose

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> Optional["LinkExtractionConfig"]:
        """Create LinkExtractionConfig from dictionary (for backward compatibility).

        Returns None when config_dict is empty or None, mirroring the
        "no link extraction configured" case used by callers.
        """
        if not config_dict:
            return None

        return LinkExtractionConfig(
            include_internal=config_dict.get("include_internal", True),
            include_external=config_dict.get("include_external", False),
            include_patterns=config_dict.get("include_patterns"),
            exclude_patterns=config_dict.get("exclude_patterns"),
            concurrency=config_dict.get("concurrency", 10),
            timeout=config_dict.get("timeout", 5),
            max_links=config_dict.get("max_links", 100),
            query=config_dict.get("query"),
            score_threshold=config_dict.get("score_threshold"),
            verbose=config_dict.get("verbose", False)
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary format (round-trips through from_dict)."""
        return {
            "include_internal": self.include_internal,
            "include_external": self.include_external,
            "include_patterns": self.include_patterns,
            "exclude_patterns": self.exclude_patterns,
            "concurrency": self.concurrency,
            "timeout": self.timeout,
            "max_links": self.max_links,
            "query": self.query,
            "score_threshold": self.score_threshold,
            "verbose": self.verbose
        }

    def clone(self, **kwargs) -> "LinkExtractionConfig":
        """Create a copy with updated values.

        Any keyword arguments override the corresponding current values;
        the result is validated like a freshly constructed config.
        """
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        return LinkExtractionConfig.from_dict(config_dict)
|
||||
|
||||
|
||||
class HTTPCrawlerConfig:
|
||||
"""HTTP-specific crawler configuration"""
|
||||
|
||||
@@ -829,6 +924,9 @@ class CrawlerRunConfig():
|
||||
Default: [].
|
||||
exclude_internal_links (bool): If True, exclude internal links from the results.
|
||||
Default: False.
|
||||
score_links (bool): If True, calculate intrinsic quality scores for all links using URL structure,
|
||||
text quality, and contextual relevance metrics. Separate from link_extraction_config.
|
||||
Default: False.
|
||||
|
||||
# Debugging and Logging Parameters
|
||||
verbose (bool): Enable verbose logging.
|
||||
@@ -939,6 +1037,7 @@ class CrawlerRunConfig():
|
||||
exclude_social_media_links: bool = False,
|
||||
exclude_domains: list = None,
|
||||
exclude_internal_links: bool = False,
|
||||
score_links: bool = False,
|
||||
# Debugging and Logging Parameters
|
||||
verbose: bool = True,
|
||||
log_console: bool = False,
|
||||
@@ -955,6 +1054,8 @@ class CrawlerRunConfig():
|
||||
user_agent_generator_config: dict = {},
|
||||
# Deep Crawl Parameters
|
||||
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
|
||||
# Link Extraction Parameters
|
||||
link_extraction_config: Union[LinkExtractionConfig, Dict[str, Any]] = None,
|
||||
# Experimental Parameters
|
||||
experimental: Dict[str, Any] = None,
|
||||
):
|
||||
@@ -976,7 +1077,7 @@ class CrawlerRunConfig():
|
||||
self.remove_forms = remove_forms
|
||||
self.prettiify = prettiify
|
||||
self.parser_type = parser_type
|
||||
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
|
||||
self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy()
|
||||
self.proxy_config = proxy_config
|
||||
self.proxy_rotation_strategy = proxy_rotation_strategy
|
||||
|
||||
@@ -1042,6 +1143,7 @@ class CrawlerRunConfig():
|
||||
self.exclude_social_media_links = exclude_social_media_links
|
||||
self.exclude_domains = exclude_domains or []
|
||||
self.exclude_internal_links = exclude_internal_links
|
||||
self.score_links = score_links
|
||||
|
||||
# Debugging and Logging Parameters
|
||||
self.verbose = verbose
|
||||
@@ -1084,6 +1186,17 @@ class CrawlerRunConfig():
|
||||
# Deep Crawl Parameters
|
||||
self.deep_crawl_strategy = deep_crawl_strategy
|
||||
|
||||
# Link Extraction Parameters
|
||||
if link_extraction_config is None:
|
||||
self.link_extraction_config = None
|
||||
elif isinstance(link_extraction_config, LinkExtractionConfig):
|
||||
self.link_extraction_config = link_extraction_config
|
||||
elif isinstance(link_extraction_config, dict):
|
||||
# Convert dict to config object for backward compatibility
|
||||
self.link_extraction_config = LinkExtractionConfig.from_dict(link_extraction_config)
|
||||
else:
|
||||
raise ValueError("link_extraction_config must be LinkExtractionConfig object or dict")
|
||||
|
||||
# Experimental Parameters
|
||||
self.experimental = experimental or {}
|
||||
|
||||
@@ -1241,6 +1354,7 @@ class CrawlerRunConfig():
|
||||
exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
|
||||
exclude_domains=kwargs.get("exclude_domains", []),
|
||||
exclude_internal_links=kwargs.get("exclude_internal_links", False),
|
||||
score_links=kwargs.get("score_links", False),
|
||||
# Debugging and Logging Parameters
|
||||
verbose=kwargs.get("verbose", True),
|
||||
log_console=kwargs.get("log_console", False),
|
||||
@@ -1256,6 +1370,8 @@ class CrawlerRunConfig():
|
||||
user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
|
||||
# Deep Crawl Parameters
|
||||
deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
|
||||
# Link Extraction Parameters
|
||||
link_extraction_config=kwargs.get("link_extraction_config"),
|
||||
url=kwargs.get("url"),
|
||||
# Experimental Parameters
|
||||
experimental=kwargs.get("experimental"),
|
||||
@@ -1339,6 +1455,7 @@ class CrawlerRunConfig():
|
||||
"exclude_social_media_links": self.exclude_social_media_links,
|
||||
"exclude_domains": self.exclude_domains,
|
||||
"exclude_internal_links": self.exclude_internal_links,
|
||||
"score_links": self.score_links,
|
||||
"verbose": self.verbose,
|
||||
"log_console": self.log_console,
|
||||
"capture_network_requests": self.capture_network_requests,
|
||||
@@ -1350,6 +1467,7 @@ class CrawlerRunConfig():
|
||||
"user_agent_mode": self.user_agent_mode,
|
||||
"user_agent_generator_config": self.user_agent_generator_config,
|
||||
"deep_crawl_strategy": self.deep_crawl_strategy,
|
||||
"link_extraction_config": self.link_extraction_config.to_dict() if self.link_extraction_config else None,
|
||||
"url": self.url,
|
||||
"experimental": self.experimental,
|
||||
}
|
||||
|
||||
@@ -109,12 +109,16 @@ def _parse_head(src: str) -> Dict[str, Any]:
|
||||
elif "charset" in el.attrib:
|
||||
info["charset"] = el.attrib["charset"].lower()
|
||||
for el in doc.xpath(".//link"):
|
||||
rel = " ".join(el.attrib.get("rel", [])).lower()
|
||||
if not rel:
|
||||
rel_attr = el.attrib.get("rel", "")
|
||||
if not rel_attr:
|
||||
continue
|
||||
# Handle multiple space-separated rel values
|
||||
rel_values = rel_attr.lower().split()
|
||||
entry = {a: el.attrib[a] for a in (
|
||||
"href", "as", "type", "hreflang") if a in el.attrib}
|
||||
info["link"].setdefault(rel, []).append(entry)
|
||||
# Add entry for each rel value
|
||||
for rel in rel_values:
|
||||
info["link"].setdefault(rel, []).append(entry)
|
||||
# Extract JSON-LD structured data
|
||||
for script in doc.xpath('.//script[@type="application/ld+json"]'):
|
||||
if script.text:
|
||||
@@ -467,6 +471,200 @@ class AsyncUrlSeeder:
|
||||
"info", "Finished URL seeding for multiple domains.", tag="URL_SEED")
|
||||
return final_results
|
||||
|
||||
async def extract_head_for_urls(
    self,
    urls: List[str],
    config: Optional["SeedingConfig"] = None,
    concurrency: int = 10,
    timeout: int = 5
) -> List[Dict[str, Any]]:
    """
    Extract head content for a custom list of URLs using URLSeeder's parallel processing.

    This method reuses URLSeeder's efficient parallel processing, caching, and head extraction
    logic to process a custom list of URLs rather than discovering URLs from sources.
    Duplicate URLs in the input are processed only once. URLs that fail are
    recorded with status "failed" rather than raising.

    Parameters
    ----------
    urls : List[str]
        List of URLs to extract head content from
    config : SeedingConfig, optional
        Configuration object. If None, uses default settings for head extraction.
        Note: `concurrency` and `extract_head` on a supplied config are
        overridden by this method's arguments/behavior.
    concurrency : int, default=10
        Number of concurrent requests
    timeout : int, default=5
        Timeout for each request in seconds

    Returns
    -------
    List[Dict[str, Any]]
        List of dictionaries containing url, status, head_data, and optional relevance_score
    """
    # Create default config if none provided
    if config is None:
        # Import here to avoid circular imports
        from .async_configs import SeedingConfig
        config = SeedingConfig(
            extract_head=True,
            concurrency=concurrency,
            verbose=False
        )

    # Override concurrency and ensure head extraction is enabled
    # (applies even when the caller supplied a config object).
    config.concurrency = concurrency
    config.extract_head = True

    self._log("info", "Starting head extraction for {count} custom URLs",
              params={"count": len(urls)}, tag="URL_SEED")

    # Setup rate limiting if specified in config.
    # A non-positive hits_per_sec (e.g. a negative value) is rejected with a
    # warning; zero is falsy and simply disables rate limiting.
    if config.hits_per_sec:
        if config.hits_per_sec <= 0:
            self._log("warning", "hits_per_sec must be positive. Disabling rate limiting.", tag="URL_SEED")
            self._rate_sem = None
        else:
            self._rate_sem = asyncio.Semaphore(config.hits_per_sec)
    else:
        self._rate_sem = None

    # Use bounded queue to prevent memory issues with large URL lists.
    # Capacity scales with concurrency but is clamped to [1000, 10000].
    queue_size = min(10000, max(1000, concurrency * 100))
    queue = asyncio.Queue(maxsize=queue_size)
    producer_done = asyncio.Event()
    stop_event = asyncio.Event()
    seen: set[str] = set()

    # Results collection (workers append into this shared list)
    results: List[Dict[str, Any]] = []

    async def producer():
        """Producer to feed URLs into the queue."""
        try:
            for url in urls:
                # Skip duplicates before checking stop_event so the seen-set
                # stays authoritative for the whole input list.
                if url in seen:
                    self._log("debug", "Skipping duplicate URL: {url}",
                              params={"url": url}, tag="URL_SEED")
                    continue
                if stop_event.is_set():
                    break
                seen.add(url)
                await queue.put(url)
        finally:
            # Always signal completion so workers can drain and exit.
            producer_done.set()

    async def worker(res_list: List[Dict[str, Any]]):
        """Worker to process URLs from the queue."""
        while True:
            try:
                # Wait for URL or producer completion.
                # The 1-second poll lets the worker notice that the producer
                # finished and the queue drained, and exit cleanly.
                url = await asyncio.wait_for(queue.get(), timeout=1.0)
            except asyncio.TimeoutError:
                if producer_done.is_set() and queue.empty():
                    break
                continue

            try:
                # Use existing _validate method which handles head extraction, caching, etc.
                await self._validate(
                    url, res_list,
                    live=False,  # We're not doing live checks, just head extraction
                    extract=True,  # Always extract head content
                    timeout=timeout,
                    verbose=config.verbose or False,
                    query=config.query,
                    score_threshold=config.score_threshold,
                    scoring_method=config.scoring_method or "bm25",
                    filter_nonsense=config.filter_nonsense_urls
                )
            except Exception as e:
                self._log("error", "Failed to process URL {url}: {error}",
                          params={"url": url, "error": str(e)}, tag="URL_SEED")
                # Add failed entry to results so the caller sees every URL.
                res_list.append({
                    "url": url,
                    "status": "failed",
                    "head_data": {},
                    "error": str(e)
                })
            finally:
                # Mark the item done even on failure so queue.join() can return.
                queue.task_done()

    # Start producer
    producer_task = asyncio.create_task(producer())

    # Start workers
    worker_tasks = []
    for _ in range(concurrency):
        worker_task = asyncio.create_task(worker(results))
        worker_tasks.append(worker_task)

    # Wait for producer to finish
    await producer_task

    # Wait for all items to be processed
    await queue.join()

    # Cancel workers (they may be parked in the 1s wait_for poll)
    for task in worker_tasks:
        task.cancel()

    # Wait for workers to finish canceling
    await asyncio.gather(*worker_tasks, return_exceptions=True)

    # Apply BM25 scoring if query is provided
    if config.query and config.scoring_method == "bm25":
        results = await self._apply_bm25_scoring(results, config)

    # Apply score threshold filtering (entries without a score count as 0)
    if config.score_threshold is not None:
        results = [r for r in results if r.get("relevance_score", 0) >= config.score_threshold]

    # Sort by relevance score if available (highest first)
    if any("relevance_score" in r for r in results):
        results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)

    self._log("info", "Completed head extraction for {count} URLs, {success} successful",
              params={
                  "count": len(urls),
                  "success": len([r for r in results if r.get("status") == "valid"])
              }, tag="URL_SEED")

    return results
|
||||
|
||||
async def _apply_bm25_scoring(self, results: List[Dict[str, Any]], config: "SeedingConfig") -> List[Dict[str, Any]]:
    """Apply BM25 scoring to results that have head_data.

    Valid results with usable head text are scored in one BM25 batch
    (off the event loop via a worker thread); valid results without
    usable head text fall back to URL-based relevance scoring.
    Non-valid results are left untouched. Mutates entries in place and
    returns the same list.
    """
    if not HAS_BM25:
        self._log("warning", "BM25 scoring requested but rank_bm25 not available", tag="URL_SEED")
        return results

    corpus: List[str] = []          # text contexts to score with BM25
    scored_entries: List[Dict[str, Any]] = []  # entries aligned with corpus

    for entry in results:
        if entry.get("status") != "valid":
            continue
        context = None
        if entry.get("head_data"):
            context = self._extract_text_context(entry["head_data"])
        if context:
            corpus.append(context)
            scored_entries.append(entry)
        else:
            # No usable head text: score from the URL string instead.
            fallback = self._calculate_url_relevance_score(config.query, entry["url"])
            entry["relevance_score"] = float(fallback)

    # One batched BM25 pass over every entry that contributed text.
    if corpus and scored_entries:
        bm25_scores = await asyncio.to_thread(self._calculate_bm25_score, config.query, corpus)
        for entry, value in zip(scored_entries, bm25_scores):
            entry["relevance_score"] = float(value)

    return results
|
||||
|
||||
async def _resolve_head(self, url: str) -> Optional[str]:
|
||||
"""
|
||||
HEAD-probe a URL.
|
||||
|
||||
@@ -23,6 +23,8 @@ from .utils import (
|
||||
is_external_url,
|
||||
get_base_domain,
|
||||
extract_metadata_using_lxml,
|
||||
extract_page_context,
|
||||
calculate_link_intrinsic_score,
|
||||
)
|
||||
from lxml import etree
|
||||
from lxml import html as lhtml
|
||||
@@ -944,6 +946,72 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
# Update the links dictionary with unique links
|
||||
links["internal"] = list(internal_links_dict.values())
|
||||
links["external"] = list(external_links_dict.values())
|
||||
|
||||
# Extract head content for links if configured
|
||||
link_extraction_config = kwargs.get("link_extraction_config")
|
||||
if link_extraction_config is not None:
|
||||
try:
|
||||
import asyncio
|
||||
from .link_extractor import LinkExtractor
|
||||
from .models import Links, Link
|
||||
|
||||
verbose = link_extraction_config.verbose
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
||||
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
|
||||
|
||||
# Convert dict links to Link objects
|
||||
internal_links = [Link(**link_data) for link_data in links["internal"]]
|
||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
||||
links_obj = Links(internal=internal_links, external=external_links)
|
||||
|
||||
# Create a config object for LinkExtractor
|
||||
class TempCrawlerRunConfig:
|
||||
def __init__(self, link_config, score_links):
|
||||
self.link_extraction_config = link_config
|
||||
self.score_links = score_links
|
||||
|
||||
config = TempCrawlerRunConfig(link_extraction_config, kwargs.get("score_links", False))
|
||||
|
||||
# Extract head content (run async operation in sync context)
|
||||
async def extract_links():
|
||||
async with LinkExtractor(self.logger) as extractor:
|
||||
return await extractor.extract_link_heads(links_obj, config)
|
||||
|
||||
# Run the async operation
|
||||
try:
|
||||
# Check if we're already in an async context
|
||||
loop = asyncio.get_running_loop()
|
||||
# If we're in an async context, we need to run in a thread
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future = executor.submit(asyncio.run, extract_links())
|
||||
updated_links = future.result()
|
||||
except RuntimeError:
|
||||
# No running loop, we can use asyncio.run directly
|
||||
updated_links = asyncio.run(extract_links())
|
||||
|
||||
# Convert back to dict format
|
||||
links["internal"] = [link.dict() for link in updated_links.internal]
|
||||
links["external"] = [link.dict() for link in updated_links.external]
|
||||
|
||||
if verbose:
|
||||
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
|
||||
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
|
||||
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
|
||||
params={
|
||||
"internal_success": successful_internal,
|
||||
"internal_total": len(updated_links.internal),
|
||||
"external_success": successful_external,
|
||||
"external_total": len(updated_links.external)
|
||||
}, tag="LINK_EXTRACT")
|
||||
else:
|
||||
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
|
||||
|
||||
except Exception as e:
|
||||
self._log("error", f"Link head extraction failed: {str(e)}", tag="LINK_EXTRACT")
|
||||
# Continue with original links if extraction fails
|
||||
|
||||
# # Process images using ThreadPoolExecutor
|
||||
imgs = body.find_all("img")
|
||||
@@ -1037,6 +1105,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
media: Dict[str, List],
|
||||
internal_links_dict: Dict[str, Any],
|
||||
external_links_dict: Dict[str, Any],
|
||||
page_context: dict = None,
|
||||
**kwargs,
|
||||
) -> bool:
|
||||
base_domain = kwargs.get("base_domain", get_base_domain(url))
|
||||
@@ -1056,6 +1125,25 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
"title": link.get("title", "").strip(),
|
||||
"base_domain": base_domain,
|
||||
}
|
||||
|
||||
# Add intrinsic scoring if enabled
|
||||
if kwargs.get("score_links", False) and page_context is not None:
|
||||
try:
|
||||
intrinsic_score = calculate_link_intrinsic_score(
|
||||
link_text=link_data["text"],
|
||||
url=normalized_href,
|
||||
title_attr=link_data["title"],
|
||||
class_attr=link.get("class", ""),
|
||||
rel_attr=link.get("rel", ""),
|
||||
page_context=page_context
|
||||
)
|
||||
link_data["intrinsic_score"] = intrinsic_score
|
||||
except Exception:
|
||||
# Fail gracefully - assign default score
|
||||
link_data["intrinsic_score"] = float('inf')
|
||||
else:
|
||||
# No scoring enabled - assign infinity (all links equal priority)
|
||||
link_data["intrinsic_score"] = float('inf')
|
||||
|
||||
is_external = is_external_url(normalized_href, base_domain)
|
||||
if is_external:
|
||||
@@ -1491,6 +1579,33 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
|
||||
base_domain = get_base_domain(url)
|
||||
|
||||
# Extract page context for link scoring (if enabled) - do this BEFORE any removals
|
||||
page_context = None
|
||||
if kwargs.get("score_links", False):
|
||||
try:
|
||||
# Extract title
|
||||
title_elements = doc.xpath('//title')
|
||||
page_title = title_elements[0].text_content() if title_elements else ""
|
||||
|
||||
# Extract headlines
|
||||
headlines = []
|
||||
for tag in ['h1', 'h2', 'h3']:
|
||||
elements = doc.xpath(f'//{tag}')
|
||||
for el in elements:
|
||||
text = el.text_content().strip()
|
||||
if text:
|
||||
headlines.append(text)
|
||||
headlines_text = ' '.join(headlines)
|
||||
|
||||
# Extract meta description
|
||||
meta_desc_elements = doc.xpath('//meta[@name="description"]/@content')
|
||||
meta_description = meta_desc_elements[0] if meta_desc_elements else ""
|
||||
|
||||
# Create page context
|
||||
page_context = extract_page_context(page_title, headlines_text, meta_description, url)
|
||||
except Exception:
|
||||
page_context = {} # Fail gracefully
|
||||
|
||||
# Early removal of all images if exclude_all_images is set
|
||||
# This is more efficient in lxml as we remove elements before any processing
|
||||
if kwargs.get("exclude_all_images", False):
|
||||
@@ -1579,6 +1694,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
media,
|
||||
internal_links_dict,
|
||||
external_links_dict,
|
||||
page_context=page_context,
|
||||
base_domain=base_domain,
|
||||
**kwargs,
|
||||
)
|
||||
@@ -1623,14 +1739,84 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
method="html",
|
||||
with_tail=False,
|
||||
).strip()
|
||||
|
||||
# Create links dictionary in the format expected by LinkExtractor
|
||||
links = {
|
||||
"internal": list(internal_links_dict.values()),
|
||||
"external": list(external_links_dict.values()),
|
||||
}
|
||||
|
||||
# Extract head content for links if configured
|
||||
link_extraction_config = kwargs.get("link_extraction_config")
|
||||
if link_extraction_config is not None:
|
||||
try:
|
||||
import asyncio
|
||||
from .link_extractor import LinkExtractor
|
||||
from .models import Links, Link
|
||||
|
||||
verbose = link_extraction_config.verbose
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
||||
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
|
||||
|
||||
# Convert dict links to Link objects
|
||||
internal_links = [Link(**link_data) for link_data in links["internal"]]
|
||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
||||
links_obj = Links(internal=internal_links, external=external_links)
|
||||
|
||||
# Create a config object for LinkExtractor
|
||||
class TempCrawlerRunConfig:
|
||||
def __init__(self, link_config, score_links):
|
||||
self.link_extraction_config = link_config
|
||||
self.score_links = score_links
|
||||
|
||||
config = TempCrawlerRunConfig(link_extraction_config, kwargs.get("score_links", False))
|
||||
|
||||
# Extract head content (run async operation in sync context)
|
||||
async def extract_links():
|
||||
async with LinkExtractor(self.logger) as extractor:
|
||||
return await extractor.extract_link_heads(links_obj, config)
|
||||
|
||||
# Run the async operation
|
||||
try:
|
||||
# Check if we're already in an async context
|
||||
loop = asyncio.get_running_loop()
|
||||
# If we're in an async context, we need to run in a thread
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future = executor.submit(asyncio.run, extract_links())
|
||||
updated_links = future.result()
|
||||
except RuntimeError:
|
||||
# No running loop, we can use asyncio.run directly
|
||||
updated_links = asyncio.run(extract_links())
|
||||
|
||||
# Convert back to dict format
|
||||
links["internal"] = [link.dict() for link in updated_links.internal]
|
||||
links["external"] = [link.dict() for link in updated_links.external]
|
||||
|
||||
if verbose:
|
||||
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
|
||||
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
|
||||
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
|
||||
params={
|
||||
"internal_success": successful_internal,
|
||||
"internal_total": len(updated_links.internal),
|
||||
"external_success": successful_external,
|
||||
"external_total": len(updated_links.external)
|
||||
}, tag="LINK_EXTRACT")
|
||||
else:
|
||||
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
|
||||
|
||||
except Exception as e:
|
||||
self._log("error", f"Error during link head extraction: {str(e)}", tag="LINK_EXTRACT")
|
||||
# Continue with original links if head extraction fails
|
||||
|
||||
return {
|
||||
"cleaned_html": cleaned_html,
|
||||
"success": success,
|
||||
"media": media,
|
||||
"links": {
|
||||
"internal": list(internal_links_dict.values()),
|
||||
"external": list(external_links_dict.values()),
|
||||
},
|
||||
"links": links,
|
||||
"metadata": meta,
|
||||
}
|
||||
|
||||
|
||||
395
crawl4ai/link_extractor.py
Normal file
395
crawl4ai/link_extractor.py
Normal file
@@ -0,0 +1,395 @@
|
||||
"""
|
||||
Link Extractor for Crawl4AI
|
||||
|
||||
Extracts head content from links discovered during crawling using URLSeeder's
|
||||
efficient parallel processing and caching infrastructure.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import fnmatch
|
||||
from typing import Dict, List, Optional, Any
|
||||
from .async_logger import AsyncLogger
|
||||
from .async_url_seeder import AsyncUrlSeeder
|
||||
from .async_configs import SeedingConfig, CrawlerRunConfig
|
||||
from .models import Links, Link
|
||||
from .utils import calculate_total_score
|
||||
|
||||
|
||||
class LinkExtractor:
|
||||
"""
|
||||
Extracts head content from links using URLSeeder's parallel processing infrastructure.
|
||||
|
||||
This class provides intelligent link filtering and head content extraction with:
|
||||
- Pattern-based inclusion/exclusion filtering
|
||||
- Parallel processing with configurable concurrency
|
||||
- Caching for performance
|
||||
- BM25 relevance scoring
|
||||
- Memory-safe processing for large link sets
|
||||
"""
|
||||
|
||||
def __init__(self, logger: Optional[AsyncLogger] = None):
    """Create a LinkExtractor.

    Args:
        logger: Optional logger instance for recording events.
    """
    self.logger = logger
    # The underlying URL seeder is created lazily in start(); _owns_seeder
    # records whether this instance is responsible for tearing it down.
    self.seeder: Optional[AsyncUrlSeeder] = None
    self._owns_seeder = False
|
||||
|
||||
async def __aenter__(self):
    """Enter the async context: spin up the underlying URL seeder."""
    await self.start()
    return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Exit the async context: release the seeder and its resources."""
    await self.close()
|
||||
|
||||
async def start(self):
    """Initialize the URLSeeder instance; a no-op when already started."""
    if self.seeder:
        return
    self.seeder = AsyncUrlSeeder(logger=self.logger)
    await self.seeder.__aenter__()
    # We created it, so close() is responsible for tearing it down.
    self._owns_seeder = True
|
||||
|
||||
async def close(self):
    """Release the seeder, but only if this instance created it."""
    if not (self.seeder and self._owns_seeder):
        return
    await self.seeder.__aexit__(None, None, None)
    self.seeder = None
    self._owns_seeder = False
|
||||
|
||||
def _log(self, level: str, message: str, tag: str = "LINK_EXTRACT", **kwargs):
    """Forward a message to the configured logger, if any.

    Does nothing when no logger is set or when the logger has no method
    matching the requested level.
    """
    if not self.logger:
        return
    emit = getattr(self.logger, level, None)
    if emit:
        emit(message=message, tag=tag, params=kwargs.get('params', {}))
|
||||
|
||||
async def extract_link_heads(
|
||||
self,
|
||||
links: Links,
|
||||
config: CrawlerRunConfig
|
||||
) -> Links:
|
||||
"""
|
||||
Extract head content for filtered links and attach to Link objects.
|
||||
|
||||
Args:
|
||||
links: Links object containing internal and external links
|
||||
config: CrawlerRunConfig with link_extraction_config settings
|
||||
|
||||
Returns:
|
||||
Links object with head_data attached to filtered Link objects
|
||||
"""
|
||||
link_config = config.link_extraction_config
|
||||
|
||||
# Ensure seeder is initialized
|
||||
await self.start()
|
||||
|
||||
# Filter links based on configuration
|
||||
filtered_urls = self._filter_links(links, link_config)
|
||||
|
||||
if not filtered_urls:
|
||||
self._log("info", "No links matched filtering criteria")
|
||||
return links
|
||||
|
||||
self._log("info", "Extracting head content for {count} filtered links",
|
||||
params={"count": len(filtered_urls)})
|
||||
|
||||
# Extract head content using URLSeeder
|
||||
head_results = await self._extract_heads_parallel(filtered_urls, link_config)
|
||||
|
||||
# Merge results back into Link objects
|
||||
updated_links = self._merge_head_data(links, head_results, config)
|
||||
|
||||
self._log("info", "Completed head extraction for links, {success} successful",
|
||||
params={"success": len([r for r in head_results if r.get("status") == "valid"])})
|
||||
|
||||
return updated_links
|
||||
|
||||
def _filter_links(self, links: Links, link_config: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Filter links based on configuration parameters.
|
||||
|
||||
Args:
|
||||
links: Links object containing internal and external links
|
||||
link_config: Configuration dictionary for link extraction
|
||||
|
||||
Returns:
|
||||
List of filtered URL strings
|
||||
"""
|
||||
filtered_urls = []
|
||||
|
||||
# Include internal links if configured
|
||||
if link_config.include_internal:
|
||||
filtered_urls.extend([link.href for link in links.internal if link.href])
|
||||
self._log("debug", "Added {count} internal links",
|
||||
params={"count": len(links.internal)})
|
||||
|
||||
# Include external links if configured
|
||||
if link_config.include_external:
|
||||
filtered_urls.extend([link.href for link in links.external if link.href])
|
||||
self._log("debug", "Added {count} external links",
|
||||
params={"count": len(links.external)})
|
||||
|
||||
# Apply include patterns
|
||||
include_patterns = link_config.include_patterns
|
||||
if include_patterns:
|
||||
filtered_urls = [
|
||||
url for url in filtered_urls
|
||||
if any(fnmatch.fnmatch(url, pattern) for pattern in include_patterns)
|
||||
]
|
||||
self._log("debug", "After include patterns: {count} links remain",
|
||||
params={"count": len(filtered_urls)})
|
||||
|
||||
# Apply exclude patterns
|
||||
exclude_patterns = link_config.exclude_patterns
|
||||
if exclude_patterns:
|
||||
filtered_urls = [
|
||||
url for url in filtered_urls
|
||||
if not any(fnmatch.fnmatch(url, pattern) for pattern in exclude_patterns)
|
||||
]
|
||||
self._log("debug", "After exclude patterns: {count} links remain",
|
||||
params={"count": len(filtered_urls)})
|
||||
|
||||
# Limit number of links
|
||||
max_links = link_config.max_links
|
||||
if max_links > 0 and len(filtered_urls) > max_links:
|
||||
filtered_urls = filtered_urls[:max_links]
|
||||
self._log("debug", "Limited to {max_links} links",
|
||||
params={"max_links": max_links})
|
||||
|
||||
# Remove duplicates while preserving order
|
||||
seen = set()
|
||||
unique_urls = []
|
||||
for url in filtered_urls:
|
||||
if url not in seen:
|
||||
seen.add(url)
|
||||
unique_urls.append(url)
|
||||
|
||||
self._log("debug", "Final filtered URLs: {count} unique links",
|
||||
params={"count": len(unique_urls)})
|
||||
|
||||
return unique_urls
|
||||
|
||||
async def _extract_heads_parallel(
|
||||
self,
|
||||
urls: List[str],
|
||||
link_config: Dict[str, Any]
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract head content for URLs using URLSeeder's parallel processing.
|
||||
|
||||
Args:
|
||||
urls: List of URLs to process
|
||||
link_config: Configuration dictionary for link extraction
|
||||
|
||||
Returns:
|
||||
List of dictionaries with url, status, head_data, and optional relevance_score
|
||||
"""
|
||||
verbose = link_config.verbose
|
||||
concurrency = link_config.concurrency
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting batch processing: {total} links with {concurrency} concurrent workers",
|
||||
params={"total": len(urls), "concurrency": concurrency})
|
||||
|
||||
# Create SeedingConfig for URLSeeder
|
||||
seeding_config = SeedingConfig(
|
||||
extract_head=True,
|
||||
concurrency=concurrency,
|
||||
hits_per_sec=getattr(link_config, 'hits_per_sec', None),
|
||||
query=link_config.query,
|
||||
score_threshold=link_config.score_threshold,
|
||||
scoring_method="bm25" if link_config.query else None,
|
||||
verbose=verbose
|
||||
)
|
||||
|
||||
# Use URLSeeder's extract_head_for_urls method with progress tracking
|
||||
if verbose:
|
||||
# Create a wrapper to track progress
|
||||
results = await self._extract_with_progress(urls, seeding_config, link_config)
|
||||
else:
|
||||
results = await self.seeder.extract_head_for_urls(
|
||||
urls=urls,
|
||||
config=seeding_config,
|
||||
concurrency=concurrency,
|
||||
timeout=link_config.timeout
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
async def _extract_with_progress(
|
||||
self,
|
||||
urls: List[str],
|
||||
seeding_config: SeedingConfig,
|
||||
link_config: Dict[str, Any]
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Extract head content with progress reporting."""
|
||||
|
||||
total_urls = len(urls)
|
||||
concurrency = link_config.concurrency
|
||||
batch_size = max(1, total_urls // 10) # Report progress every 10%
|
||||
|
||||
# Process URLs and track progress
|
||||
completed = 0
|
||||
successful = 0
|
||||
failed = 0
|
||||
|
||||
# Create a custom progress tracking version
|
||||
# We'll modify URLSeeder's method to include progress callbacks
|
||||
|
||||
# For now, let's use the existing method and report at the end
|
||||
# In a production version, we would modify URLSeeder to accept progress callbacks
|
||||
|
||||
self._log("info", "Processing links in batches...")
|
||||
|
||||
# Use existing method
|
||||
results = await self.seeder.extract_head_for_urls(
|
||||
urls=urls,
|
||||
config=seeding_config,
|
||||
concurrency=concurrency,
|
||||
timeout=link_config.timeout
|
||||
)
|
||||
|
||||
# Count results
|
||||
for result in results:
|
||||
completed += 1
|
||||
if result.get("status") == "valid":
|
||||
successful += 1
|
||||
else:
|
||||
failed += 1
|
||||
|
||||
# Final progress report
|
||||
self._log("info", "Batch processing completed: {completed}/{total} processed, {successful} successful, {failed} failed",
|
||||
params={
|
||||
"completed": completed,
|
||||
"total": total_urls,
|
||||
"successful": successful,
|
||||
"failed": failed
|
||||
})
|
||||
|
||||
return results
|
||||
|
||||
def _merge_head_data(
|
||||
self,
|
||||
original_links: Links,
|
||||
head_results: List[Dict[str, Any]],
|
||||
config: CrawlerRunConfig
|
||||
) -> Links:
|
||||
"""
|
||||
Merge head extraction results back into Link objects.
|
||||
|
||||
Args:
|
||||
original_links: Original Links object
|
||||
head_results: Results from head extraction
|
||||
|
||||
Returns:
|
||||
Links object with head_data attached to matching links
|
||||
"""
|
||||
# Create URL to head_data mapping
|
||||
url_to_head_data = {}
|
||||
for result in head_results:
|
||||
url = result.get("url")
|
||||
if url:
|
||||
url_to_head_data[url] = {
|
||||
"head_data": result.get("head_data", {}),
|
||||
"status": result.get("status", "unknown"),
|
||||
"error": result.get("error"),
|
||||
"relevance_score": result.get("relevance_score")
|
||||
}
|
||||
|
||||
# Update internal links
|
||||
updated_internal = []
|
||||
for link in original_links.internal:
|
||||
if link.href in url_to_head_data:
|
||||
head_info = url_to_head_data[link.href]
|
||||
# Create new Link object with head data and scoring
|
||||
contextual_score = head_info.get("relevance_score")
|
||||
|
||||
updated_link = Link(
|
||||
href=link.href,
|
||||
text=link.text,
|
||||
title=link.title,
|
||||
base_domain=link.base_domain,
|
||||
head_data=head_info["head_data"],
|
||||
head_extraction_status=head_info["status"],
|
||||
head_extraction_error=head_info.get("error"),
|
||||
intrinsic_score=getattr(link, 'intrinsic_score', None),
|
||||
contextual_score=contextual_score
|
||||
)
|
||||
|
||||
# Add relevance score to head_data for backward compatibility
|
||||
if contextual_score is not None:
|
||||
updated_link.head_data = updated_link.head_data or {}
|
||||
updated_link.head_data["relevance_score"] = contextual_score
|
||||
|
||||
# Calculate total score combining intrinsic and contextual scores
|
||||
updated_link.total_score = calculate_total_score(
|
||||
intrinsic_score=updated_link.intrinsic_score,
|
||||
contextual_score=updated_link.contextual_score,
|
||||
score_links_enabled=getattr(config, 'score_links', False),
|
||||
query_provided=bool(config.link_extraction_config.query)
|
||||
)
|
||||
|
||||
updated_internal.append(updated_link)
|
||||
else:
|
||||
# Keep original link unchanged
|
||||
updated_internal.append(link)
|
||||
|
||||
# Update external links
|
||||
updated_external = []
|
||||
for link in original_links.external:
|
||||
if link.href in url_to_head_data:
|
||||
head_info = url_to_head_data[link.href]
|
||||
# Create new Link object with head data and scoring
|
||||
contextual_score = head_info.get("relevance_score")
|
||||
|
||||
updated_link = Link(
|
||||
href=link.href,
|
||||
text=link.text,
|
||||
title=link.title,
|
||||
base_domain=link.base_domain,
|
||||
head_data=head_info["head_data"],
|
||||
head_extraction_status=head_info["status"],
|
||||
head_extraction_error=head_info.get("error"),
|
||||
intrinsic_score=getattr(link, 'intrinsic_score', None),
|
||||
contextual_score=contextual_score
|
||||
)
|
||||
|
||||
# Add relevance score to head_data for backward compatibility
|
||||
if contextual_score is not None:
|
||||
updated_link.head_data = updated_link.head_data or {}
|
||||
updated_link.head_data["relevance_score"] = contextual_score
|
||||
|
||||
# Calculate total score combining intrinsic and contextual scores
|
||||
updated_link.total_score = calculate_total_score(
|
||||
intrinsic_score=updated_link.intrinsic_score,
|
||||
contextual_score=updated_link.contextual_score,
|
||||
score_links_enabled=getattr(config, 'score_links', False),
|
||||
query_provided=bool(config.link_extraction_config.query)
|
||||
)
|
||||
|
||||
updated_external.append(updated_link)
|
||||
else:
|
||||
# Keep original link unchanged
|
||||
updated_external.append(link)
|
||||
|
||||
# Sort links by relevance score if available
|
||||
if any(hasattr(link, 'head_data') and link.head_data and 'relevance_score' in link.head_data
|
||||
for link in updated_internal + updated_external):
|
||||
|
||||
def get_relevance_score(link):
|
||||
if hasattr(link, 'head_data') and link.head_data and 'relevance_score' in link.head_data:
|
||||
return link.head_data['relevance_score']
|
||||
return 0.0
|
||||
|
||||
updated_internal.sort(key=get_relevance_score, reverse=True)
|
||||
updated_external.sort(key=get_relevance_score, reverse=True)
|
||||
|
||||
return Links(
|
||||
internal=updated_internal,
|
||||
external=updated_external
|
||||
)
|
||||
@@ -345,6 +345,12 @@ class Link(BaseModel):
|
||||
text: Optional[str] = ""
|
||||
title: Optional[str] = ""
|
||||
base_domain: Optional[str] = ""
|
||||
head_data: Optional[Dict[str, Any]] = None # Head metadata extracted from link target
|
||||
head_extraction_status: Optional[str] = None # "success", "failed", "skipped"
|
||||
head_extraction_error: Optional[str] = None # Error message if extraction failed
|
||||
intrinsic_score: Optional[float] = None # Quality score based on URL structure, text, and context
|
||||
contextual_score: Optional[float] = None # BM25 relevance score based on query and head content
|
||||
total_score: Optional[float] = None # Combined score from intrinsic and contextual scores
|
||||
|
||||
|
||||
class Media(BaseModel):
|
||||
|
||||
@@ -2939,3 +2939,212 @@ pip install -q nest_asyncio google-colab
|
||||
echo "✅ Setup complete!"
|
||||
''')
|
||||
|
||||
|
||||
# Link Quality Scoring Functions
|
||||
def extract_page_context(page_title: str, headlines_text: str, meta_description: str, base_url: str) -> dict:
    """
    Extract page context for link scoring - called ONCE per page for performance.
    Parser-agnostic function that takes pre-extracted data.

    Args:
        page_title: Title of the page
        headlines_text: Combined text from h1, h2, h3 elements
        meta_description: Meta description content
        base_url: Base URL of the page

    Returns:
        Dictionary containing page context data for fast link scoring:
        'terms' (set of lowercase words longer than 2 chars with surrounding
        punctuation stripped), 'headlines', 'meta_description', 'domain'
        (lowercase netloc), and 'is_docs_site' (domain looks like a
        documentation/API/reference host)
    """
    context = {
        'terms': set(),
        'headlines': headlines_text or '',
        'meta_description': meta_description or '',
        'domain': '',
        'is_docs_site': False
    }

    try:
        from urllib.parse import urlparse
        parsed = urlparse(base_url)
        context['domain'] = parsed.netloc.lower()

        # Check if this is a documentation/reference site
        context['is_docs_site'] = any(indicator in context['domain']
                                      for indicator in ['docs.', 'api.', 'developer.', 'reference.'])

        # Create term set for fast intersection (performance optimization)
        all_text = ((page_title or '') + ' ' + context['headlines'] + ' ' + context['meta_description']).lower()
        # Simple tokenization - fast and sufficient for scoring. Strip each
        # word's surrounding punctuation exactly once (the previous version
        # stripped every word twice: once in the filter, once for the value).
        stripped_words = (word.strip('.,!?;:"()[]{}') for word in all_text.split())
        context['terms'] = set(word for word in stripped_words if len(word) > 2)

    except Exception:
        # Fail gracefully - return empty context
        pass

    return context
|
||||
|
||||
|
||||
def calculate_link_intrinsic_score(
    link_text: str,
    url: str,
    title_attr: str,
    class_attr: str,
    rel_attr: str,
    page_context: dict
) -> float:
    """
    Ultra-fast link quality scoring using only provided data (no DOM access needed).
    Parser-agnostic function.

    Args:
        link_text: Text content of the link
        url: Link URL
        title_attr: Title attribute of the link
        class_attr: Class attribute of the link
        rel_attr: Rel attribute of the link
        page_context: Pre-computed page context from extract_page_context()

    Returns:
        Quality score (0.0 - 10.0), higher is better
    """
    score = 0.0

    try:
        # 1. ATTRIBUTE QUALITY (string analysis - very fast)
        if title_attr and len(title_attr.strip()) > 3:
            score += 1.0

        class_str = (class_attr or '').lower()
        # Navigation/important classes boost score
        if any(nav_class in class_str for nav_class in ['nav', 'menu', 'primary', 'main', 'important']):
            score += 1.5
        # Marketing/ad classes reduce score. Match on class *tokens* (split
        # on spaces/hyphens/underscores) rather than raw substrings, so the
        # short marker 'ad' no longer falsely penalizes unrelated classes
        # such as 'badge', 'shadow' or 'header'. startswith() still catches
        # variants like 'ads', 'advert', 'sponsored', 'tracking'.
        class_tokens = class_str.replace('-', ' ').replace('_', ' ').split()
        if any(token.startswith(bad_class) for token in class_tokens
               for bad_class in ['ad', 'sponsor', 'track', 'promo', 'banner']):
            score -= 1.0

        rel_str = (rel_attr or '').lower()
        # Semantic rel values
        if any(good_rel in rel_str for good_rel in ['canonical', 'next', 'prev', 'chapter']):
            score += 1.0
        if any(bad_rel in rel_str for bad_rel in ['nofollow', 'sponsored', 'ugc']):
            score -= 0.5

        # 2. URL STRUCTURE QUALITY (string operations - very fast)
        url_lower = url.lower()

        # High-value path patterns
        if any(good_path in url_lower for good_path in ['/docs/', '/api/', '/guide/', '/tutorial/', '/reference/', '/manual/']):
            score += 2.0
        elif any(medium_path in url_lower for medium_path in ['/blog/', '/article/', '/post/', '/news/']):
            score += 1.0

        # Penalize certain patterns
        if any(bad_path in url_lower for bad_path in ['/admin/', '/login/', '/cart/', '/checkout/', '/track/', '/click/']):
            score -= 1.5

        # URL depth (shallow URLs often more important)
        url_depth = url.count('/') - 2  # Subtract protocol and domain
        if url_depth <= 2:
            score += 1.0
        elif url_depth > 5:
            score -= 0.5

        # HTTPS bonus
        if url.startswith('https://'):
            score += 0.5

        # 3. TEXT QUALITY (string analysis - very fast)
        if link_text:
            text_clean = link_text.strip()
            if len(text_clean) > 3:
                score += 1.0

            # Multi-word links are usually more descriptive
            word_count = len(text_clean.split())
            if word_count >= 2:
                score += 0.5
            if word_count >= 4:
                score += 0.5

            # Avoid generic link text
            generic_texts = ['click here', 'read more', 'more info', 'link', 'here']
            if text_clean.lower() in generic_texts:
                score -= 1.0

        # 4. CONTEXTUAL RELEVANCE (pre-computed page terms - very fast)
        if page_context.get('terms') and link_text:
            link_words = set(word.strip('.,!?;:"()[]{}').lower()
                             for word in link_text.split()
                             if len(word.strip('.,!?;:"()[]{}')) > 2)

            if link_words:
                # Calculate word overlap ratio
                overlap = len(link_words & page_context['terms'])
                if overlap > 0:
                    relevance_ratio = overlap / min(len(link_words), 10)  # Cap to avoid over-weighting
                    score += relevance_ratio * 2.0  # Up to 2 points for relevance

        # 5. DOMAIN CONTEXT BONUSES (very fast string checks)
        if page_context.get('is_docs_site', False):
            # Documentation sites: prioritize internal navigation
            if link_text and any(doc_keyword in link_text.lower()
                                 for doc_keyword in ['api', 'reference', 'guide', 'tutorial', 'example']):
                score += 1.0

    except Exception:
        # Fail gracefully - return minimal score
        score = 0.5

    # Ensure score is within reasonable bounds
    return max(0.0, min(score, 10.0))
|
||||
|
||||
|
||||
def calculate_total_score(
    intrinsic_score: Optional[float] = None,
    contextual_score: Optional[float] = None,
    score_links_enabled: bool = False,
    query_provided: bool = False
) -> float:
    """
    Combine intrinsic and contextual link scores into one 0-10 total.

    Args:
        intrinsic_score: Quality score based on URL structure, text, and context (0-10)
        contextual_score: BM25 relevance score based on query and head content (0-1 typically)
        score_links_enabled: Whether link scoring is enabled
        query_provided: Whether a query was provided for contextual scoring

    Returns:
        Combined total score (0-10 scale)

    Scoring Logic:
        - Scoring disabled: 5.0 (neutral score, all links equal)
        - No query, or no contextual result: clamped intrinsic score
          (None counts as 0.0)
        - Both available: weighted blend, 70% intrinsic + 30% contextual
          scaled to the 0-10 range
    """
    # Scoring disabled: every link gets the same neutral score.
    if not score_links_enabled:
        return 5.0

    # Treat a missing intrinsic score as zero quality.
    quality = 0.0 if intrinsic_score is None else intrinsic_score

    # Without a query (or without a contextual result) only the intrinsic
    # quality applies; clamp it to the 0-10 scale.
    if contextual_score is None or not query_provided:
        return min(max(quality, 0.0), 10.0)

    # Both signals available: scale the BM25 relevance (typically 0-1)
    # up to 0-10 and blend 70% quality / 30% relevance — quality of the
    # link itself dominates, relevance refines.
    relevance = min(contextual_score * 10.0, 10.0)
    combined = (quality * 0.7) + (relevance * 0.3)
    return min(max(combined, 0.0), 10.0)
|
||||
|
||||
|
||||
376
docs/examples/link_head_extraction_example.py
Normal file
376
docs/examples/link_head_extraction_example.py
Normal file
@@ -0,0 +1,376 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Link Head Extraction & Scoring Example
|
||||
|
||||
This example demonstrates Crawl4AI's advanced link analysis capabilities:
|
||||
1. Basic link head extraction
|
||||
2. Three-layer scoring system (intrinsic, contextual, total)
|
||||
3. Pattern-based filtering
|
||||
4. Multiple practical use cases
|
||||
|
||||
Requirements:
|
||||
- crawl4ai installed
|
||||
- Internet connection
|
||||
|
||||
Usage:
|
||||
python link_head_extraction_example.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkExtractionConfig
|
||||
|
||||
|
||||
async def basic_link_head_extraction():
    """
    Basic example: Extract head content from internal links with scoring.

    Crawls the Python docs index, asks the crawler to fetch <head> metadata
    for up to 5 internal links, and prints each link's intrinsic/contextual/
    total scores plus its extracted title and description.
    """
    print("🔗 Basic Link Head Extraction Example")
    print("=" * 50)

    config = CrawlerRunConfig(
        # Enable link head extraction
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,  # Process internal links
            include_external=False,  # Skip external links for this demo
            max_links=5,  # Limit to 5 links
            concurrency=3,  # Process 3 links simultaneously
            timeout=10,  # 10 second timeout per link
            query="API documentation guide",  # Query for relevance scoring
            verbose=True  # Show detailed progress
        ),
        # Enable intrinsic link scoring
        score_links=True,
        only_text=True
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://docs.python.org/3/", config=config)

        if result.success:
            print(f"\n✅ Successfully crawled: {result.url}")

            # NOTE(review): result.links is treated here as a dict of link
            # dicts (".get(...)" access) — confirm against CrawlResult.
            internal_links = result.links.get("internal", [])
            links_with_head = [link for link in internal_links
                               if link.get("head_data") is not None]

            print(f"🧠 Links with head data: {len(links_with_head)}")

            # Show detailed results for the first three enriched links
            for i, link in enumerate(links_with_head[:3]):
                print(f"\n📄 Link {i+1}: {link['href']}")
                print(f" Text: '{link.get('text', 'No text')[:50]}...'")

                # Show all three score types
                intrinsic = link.get('intrinsic_score')
                contextual = link.get('contextual_score')
                total = link.get('total_score')

                print(f" 📊 Scores:")
                if intrinsic is not None:
                    print(f" • Intrinsic: {intrinsic:.2f}/10.0")
                if contextual is not None:
                    print(f" • Contextual: {contextual:.3f}")
                if total is not None:
                    print(f" • Total: {total:.3f}")

                # Show head data (title + meta description when present)
                head_data = link.get("head_data", {})
                if head_data:
                    title = head_data.get("title", "No title")
                    description = head_data.get("meta", {}).get("description", "")
                    print(f" 📰 Title: {title[:60]}...")
                    if description:
                        print(f" 📝 Description: {description[:80]}...")
        else:
            print(f"❌ Crawl failed: {result.error_message}")
|
||||
|
||||
|
||||
async def research_assistant_example():
    """
    Research Assistant: Find highly relevant documentation pages.

    Uses include/exclude glob patterns plus a BM25 query and score
    threshold, then prints the top links ranked by total_score.
    """
    print("\n\n🔍 Research Assistant Example")
    print("=" * 50)

    config = CrawlerRunConfig(
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,
            include_external=True,
            include_patterns=["*/docs/*", "*/tutorial/*", "*/guide/*"],
            exclude_patterns=["*/login*", "*/admin*"],
            query="machine learning neural networks deep learning",
            max_links=15,
            score_threshold=0.4,  # Only include high-relevance links
            concurrency=8,
            verbose=False  # Clean output for this example
        ),
        score_links=True
    )

    # Test with scikit-learn documentation
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://scikit-learn.org/stable/", config=config)

        if result.success:
            print(f"✅ Analyzed: {result.url}")

            all_links = result.links.get("internal", []) + result.links.get("external", [])

            # Filter for high-scoring links (total_score > 0.6)
            high_scoring_links = [link for link in all_links
                                  if link.get("total_score", 0) > 0.6]

            # Sort by total score (highest first)
            high_scoring_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)

            print(f"\n🎯 Found {len(high_scoring_links)} highly relevant links:")
            print(" (Showing top 5 by relevance score)")

            for i, link in enumerate(high_scoring_links[:5]):
                score = link.get("total_score", 0)
                title = link.get("head_data", {}).get("title", "No title")
                print(f"\n{i+1}. ⭐ {score:.3f} - {title[:70]}...")
                print(f" 🔗 {link['href']}")

                # Show score breakdown (intrinsic quality vs. query relevance)
                intrinsic = link.get('intrinsic_score', 0)
                contextual = link.get('contextual_score', 0)
                print(f" 📊 Quality: {intrinsic:.1f}/10 | Relevance: {contextual:.3f}")
        else:
            print(f"❌ Research failed: {result.error_message}")
|
||||
|
||||
|
||||
async def api_discovery_example():
    """
    API Discovery: Find API endpoints and references.

    Filters internal links to API-ish paths, then buckets them by HTTP
    method using a naive substring check on the link title/text.
    """
    print("\n\n🔧 API Discovery Example")
    print("=" * 50)

    config = CrawlerRunConfig(
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,
            include_patterns=["*/api/*", "*/reference/*", "*/endpoint/*"],
            exclude_patterns=["*/deprecated/*", "*/v1/*"],  # Skip old versions
            max_links=25,
            concurrency=10,
            timeout=8,
            verbose=False
        ),
        score_links=True
    )

    # Example with a documentation site that has API references
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://httpbin.org/", config=config)

        if result.success:
            print(f"✅ Discovered APIs at: {result.url}")

            api_links = result.links.get("internal", [])

            # Categorize by detected content
            endpoints = {"GET": [], "POST": [], "PUT": [], "DELETE": [], "OTHER": []}

            for link in api_links:
                if link.get("head_data"):
                    title = link.get("head_data", {}).get("title", "").upper()
                    text = link.get("text", "").upper()

                    # Simple categorization based on content; first method
                    # name found wins, anything else falls into OTHER
                    if "GET" in title or "GET" in text:
                        endpoints["GET"].append(link)
                    elif "POST" in title or "POST" in text:
                        endpoints["POST"].append(link)
                    elif "PUT" in title or "PUT" in text:
                        endpoints["PUT"].append(link)
                    elif "DELETE" in title or "DELETE" in text:
                        endpoints["DELETE"].append(link)
                    else:
                        endpoints["OTHER"].append(link)

            # Display results
            total_found = sum(len(links) for links in endpoints.values())
            print(f"\n📡 Found {total_found} API-related links:")

            for method, links in endpoints.items():
                if links:
                    print(f"\n{method} Endpoints ({len(links)}):")
                    for link in links[:3]:  # Show first 3 of each type
                        title = link.get("head_data", {}).get("title", "No title")
                        score = link.get("intrinsic_score", 0)
                        print(f" • [{score:.1f}] {title[:50]}...")
                        print(f" {link['href']}")
        else:
            print(f"❌ API discovery failed: {result.error_message}")
|
||||
|
||||
|
||||
async def link_quality_analysis():
    """
    Link Quality Analysis: Analyze website structure and link quality.

    Collects intrinsic scores for up to 30 internal links, prints average
    and quality-band counts, then the top and bottom 3 links by score.
    """
    print("\n\n📊 Link Quality Analysis Example")
    print("=" * 50)

    config = CrawlerRunConfig(
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,
            max_links=30,  # Analyze more links for better statistics
            concurrency=15,
            timeout=6,
            verbose=False
        ),
        score_links=True
    )

    async with AsyncWebCrawler() as crawler:
        # Test with a content-rich site
        result = await crawler.arun("https://docs.python.org/3/", config=config)

        if result.success:
            print(f"✅ Analyzed: {result.url}")

            links = result.links.get("internal", [])

            # Extract intrinsic scores for analysis (skip unscored links)
            scores = [link.get('intrinsic_score', 0) for link in links if link.get('intrinsic_score') is not None]

            if scores:
                avg_score = sum(scores) / len(scores)
                # Quality bands: high >= 7.0, medium 4.0-6.9, low < 4.0
                high_quality = len([s for s in scores if s >= 7.0])
                medium_quality = len([s for s in scores if 4.0 <= s < 7.0])
                low_quality = len([s for s in scores if s < 4.0])

                print(f"\n📈 Quality Analysis Results:")
                print(f" 📊 Average Score: {avg_score:.2f}/10.0")
                print(f" 🟢 High Quality (≥7.0): {high_quality} links")
                print(f" 🟡 Medium Quality (4.0-6.9): {medium_quality} links")
                print(f" 🔴 Low Quality (<4.0): {low_quality} links")

                # Show best and worst links
                scored_links = [(link, link.get('intrinsic_score', 0)) for link in links
                                if link.get('intrinsic_score') is not None]
                scored_links.sort(key=lambda x: x[1], reverse=True)

                print(f"\n🏆 Top 3 Quality Links:")
                for i, (link, score) in enumerate(scored_links[:3]):
                    text = link.get('text', 'No text')[:40]
                    print(f" {i+1}. [{score:.1f}] {text}...")
                    print(f" {link['href']}")

                print(f"\n⚠️ Bottom 3 Quality Links:")
                for i, (link, score) in enumerate(scored_links[-3:]):
                    text = link.get('text', 'No text')[:40]
                    print(f" {i+1}. [{score:.1f}] {text}...")
                    print(f" {link['href']}")
            else:
                print("❌ No scoring data available")
        else:
            print(f"❌ Analysis failed: {result.error_message}")
|
||||
|
||||
|
||||
async def pattern_filtering_example():
    """
    Pattern Filtering: Demonstrate advanced filtering capabilities.

    Runs the same crawl three times with different include/exclude glob
    pattern sets and shows how many links each strategy matches.
    """
    print("\n\n🎯 Pattern Filtering Example")
    print("=" * 50)

    # Example with multiple filtering strategies; each entry pairs a
    # human-readable name with a LinkExtractionConfig to test
    filters = [
        {
            "name": "Documentation Only",
            "config": LinkExtractionConfig(
                include_internal=True,
                max_links=10,
                concurrency=5,
                verbose=False,
                include_patterns=["*/docs/*", "*/documentation/*"],
                exclude_patterns=["*/api/*"]
            )
        },
        {
            "name": "API References Only",
            "config": LinkExtractionConfig(
                include_internal=True,
                max_links=10,
                concurrency=5,
                verbose=False,
                include_patterns=["*/api/*", "*/reference/*"],
                exclude_patterns=["*/tutorial/*"]
            )
        },
        {
            "name": "Exclude Admin Areas",
            "config": LinkExtractionConfig(
                include_internal=True,
                max_links=10,
                concurrency=5,
                verbose=False,
                exclude_patterns=["*/admin/*", "*/login/*", "*/dashboard/*"]
            )
        }
    ]

    async with AsyncWebCrawler() as crawler:
        for filter_example in filters:
            print(f"\n🔍 Testing: {filter_example['name']}")

            config = CrawlerRunConfig(
                link_extraction_config=filter_example['config'],
                score_links=True
            )

            result = await crawler.arun("https://docs.python.org/3/", config=config)

            if result.success:
                links = result.links.get("internal", [])
                links_with_head = [link for link in links if link.get("head_data")]

                print(f" 📊 Found {len(links_with_head)} matching links")

                if links_with_head:
                    # Show sample matches
                    for link in links_with_head[:2]:
                        title = link.get("head_data", {}).get("title", "No title")
                        print(f" • {title[:50]}...")
                        print(f" {link['href']}")
            else:
                print(f" ❌ Failed: {result.error_message}")
|
||||
|
||||
|
||||
async def main():
    """
    Run all examples

    Executes each demo coroutine in sequence, then prints a summary.
    KeyboardInterrupt and unexpected errors are reported instead of
    propagating out of the event loop.
    """
    print("🚀 Crawl4AI Link Head Extraction Examples")
    print("=" * 60)
    print("This will demonstrate various link analysis capabilities.\n")

    try:
        # Run all examples
        for example in (
            basic_link_head_extraction,
            research_assistant_example,
            api_discovery_example,
            link_quality_analysis,
            pattern_filtering_example,
        ):
            await example()

        print("\n" + "=" * 60)
        print("✨ All examples completed successfully!")
        print("\nNext steps:")
        for tip in (
            "1. Try modifying the queries and patterns above",
            "2. Test with your own websites",
            "3. Experiment with different score thresholds",
            "4. Check out the full documentation for more options",
        ):
            print(tip)

    except KeyboardInterrupt:
        print("\n⏹️ Examples interrupted by user")
    except Exception as e:
        print(f"\n💥 Error running examples: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -105,7 +105,366 @@ result.links = {
|
||||
|
||||
---
|
||||
|
||||
## 2. Domain Filtering
|
||||
## 2. Advanced Link Head Extraction & Scoring
|
||||
|
||||
Ever wanted to not just extract links, but also get the actual content (title, description, metadata) from those linked pages? And score them for relevance? This is exactly what Link Head Extraction does - it fetches the `<head>` section from each discovered link and scores them using multiple algorithms.
|
||||
|
||||
### 2.1 Why Link Head Extraction?
|
||||
|
||||
When you crawl a page, you get hundreds of links. But which ones are actually valuable? Link Head Extraction solves this by:
|
||||
|
||||
1. **Fetching head content** from each link (title, description, meta tags)
|
||||
2. **Scoring links intrinsically** based on URL quality, text relevance, and context
|
||||
3. **Scoring links contextually** using BM25 algorithm when you provide a search query
|
||||
4. **Combining scores intelligently** to give you a final relevance ranking
|
||||
|
||||
### 2.2 Complete Working Example
|
||||
|
||||
Here's a full example you can copy, paste, and run immediately:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkExtractionConfig
|
||||
|
||||
async def extract_link_heads_example():
|
||||
"""
|
||||
Complete example showing link head extraction with scoring.
|
||||
This will crawl a documentation site and extract head content from internal links.
|
||||
"""
|
||||
|
||||
# Configure link head extraction
|
||||
config = CrawlerRunConfig(
|
||||
# Enable link head extraction with detailed configuration
|
||||
link_extraction_config=LinkExtractionConfig(
|
||||
include_internal=True, # Extract from internal links
|
||||
include_external=False, # Skip external links for this example
|
||||
max_links=10, # Limit to 10 links for demo
|
||||
concurrency=5, # Process 5 links simultaneously
|
||||
timeout=10, # 10 second timeout per link
|
||||
query="API documentation guide", # Query for contextual scoring
|
||||
score_threshold=0.3, # Only include links scoring above 0.3
|
||||
verbose=True # Show detailed progress
|
||||
),
|
||||
# Enable intrinsic scoring (URL quality, text relevance)
|
||||
score_links=True,
|
||||
# Keep output clean
|
||||
only_text=True,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Crawl a documentation site (great for testing)
|
||||
result = await crawler.arun("https://docs.python.org/3/", config=config)
|
||||
|
||||
if result.success:
|
||||
print(f"✅ Successfully crawled: {result.url}")
|
||||
print(f"📄 Page title: {result.metadata.get('title', 'No title')}")
|
||||
|
||||
# Access links (now enhanced with head data and scores)
|
||||
internal_links = result.links.get("internal", [])
|
||||
external_links = result.links.get("external", [])
|
||||
|
||||
print(f"\n🔗 Found {len(internal_links)} internal links")
|
||||
print(f"🌍 Found {len(external_links)} external links")
|
||||
|
||||
# Count links with head data
|
||||
links_with_head = [link for link in internal_links
|
||||
if link.get("head_data") is not None]
|
||||
print(f"🧠 Links with head data extracted: {len(links_with_head)}")
|
||||
|
||||
# Show the top 3 scoring links
|
||||
print(f"\n🏆 Top 3 Links with Full Scoring:")
|
||||
for i, link in enumerate(links_with_head[:3]):
|
||||
print(f"\n{i+1}. {link['href']}")
|
||||
print(f" Link Text: '{link.get('text', 'No text')[:50]}...'")
|
||||
|
||||
# Show all three score types
|
||||
intrinsic = link.get('intrinsic_score')
|
||||
contextual = link.get('contextual_score')
|
||||
total = link.get('total_score')
|
||||
|
||||
if intrinsic is not None:
|
||||
print(f" 📊 Intrinsic Score: {intrinsic:.2f}/10.0 (URL quality & context)")
|
||||
if contextual is not None:
|
||||
print(f" 🎯 Contextual Score: {contextual:.3f} (BM25 relevance to query)")
|
||||
if total is not None:
|
||||
print(f" ⭐ Total Score: {total:.3f} (combined final score)")
|
||||
|
||||
# Show extracted head data
|
||||
head_data = link.get("head_data", {})
|
||||
if head_data:
|
||||
title = head_data.get("title", "No title")
|
||||
description = head_data.get("meta", {}).get("description", "No description")
|
||||
|
||||
print(f" 📰 Title: {title[:60]}...")
|
||||
if description:
|
||||
print(f" 📝 Description: {description[:80]}...")
|
||||
|
||||
# Show extraction status
|
||||
status = link.get("head_extraction_status", "unknown")
|
||||
print(f" ✅ Extraction Status: {status}")
|
||||
else:
|
||||
print(f"❌ Crawl failed: {result.error_message}")
|
||||
|
||||
# Run the example
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(extract_link_heads_example())
|
||||
```
|
||||
|
||||
**Expected Output:**
|
||||
```
|
||||
✅ Successfully crawled: https://docs.python.org/3/
|
||||
📄 Page title: 3.13.5 Documentation
|
||||
🔗 Found 53 internal links
|
||||
🌍 Found 1 external links
|
||||
🧠 Links with head data extracted: 10
|
||||
|
||||
🏆 Top 3 Links with Full Scoring:
|
||||
|
||||
1. https://docs.python.org/3.15/
|
||||
Link Text: 'Python 3.15 (in development)...'
|
||||
📊 Intrinsic Score: 4.17/10.0 (URL quality & context)
|
||||
🎯 Contextual Score: 1.000 (BM25 relevance to query)
|
||||
⭐ Total Score: 5.917 (combined final score)
|
||||
📰 Title: 3.15.0a0 Documentation...
|
||||
📝 Description: The official Python documentation...
|
||||
✅ Extraction Status: valid
|
||||
```
|
||||
|
||||
### 2.3 Configuration Deep Dive
|
||||
|
||||
The `LinkExtractionConfig` class supports these options:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import LinkExtractionConfig
|
||||
|
||||
link_extraction_config = LinkExtractionConfig(
|
||||
# BASIC SETTINGS
|
||||
verbose=True, # Show detailed logs (recommended for learning)
|
||||
|
||||
# LINK FILTERING
|
||||
include_internal=True, # Include same-domain links
|
||||
include_external=True, # Include different-domain links
|
||||
max_links=50, # Maximum links to process (prevents overload)
|
||||
|
||||
# PATTERN FILTERING
|
||||
include_patterns=[ # Only process links matching these patterns
|
||||
"*/docs/*",
|
||||
"*/api/*",
|
||||
"*/reference/*"
|
||||
],
|
||||
exclude_patterns=[ # Skip links matching these patterns
|
||||
"*/login*",
|
||||
"*/admin*"
|
||||
],
|
||||
|
||||
# PERFORMANCE SETTINGS
|
||||
concurrency=10, # How many links to process simultaneously
|
||||
timeout=5, # Seconds to wait per link
|
||||
|
||||
# RELEVANCE SCORING
|
||||
query="machine learning API", # Query for BM25 contextual scoring
|
||||
score_threshold=0.3, # Only include links above this score
|
||||
)
|
||||
```
|
||||
|
||||
### 2.4 Understanding the Three Score Types
|
||||
|
||||
Each extracted link gets three different scores:
|
||||
|
||||
#### 1. **Intrinsic Score (0-10)** - URL and Content Quality
|
||||
Based on URL structure, link text quality, and page context:
|
||||
|
||||
```python
|
||||
# High intrinsic score indicators:
|
||||
# ✅ Clean URL structure (docs.python.org/api/reference)
|
||||
# ✅ Meaningful link text ("API Reference Guide")
|
||||
# ✅ Relevant to page context
|
||||
# ✅ Not buried deep in navigation
|
||||
|
||||
# Low intrinsic score indicators:
|
||||
# ❌ Random URLs (site.com/x7f9g2h)
|
||||
# ❌ No link text or generic text ("Click here")
|
||||
# ❌ Unrelated to page content
|
||||
```
|
||||
|
||||
#### 2. **Contextual Score (0-1)** - BM25 Relevance to Query
|
||||
Only available when you provide a `query`. Uses BM25 algorithm against head content:
|
||||
|
||||
```python
|
||||
# Example: query = "machine learning tutorial"
|
||||
# High contextual score: Link to "Complete Machine Learning Guide"
|
||||
# Low contextual score: Link to "Privacy Policy"
|
||||
```
|
||||
|
||||
#### 3. **Total Score** - Smart Combination
|
||||
Intelligently combines intrinsic and contextual scores with fallbacks:
|
||||
|
||||
```python
|
||||
# When both scores available: (intrinsic * 0.3) + (contextual * 0.7)
|
||||
# When only intrinsic: uses intrinsic score
|
||||
# When only contextual: uses contextual score
|
||||
# When neither: not calculated
|
||||
```
|
||||
|
||||
### 2.5 Practical Use Cases
|
||||
|
||||
#### Use Case 1: Research Assistant
|
||||
Find the most relevant documentation pages:
|
||||
|
||||
```python
|
||||
async def research_assistant():
|
||||
config = CrawlerRunConfig(
|
||||
link_extraction_config=LinkExtractionConfig(
|
||||
include_internal=True,
|
||||
include_external=True,
|
||||
include_patterns=["*/docs/*", "*/tutorial/*", "*/guide/*"],
|
||||
query="machine learning neural networks",
|
||||
max_links=20,
|
||||
score_threshold=0.5, # Only high-relevance links
|
||||
verbose=True
|
||||
),
|
||||
score_links=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://scikit-learn.org/", config=config)
|
||||
|
||||
if result.success:
|
||||
# Get high-scoring links
|
||||
good_links = [link for link in result.links.get("internal", [])
|
||||
if link.get("total_score", 0) > 0.7]
|
||||
|
||||
print(f"🎯 Found {len(good_links)} highly relevant links:")
|
||||
for link in good_links[:5]:
|
||||
print(f"⭐ {link['total_score']:.3f} - {link['href']}")
|
||||
print(f" {link.get('head_data', {}).get('title', 'No title')}")
|
||||
```
|
||||
|
||||
#### Use Case 2: Content Discovery
|
||||
Find all API endpoints and references:
|
||||
|
||||
```python
|
||||
async def api_discovery():
|
||||
config = CrawlerRunConfig(
|
||||
link_extraction_config=LinkExtractionConfig(
|
||||
include_internal=True,
|
||||
include_patterns=["*/api/*", "*/reference/*"],
|
||||
exclude_patterns=["*/deprecated/*"],
|
||||
max_links=100,
|
||||
concurrency=15,
|
||||
verbose=False # Clean output
|
||||
),
|
||||
score_links=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://docs.example-api.com/", config=config)
|
||||
|
||||
if result.success:
|
||||
api_links = result.links.get("internal", [])
|
||||
|
||||
# Group by endpoint type
|
||||
endpoints = {}
|
||||
for link in api_links:
|
||||
if link.get("head_data"):
|
||||
title = link["head_data"].get("title", "")
|
||||
if "GET" in title:
|
||||
endpoints.setdefault("GET", []).append(link)
|
||||
elif "POST" in title:
|
||||
endpoints.setdefault("POST", []).append(link)
|
||||
|
||||
for method, links in endpoints.items():
|
||||
print(f"\n{method} Endpoints ({len(links)}):")
|
||||
for link in links[:3]:
|
||||
print(f" • {link['href']}")
|
||||
```
|
||||
|
||||
#### Use Case 3: Link Quality Analysis
|
||||
Analyze website structure and content quality:
|
||||
|
||||
```python
|
||||
async def quality_analysis():
|
||||
config = CrawlerRunConfig(
|
||||
link_extraction_config=LinkExtractionConfig(
|
||||
include_internal=True,
|
||||
max_links=200,
|
||||
concurrency=20,
|
||||
),
|
||||
score_links=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://your-website.com/", config=config)
|
||||
|
||||
if result.success:
|
||||
links = result.links.get("internal", [])
|
||||
|
||||
# Analyze intrinsic scores
|
||||
scores = [link.get('intrinsic_score', 0) for link in links]
|
||||
avg_score = sum(scores) / len(scores) if scores else 0
|
||||
|
||||
print(f"📊 Link Quality Analysis:")
|
||||
print(f" Average intrinsic score: {avg_score:.2f}/10.0")
|
||||
print(f" High quality links (>7.0): {len([s for s in scores if s > 7.0])}")
|
||||
print(f" Low quality links (<3.0): {len([s for s in scores if s < 3.0])}")
|
||||
|
||||
# Find problematic links
|
||||
bad_links = [link for link in links
|
||||
if link.get('intrinsic_score', 0) < 2.0]
|
||||
|
||||
if bad_links:
|
||||
print(f"\n⚠️ Links needing attention:")
|
||||
for link in bad_links[:5]:
|
||||
print(f" {link['href']} (score: {link.get('intrinsic_score', 0):.1f})")
|
||||
```
|
||||
|
||||
### 2.6 Performance Tips
|
||||
|
||||
1. **Start Small**: Begin with `max_links=10` to understand the feature
|
||||
2. **Use Patterns**: Filter with `include_patterns` to focus on relevant sections
|
||||
3. **Adjust Concurrency**: Higher concurrency = faster but more resource usage
|
||||
4. **Set Timeouts**: Use `timeout=5` to prevent hanging on slow sites
|
||||
5. **Use Score Thresholds**: Filter out low-quality links with `score_threshold`
|
||||
|
||||
### 2.7 Troubleshooting
|
||||
|
||||
**No head data extracted?**
|
||||
```python
|
||||
# Check your configuration:
|
||||
config = CrawlerRunConfig(
|
||||
link_extraction_config=LinkExtractionConfig(
|
||||
verbose=True # ← Enable to see what's happening
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
**Scores showing as None?**
|
||||
```python
|
||||
# Make sure scoring is enabled:
|
||||
config = CrawlerRunConfig(
|
||||
score_links=True, # ← Enable intrinsic scoring
|
||||
link_extraction_config=LinkExtractionConfig(
|
||||
query="your search terms" # ← For contextual scoring
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
**Process taking too long?**
|
||||
```python
|
||||
# Optimize performance:
|
||||
link_extraction_config = LinkExtractionConfig(
|
||||
max_links=20, # ← Reduce number
|
||||
concurrency=10, # ← Increase parallelism
|
||||
timeout=3, # ← Shorter timeout
|
||||
include_patterns=["*/important/*"] # ← Focus on key areas
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Domain Filtering
|
||||
|
||||
Some websites contain hundreds of third-party or affiliate links. You can filter out certain domains at **crawl time** by configuring the crawler. The most relevant parameters in `CrawlerRunConfig` are:
|
||||
|
||||
@@ -114,7 +473,7 @@ Some websites contain hundreds of third-party or affiliate links. You can filter
|
||||
- **`exclude_social_media_links`**: If `True`, automatically skip known social platforms.
|
||||
- **`exclude_domains`**: Provide a list of custom domains you want to exclude (e.g., `["spammyads.com", "tracker.net"]`).
|
||||
|
||||
### 2.1 Example: Excluding External & Social Media Links
|
||||
### 3.1 Example: Excluding External & Social Media Links
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
@@ -143,7 +502,7 @@ if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### 2.2 Example: Excluding Specific Domains
|
||||
### 3.2 Example: Excluding Specific Domains
|
||||
|
||||
If you want to let external links in, but specifically exclude a domain (e.g., `suspiciousads.com`), do this:
|
||||
|
||||
@@ -157,9 +516,9 @@ This approach is handy when you still want external links but need to block cert
|
||||
|
||||
---
|
||||
|
||||
## 3. Media Extraction
|
||||
## 4. Media Extraction
|
||||
|
||||
### 3.1 Accessing `result.media`
|
||||
### 4.1 Accessing `result.media`
|
||||
|
||||
By default, Crawl4AI collects images, audio, video URLs, and data tables it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`, `tables`).
|
||||
|
||||
@@ -237,7 +596,7 @@ Depending on your Crawl4AI version or scraping strategy, these dictionaries can
|
||||
|
||||
With these details, you can easily filter out or focus on certain images (for instance, ignoring images with very low scores or a different domain), or gather metadata for analytics.
|
||||
|
||||
### 3.2 Excluding External Images
|
||||
### 4.2 Excluding External Images
|
||||
|
||||
If you’re dealing with heavy pages or want to skip third-party images (advertisements, for example), you can turn on:
|
||||
|
||||
|
||||
262
tests/test_link_extractor.py
Normal file
262
tests/test_link_extractor.py
Normal file
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/env python3
"""
Test script for Link Extractor functionality
"""

import asyncio
import os
import sys

# Add the crawl4ai directory to the path BEFORE importing from the package.
# Previously this insert ran AFTER the `crawl4ai` imports, so it could not
# influence where those imports were resolved from.
# NOTE(review): this inserts <repo>/crawl4ai itself, not the repo root —
# confirm that is the intended search-path entry for `import crawl4ai`.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'crawl4ai'))

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.async_configs import LinkExtractionConfig
from crawl4ai.models import Link
|
||||
|
||||
|
||||
async def test_link_extractor():
    """Test the link extractor functionality.

    Crawls a couple of public sites with link head extraction and
    scoring enabled, then prints extracted head data and the three
    score types (intrinsic, contextual, total) for a sample of links.
    Per-URL failures are printed (with a traceback) rather than raised,
    so every URL in the list gets attempted.
    """

    print("🔗 Testing Link Extractor Functionality")
    print("=" * 50)

    # Test configuration with link extraction AND scoring enabled
    config = CrawlerRunConfig(
        link_extraction_config=LinkExtractionConfig(
            include_internal=True,
            include_external=False,  # Only internal links for this test
            # No include/exclude patterns for first test - let's see what we get
            query="API documentation reference guide",  # drives BM25 contextual scoring
            score_threshold=0.3,
            concurrency=5,
            timeout=10,
            max_links=5,  # Just test with 5 links first
            verbose=True  # Show detailed progress
        ),
        score_links=True,  # Enable intrinsic link scoring
        only_text=True,
        verbose=True
    )

    # Test URLs
    test_urls = [
        "https://docs.python.org/3/",  # Python docs - should have many internal links
        "https://httpbin.org/",  # Simple site for testing
    ]

    async with AsyncWebCrawler() as crawler:
        for url in test_urls:
            print(f"\n🌐 Testing URL: {url}")
            print("-" * 40)

            try:
                result = await crawler.arun(url, config=config)

                # Debug: Check if link extraction config is being passed
                print(f"🔍 Debug - Link extraction config: {config.link_extraction_config.to_dict() if config.link_extraction_config else None}")
                print(f"🔍 Debug - Score links: {config.score_links}")

                if result.success:
                    print(f"✅ Crawl successful!")
                    print(
                        f"📄 Page title: {result.metadata.get('title', 'No title')}")

                    # Check links - handle both dict and Links object structure
                    if isinstance(result.links, dict):
                        internal_links = [
                            Link(**link) for link in result.links.get('internal', [])]
                        external_links = [
                            Link(**link) for link in result.links.get('external', [])]
                    else:
                        internal_links = result.links.internal
                        external_links = result.links.external

                    print(f"🔗 Found {len(internal_links)} internal links")
                    print(f"🌍 Found {len(external_links)} external links")

                    # Show links with head data
                    links_with_head = [link for link in internal_links + external_links
                                       if hasattr(link, 'head_data') and link.head_data]

                    print(
                        f"🧠 Links with head data extracted: {len(links_with_head)}")

                    # Show all score types for all links (first 3)
                    all_links = internal_links + external_links
                    if all_links:
                        print(f"\n🔢 Sample link scores (first 3 links):")
                        for i, link in enumerate(all_links[:3]):
                            print(f"\n {i+1}. {link.href}")

                            # Show intrinsic score
                            # (float('inf') is the sentinel for "scoring disabled")
                            if hasattr(link, 'intrinsic_score') and link.intrinsic_score is not None:
                                if link.intrinsic_score == float('inf'):
                                    print(f" Intrinsic Score: ∞ (scoring disabled)")
                                else:
                                    print(f" Intrinsic Score: {link.intrinsic_score:.2f}/10.0")
                            else:
                                print(f" Intrinsic Score: Not available")

                            # Show contextual score (BM25)
                            if hasattr(link, 'contextual_score') and link.contextual_score is not None:
                                print(f" Contextual Score: {link.contextual_score:.3f}")
                            else:
                                print(f" Contextual Score: Not available")

                            # Show total score
                            if hasattr(link, 'total_score') and link.total_score is not None:
                                print(f" Total Score: {link.total_score:.3f}")
                            else:
                                print(f" Total Score: Not available")

                            print(f" Text: '{link.text[:50]}...' " if link.text else " Text: (no text)")

                    if links_with_head:
                        print("\n📊 Sample links with head data:")
                        # Show top 3
                        for i, link in enumerate(links_with_head[:3]):
                            print(f"\n {i+1}. {link.href}")
                            print(
                                f" Status: {link.head_extraction_status}")

                            # Show all three score types
                            print(f" 📊 Scoring Summary:")
                            if hasattr(link, 'intrinsic_score') and link.intrinsic_score is not None:
                                if link.intrinsic_score == float('inf'):
                                    print(f" • Intrinsic Score: ∞ (scoring disabled)")
                                else:
                                    print(f" • Intrinsic Score: {link.intrinsic_score:.2f}/10.0")
                            else:
                                print(f" • Intrinsic Score: Not available")

                            if hasattr(link, 'contextual_score') and link.contextual_score is not None:
                                print(f" • Contextual Score: {link.contextual_score:.3f}")
                            else:
                                print(f" • Contextual Score: Not available")

                            if hasattr(link, 'total_score') and link.total_score is not None:
                                print(f" • Total Score: {link.total_score:.3f}")
                            else:
                                print(f" • Total Score: Not available")

                            if link.head_data:
                                title = link.head_data.get('title', 'No title')
                                if title:
                                    print(f" Title: {title[:60]}...")

                                meta = link.head_data.get('meta', {})
                                if 'description' in meta and meta['description']:
                                    desc = meta['description']
                                    print(f" Description: {desc[:80]}...")

                                # Show link metadata keys (should now be properly formatted)
                                link_data = link.head_data.get('link', {})
                                if link_data:
                                    keys = list(link_data.keys())[:3]
                                    print(f" Link types: {keys}")

                    # Show failed extractions
                    failed_links = [link for link in internal_links + external_links
                                    if hasattr(link, 'head_extraction_status') and
                                    link.head_extraction_status == 'failed']

                    if failed_links:
                        print(
                            f"\n❌ Failed head extractions: {len(failed_links)}")
                        for link in failed_links[:2]:  # Show first 2 failures
                            print(f" - {link.href}")
                            if hasattr(link, 'head_extraction_error') and link.head_extraction_error:
                                print(
                                    f" Error: {link.head_extraction_error}")

                else:
                    print(f"❌ Crawl failed: {result.error_message}")

            except Exception as e:
                # Keep going to the next URL; show full traceback for diagnosis.
                print(f"💥 Error testing {url}: {str(e)}")
                import traceback
                traceback.print_exc()
|
||||
|
||||
|
||||
def test_config_examples():
    """Show example configurations.

    Prints several ready-made LinkExtractionConfig setups together with
    a copy-pasteable usage snippet for each.
    """

    print("\n📚 Example Configurations")
    print("=" * 50)

    examples = [
        (
            "BM25 Scored Documentation Links",
            LinkExtractionConfig(
                include_internal=True,
                include_external=False,
                include_patterns=["*/docs/*", "*/api/*", "*/reference/*"],
                query="API documentation reference guide",
                score_threshold=0.3,
                max_links=30,
                verbose=True
            ),
        ),
        (
            "Internal Links Only",
            LinkExtractionConfig(
                include_internal=True,
                include_external=False,
                max_links=50,
                verbose=True
            ),
        ),
        (
            "External Links with Patterns",
            LinkExtractionConfig(
                include_internal=False,
                include_external=True,
                include_patterns=["*github.com*", "*stackoverflow.com*"],
                max_links=20,
                concurrency=10
            ),
        ),
        (
            "High-Performance Mode",
            LinkExtractionConfig(
                include_internal=True,
                include_external=False,
                concurrency=20,
                timeout=3,
                max_links=100,
                verbose=False
            ),
        ),
    ]

    for name, cfg in examples:
        print(f"\n📝 {name}:")
        print(" Configuration:")
        params = cfg.to_dict()
        for key, value in params.items():
            print(f" {key}: {value}")

        print(" Usage:")
        print(" from crawl4ai.async_configs import LinkExtractionConfig")
        print(" config = CrawlerRunConfig(")
        print(" link_extraction_config=LinkExtractionConfig(")
        for key, value in params.items():
            if isinstance(value, str):
                print(f" {key}='{value}',")
            elif value is not None:
                # Lists (empty or not) and every other non-None value are
                # rendered unquoted — identical output to the original's
                # separate list / non-None branches.
                print(f" {key}={value},")
        print(" )")
        print(" )")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Show configuration examples first (prints configs only, no crawling)
    test_config_examples()

    # Run the actual test (performs live crawls of the test URLs)
    print("\n🚀 Running Link Extractor Tests...")
    asyncio.run(test_link_extractor())

    print("\n✨ Test completed!")
|
||||
Reference in New Issue
Block a user