feat: Add advanced link head extraction with three-layer scoring system (#1)
Squashed commit from feature/link-extractor branch implementing comprehensive link analysis: - Extract HTML head content from discovered links with parallel processing - Three-layer scoring: Intrinsic (URL quality), Contextual (BM25), and Total scores - New LinkExtractionConfig class for type-safe configuration - Pattern-based filtering for internal/external links - Comprehensive documentation and examples
This commit is contained in:
@@ -37,6 +37,7 @@ from .content_filter_strategy import (
|
||||
)
|
||||
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
|
||||
from .components.crawler_monitor import CrawlerMonitor
|
||||
from .link_extractor import LinkExtractor
|
||||
from .async_dispatcher import (
|
||||
MemoryAdaptiveDispatcher,
|
||||
SemaphoreDispatcher,
|
||||
@@ -141,6 +142,7 @@ __all__ = [
|
||||
"SemaphoreDispatcher",
|
||||
"RateLimiter",
|
||||
"CrawlerMonitor",
|
||||
"LinkExtractor",
|
||||
"DisplayMode",
|
||||
"MarkdownGenerationResult",
|
||||
"Crawl4aiDockerClient",
|
||||
|
||||
Reference in New Issue
Block a user