BREAKING CHANGE: Table extraction now uses Strategy Design Pattern This epic commit introduces a game-changing approach to table extraction in Crawl4AI: ✨ NEW FEATURES: - LLMTableExtraction: AI-powered extraction for complex HTML tables with rowspan/colspan - Smart Chunking: Automatically splits massive tables into optimal chunks at row boundaries - Parallel Processing: Processes multiple chunks simultaneously for blazing-fast extraction - Intelligent Merging: Seamlessly combines chunk results into complete tables - Header Preservation: Each chunk maintains context with original headers - Auto-retry Logic: Built-in resilience with configurable retry attempts 🏗️ ARCHITECTURE: - Strategy Design Pattern for pluggable table extraction strategies - ThreadPoolExecutor for concurrent chunk processing - Token-based chunking with configurable thresholds - Handles tables without headers gracefully ⚡ PERFORMANCE: - Process 1000+ row tables without timeout - Parallel processing with up to 5 concurrent chunks - Smart token estimation prevents LLM context overflow - Optimized for providers like Groq for massive tables 🔧 CONFIGURATION: - enable_chunking: Auto-handle large tables (default: True) - chunk_token_threshold: When to split (default: 3000 tokens) - min_rows_per_chunk: Meaningful chunk sizes (default: 10) - max_parallel_chunks: Concurrent processing (default: 5) 📚 BACKWARD COMPATIBILITY: - Existing code continues to work unchanged - DefaultTableExtraction remains the default strategy - Progressive enhancement approach This is the future of web table extraction - handling everything from simple tables to massive, complex data grids with merged cells and nested structures. The chunking is completely transparent to users while providing unprecedented scalability.
226 lines · 5.7 KiB · Python
# __init__.py
|
|
import warnings
|
|
|
|
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
|
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
|
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode
|
|
|
|
from .content_scraping_strategy import (
|
|
ContentScrapingStrategy,
|
|
LXMLWebScrapingStrategy,
|
|
WebScrapingStrategy, # Backward compatibility alias
|
|
)
|
|
from .async_logger import (
|
|
AsyncLoggerBase,
|
|
AsyncLogger,
|
|
)
|
|
from .proxy_strategy import (
|
|
ProxyRotationStrategy,
|
|
RoundRobinProxyStrategy,
|
|
)
|
|
from .extraction_strategy import (
|
|
ExtractionStrategy,
|
|
LLMExtractionStrategy,
|
|
CosineStrategy,
|
|
JsonCssExtractionStrategy,
|
|
JsonXPathExtractionStrategy,
|
|
JsonLxmlExtractionStrategy,
|
|
RegexExtractionStrategy
|
|
)
|
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
|
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
|
from .table_extraction import (
|
|
TableExtractionStrategy,
|
|
DefaultTableExtraction,
|
|
NoTableExtraction,
|
|
LLMTableExtraction,
|
|
)
|
|
from .content_filter_strategy import (
|
|
PruningContentFilter,
|
|
BM25ContentFilter,
|
|
LLMContentFilter,
|
|
RelevantContentFilter,
|
|
)
|
|
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
|
|
from .components.crawler_monitor import CrawlerMonitor
|
|
from .link_preview import LinkPreview
|
|
from .async_dispatcher import (
|
|
MemoryAdaptiveDispatcher,
|
|
SemaphoreDispatcher,
|
|
RateLimiter,
|
|
BaseDispatcher,
|
|
)
|
|
from .docker_client import Crawl4aiDockerClient
|
|
from .hub import CrawlerHub
|
|
from .browser_profiler import BrowserProfiler
|
|
from .deep_crawling import (
|
|
DeepCrawlStrategy,
|
|
BFSDeepCrawlStrategy,
|
|
FilterChain,
|
|
URLPatternFilter,
|
|
DomainFilter,
|
|
ContentTypeFilter,
|
|
URLFilter,
|
|
FilterStats,
|
|
SEOFilter,
|
|
KeywordRelevanceScorer,
|
|
URLScorer,
|
|
CompositeScorer,
|
|
DomainAuthorityScorer,
|
|
FreshnessScorer,
|
|
PathDepthScorer,
|
|
BestFirstCrawlingStrategy,
|
|
DFSDeepCrawlStrategy,
|
|
DeepCrawlDecorator,
|
|
)
|
|
# NEW: Import AsyncUrlSeeder
|
|
from .async_url_seeder import AsyncUrlSeeder
|
|
# Adaptive Crawler
|
|
from .adaptive_crawler import (
|
|
AdaptiveCrawler,
|
|
AdaptiveConfig,
|
|
CrawlState,
|
|
CrawlStrategy,
|
|
StatisticalStrategy
|
|
)
|
|
|
|
# C4A Script Language Support
|
|
from .script import (
|
|
compile as c4a_compile,
|
|
validate as c4a_validate,
|
|
compile_file as c4a_compile_file,
|
|
CompilationResult,
|
|
ValidationResult,
|
|
ErrorDetail
|
|
)
|
|
|
|
# Browser Adapters
|
|
from .browser_adapter import (
|
|
BrowserAdapter,
|
|
PlaywrightAdapter,
|
|
UndetectedAdapter
|
|
)
|
|
|
|
from .utils import (
|
|
start_colab_display_server,
|
|
setup_colab_environment
|
|
)
|
|
|
|
# Public API of the crawl4ai package.
# NOTE: every name listed here must be imported above; keep the two in sync.
__all__ = [
    "AsyncLoggerBase",
    "AsyncLogger",
    "AsyncWebCrawler",
    "BrowserProfiler",
    "LLMConfig",
    "GeolocationConfig",
    # NEW: Add SeedingConfig and VirtualScrollConfig
    "SeedingConfig",
    "VirtualScrollConfig",
    # NEW: Add AsyncUrlSeeder
    "AsyncUrlSeeder",
    # Adaptive Crawler
    "AdaptiveCrawler",
    "AdaptiveConfig",
    "CrawlState",
    "CrawlStrategy",
    "StatisticalStrategy",
    "DeepCrawlStrategy",
    "BFSDeepCrawlStrategy",
    "BestFirstCrawlingStrategy",
    "DFSDeepCrawlStrategy",
    "FilterChain",
    "URLPatternFilter",
    "ContentTypeFilter",
    "DomainFilter",
    "FilterStats",
    "URLFilter",
    "SEOFilter",
    "KeywordRelevanceScorer",
    "URLScorer",
    "CompositeScorer",
    "DomainAuthorityScorer",
    "FreshnessScorer",
    "PathDepthScorer",
    "DeepCrawlDecorator",
    "CrawlResult",
    "CrawlerHub",
    "CacheMode",
    "MatchMode",
    "ContentScrapingStrategy",
    "WebScrapingStrategy",
    "LXMLWebScrapingStrategy",
    "BrowserConfig",
    "CrawlerRunConfig",
    "HTTPCrawlerConfig",
    "ExtractionStrategy",
    "LLMExtractionStrategy",
    "CosineStrategy",
    "JsonCssExtractionStrategy",
    "JsonXPathExtractionStrategy",
    "JsonLxmlExtractionStrategy",
    "RegexExtractionStrategy",
    "ChunkingStrategy",
    "RegexChunking",
    "DefaultMarkdownGenerator",
    "TableExtractionStrategy",
    "DefaultTableExtraction",
    "NoTableExtraction",
    # FIX: LLMTableExtraction is imported from .table_extraction above but was
    # missing from __all__, so `from crawl4ai import *` did not export it.
    "LLMTableExtraction",
    "RelevantContentFilter",
    "PruningContentFilter",
    "BM25ContentFilter",
    "LLMContentFilter",
    "BaseDispatcher",
    "MemoryAdaptiveDispatcher",
    "SemaphoreDispatcher",
    "RateLimiter",
    "CrawlerMonitor",
    "LinkPreview",
    "DisplayMode",
    "MarkdownGenerationResult",
    "Crawl4aiDockerClient",
    "ProxyRotationStrategy",
    "RoundRobinProxyStrategy",
    "ProxyConfig",
    "start_colab_display_server",
    "setup_colab_environment",
    # C4A Script additions
    "c4a_compile",
    "c4a_validate",
    "c4a_compile_file",
    "CompilationResult",
    "ValidationResult",
    "ErrorDetail",
    # Browser Adapters
    "BrowserAdapter",
    "PlaywrightAdapter",
    "UndetectedAdapter",
    "LinkPreviewConfig",
]
|
|
|
|
|
|
# def is_sync_version_installed():
|
|
# try:
|
|
# import selenium # noqa
|
|
|
|
# return True
|
|
# except ImportError:
|
|
# return False
|
|
|
|
|
|
# if is_sync_version_installed():
|
|
# try:
|
|
# from .web_crawler import WebCrawler
|
|
|
|
# __all__.append("WebCrawler")
|
|
# except ImportError:
|
|
# print(
|
|
# "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
|
|
# )
|
|
# else:
|
|
# WebCrawler = None
|
|
# # import warnings
|
|
# # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
|
|
|
|
# Disable all Pydantic warnings
|
|
warnings.filterwarnings("ignore", module="pydantic")
|
|
# pydantic_warnings.filter_warnings() |