Files
crawl4ai/crawl4ai/__init__.py
ntohidi a51545c883 feat: 🚀 Introduce revolutionary LLMTableExtraction with intelligent chunking for massive tables
BREAKING CHANGE: Table extraction now uses Strategy Design Pattern

This epic commit introduces a game-changing approach to table extraction in Crawl4AI:

 NEW FEATURES:
- LLMTableExtraction: AI-powered extraction for complex HTML tables with rowspan/colspan
- Smart Chunking: Automatically splits massive tables into optimal chunks at row boundaries
- Parallel Processing: Processes multiple chunks simultaneously for blazing-fast extraction
- Intelligent Merging: Seamlessly combines chunk results into complete tables
- Header Preservation: Each chunk maintains context with original headers
- Auto-retry Logic: Built-in resilience with configurable retry attempts

🏗️ ARCHITECTURE:
- Strategy Design Pattern for pluggable table extraction strategies
- ThreadPoolExecutor for concurrent chunk processing
- Token-based chunking with configurable thresholds
- Handles tables without headers gracefully

 PERFORMANCE:
- Process 1000+ row tables without timeout
- Parallel processing with up to 5 concurrent chunks
- Smart token estimation prevents LLM context overflow
- Optimized for providers like Groq for massive tables

🔧 CONFIGURATION:
- enable_chunking: Auto-handle large tables (default: True)
- chunk_token_threshold: When to split (default: 3000 tokens)
- min_rows_per_chunk: Meaningful chunk sizes (default: 10)
- max_parallel_chunks: Concurrent processing (default: 5)

📚 BACKWARD COMPATIBILITY:
- Existing code continues to work unchanged
- DefaultTableExtraction remains the default strategy
- Progressive enhancement approach

This is the future of web table extraction - handling everything from simple tables to massive, complex data grids with merged cells and nested structures. The chunking is completely transparent to users while providing unprecedented scalability.
2025-08-14 18:21:24 +08:00

226 lines
5.7 KiB
Python

# __init__.py
import warnings
from .async_webcrawler import AsyncWebCrawler, CacheMode
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode
from .content_scraping_strategy import (
ContentScrapingStrategy,
LXMLWebScrapingStrategy,
WebScrapingStrategy, # Backward compatibility alias
)
from .async_logger import (
AsyncLoggerBase,
AsyncLogger,
)
from .proxy_strategy import (
ProxyRotationStrategy,
RoundRobinProxyStrategy,
)
from .extraction_strategy import (
ExtractionStrategy,
LLMExtractionStrategy,
CosineStrategy,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
JsonLxmlExtractionStrategy,
RegexExtractionStrategy
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
from .table_extraction import (
TableExtractionStrategy,
DefaultTableExtraction,
NoTableExtraction,
LLMTableExtraction,
)
from .content_filter_strategy import (
PruningContentFilter,
BM25ContentFilter,
LLMContentFilter,
RelevantContentFilter,
)
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
from .components.crawler_monitor import CrawlerMonitor
from .link_preview import LinkPreview
from .async_dispatcher import (
MemoryAdaptiveDispatcher,
SemaphoreDispatcher,
RateLimiter,
BaseDispatcher,
)
from .docker_client import Crawl4aiDockerClient
from .hub import CrawlerHub
from .browser_profiler import BrowserProfiler
from .deep_crawling import (
DeepCrawlStrategy,
BFSDeepCrawlStrategy,
FilterChain,
URLPatternFilter,
DomainFilter,
ContentTypeFilter,
URLFilter,
FilterStats,
SEOFilter,
KeywordRelevanceScorer,
URLScorer,
CompositeScorer,
DomainAuthorityScorer,
FreshnessScorer,
PathDepthScorer,
BestFirstCrawlingStrategy,
DFSDeepCrawlStrategy,
DeepCrawlDecorator,
)
# NEW: Import AsyncUrlSeeder
from .async_url_seeder import AsyncUrlSeeder
# Adaptive Crawler
from .adaptive_crawler import (
AdaptiveCrawler,
AdaptiveConfig,
CrawlState,
CrawlStrategy,
StatisticalStrategy
)
# C4A Script Language Support
from .script import (
compile as c4a_compile,
validate as c4a_validate,
compile_file as c4a_compile_file,
CompilationResult,
ValidationResult,
ErrorDetail
)
# Browser Adapters
from .browser_adapter import (
BrowserAdapter,
PlaywrightAdapter,
UndetectedAdapter
)
from .utils import (
start_colab_display_server,
setup_colab_environment
)
__all__ = [
"AsyncLoggerBase",
"AsyncLogger",
"AsyncWebCrawler",
"BrowserProfiler",
"LLMConfig",
"GeolocationConfig",
# NEW: Add SeedingConfig and VirtualScrollConfig
"SeedingConfig",
"VirtualScrollConfig",
# NEW: Add AsyncUrlSeeder
"AsyncUrlSeeder",
# Adaptive Crawler
"AdaptiveCrawler",
"AdaptiveConfig",
"CrawlState",
"CrawlStrategy",
"StatisticalStrategy",
"DeepCrawlStrategy",
"BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy",
"DFSDeepCrawlStrategy",
"FilterChain",
"URLPatternFilter",
"ContentTypeFilter",
"DomainFilter",
"FilterStats",
"URLFilter",
"SEOFilter",
"KeywordRelevanceScorer",
"URLScorer",
"CompositeScorer",
"DomainAuthorityScorer",
"FreshnessScorer",
"PathDepthScorer",
"DeepCrawlDecorator",
"CrawlResult",
"CrawlerHub",
"CacheMode",
"MatchMode",
"ContentScrapingStrategy",
"WebScrapingStrategy",
"LXMLWebScrapingStrategy",
"BrowserConfig",
"CrawlerRunConfig",
"HTTPCrawlerConfig",
"ExtractionStrategy",
"LLMExtractionStrategy",
"CosineStrategy",
"JsonCssExtractionStrategy",
"JsonXPathExtractionStrategy",
"JsonLxmlExtractionStrategy",
"RegexExtractionStrategy",
"ChunkingStrategy",
"RegexChunking",
"DefaultMarkdownGenerator",
"TableExtractionStrategy",
"DefaultTableExtraction",
"NoTableExtraction",
"RelevantContentFilter",
"PruningContentFilter",
"BM25ContentFilter",
"LLMContentFilter",
"BaseDispatcher",
"MemoryAdaptiveDispatcher",
"SemaphoreDispatcher",
"RateLimiter",
"CrawlerMonitor",
"LinkPreview",
"DisplayMode",
"MarkdownGenerationResult",
"Crawl4aiDockerClient",
"ProxyRotationStrategy",
"RoundRobinProxyStrategy",
"ProxyConfig",
"start_colab_display_server",
"setup_colab_environment",
# C4A Script additions
"c4a_compile",
"c4a_validate",
"c4a_compile_file",
"CompilationResult",
"ValidationResult",
"ErrorDetail",
# Browser Adapters
"BrowserAdapter",
"PlaywrightAdapter",
"UndetectedAdapter",
"LinkPreviewConfig"
]
# def is_sync_version_installed():
# try:
# import selenium # noqa
# return True
# except ImportError:
# return False
# if is_sync_version_installed():
# try:
# from .web_crawler import WebCrawler
# __all__.append("WebCrawler")
# except ImportError:
# print(
# "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
# )
# else:
# WebCrawler = None
# # import warnings
# # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
# Disable all Pydantic warnings
warnings.filterwarnings("ignore", module="pydantic")
# pydantic_warnings.filter_warnings()