feat: 🚀 Introduce revolutionary LLMTableExtraction with intelligent chunking for massive tables

BREAKING CHANGE: Table extraction now uses Strategy Design Pattern

This epic commit introduces a game-changing approach to table extraction in Crawl4AI:

 NEW FEATURES:
- LLMTableExtraction: AI-powered extraction for complex HTML tables with rowspan/colspan
- Smart Chunking: Automatically splits massive tables into optimal chunks at row boundaries
- Parallel Processing: Processes multiple chunks simultaneously for blazing-fast extraction
- Intelligent Merging: Seamlessly combines chunk results into complete tables
- Header Preservation: Each chunk maintains context with original headers
- Auto-retry Logic: Built-in resilience with configurable retry attempts

🏗️ ARCHITECTURE:
- Strategy Design Pattern for pluggable table extraction strategies
- ThreadPoolExecutor for concurrent chunk processing
- Token-based chunking with configurable thresholds
- Handles tables without headers gracefully

 PERFORMANCE:
- Process 1000+ row tables without timeout
- Parallel processing with up to 5 concurrent chunks
- Smart token estimation prevents LLM context overflow
- Optimized for providers like Groq for massive tables

🔧 CONFIGURATION:
- enable_chunking: Auto-handle large tables (default: True)
- chunk_token_threshold: When to split (default: 3000 tokens)
- min_rows_per_chunk: Meaningful chunk sizes (default: 10)
- max_parallel_chunks: Concurrent processing (default: 5)

📚 BACKWARD COMPATIBILITY:
- Existing code continues to work unchanged
- DefaultTableExtraction remains the default strategy
- Progressive enhancement approach

This is the future of web table extraction - handling everything from simple tables to massive, complex data grids with merged cells and nested structures. The chunking is completely transparent to users while providing unprecedented scalability.
This commit is contained in:
ntohidi
2025-08-14 18:21:24 +08:00
parent 926e41aab8
commit a51545c883
9 changed files with 3536 additions and 116 deletions

View File

@@ -586,117 +586,6 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
return root
def is_data_table(self, table: etree.Element, **kwargs) -> bool:
score = 0
# Check for thead and tbody
has_thead = len(table.xpath(".//thead")) > 0
has_tbody = len(table.xpath(".//tbody")) > 0
if has_thead:
score += 2
if has_tbody:
score += 1
# Check for th elements
th_count = len(table.xpath(".//th"))
if th_count > 0:
score += 2
if has_thead or table.xpath(".//tr[1]/th"):
score += 1
# Check for nested tables
if len(table.xpath(".//table")) > 0:
score -= 3
# Role attribute check
role = table.get("role", "").lower()
if role in {"presentation", "none"}:
score -= 3
# Column consistency
rows = table.xpath(".//tr")
if not rows:
return False
col_counts = [len(row.xpath(".//td|.//th")) for row in rows]
avg_cols = sum(col_counts) / len(col_counts)
variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
if variance < 1:
score += 2
# Caption and summary
if table.xpath(".//caption"):
score += 2
if table.get("summary"):
score += 1
# Text density
total_text = sum(len(''.join(cell.itertext()).strip()) for row in rows for cell in row.xpath(".//td|.//th"))
total_tags = sum(1 for _ in table.iterdescendants())
text_ratio = total_text / (total_tags + 1e-5)
if text_ratio > 20:
score += 3
elif text_ratio > 10:
score += 2
# Data attributes
data_attrs = sum(1 for attr in table.attrib if attr.startswith('data-'))
score += data_attrs * 0.5
# Size check
if avg_cols >= 2 and len(rows) >= 2:
score += 2
threshold = kwargs.get("table_score_threshold", 7)
return score >= threshold
def extract_table_data(self, table: etree.Element) -> dict:
caption = table.xpath(".//caption/text()")
caption = caption[0].strip() if caption else ""
summary = table.get("summary", "").strip()
# Extract headers with colspan handling
headers = []
thead_rows = table.xpath(".//thead/tr")
if thead_rows:
header_cells = thead_rows[0].xpath(".//th")
for cell in header_cells:
text = cell.text_content().strip()
colspan = int(cell.get("colspan", 1))
headers.extend([text] * colspan)
else:
first_row = table.xpath(".//tr[1]")
if first_row:
for cell in first_row[0].xpath(".//th|.//td"):
text = cell.text_content().strip()
colspan = int(cell.get("colspan", 1))
headers.extend([text] * colspan)
# Extract rows with colspan handling
rows = []
for row in table.xpath(".//tr[not(ancestor::thead)]"):
row_data = []
for cell in row.xpath(".//td"):
text = cell.text_content().strip()
colspan = int(cell.get("colspan", 1))
row_data.extend([text] * colspan)
if row_data:
rows.append(row_data)
# Align rows with headers
max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
aligned_rows = []
for row in rows:
aligned = row[:max_columns] + [''] * (max_columns - len(row))
aligned_rows.append(aligned)
if not headers:
headers = [f"Column {i+1}" for i in range(max_columns)]
return {
"headers": headers,
"rows": aligned_rows,
"caption": caption,
"summary": summary,
}
def _scrap(
self,
@@ -839,12 +728,16 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
**kwargs,
)
# Extract tables using the table extraction strategy if provided
if 'table' not in excluded_tags:
tables = body.xpath(".//table")
for table in tables:
if self.is_data_table(table, **kwargs):
table_data = self.extract_table_data(table)
media["tables"].append(table_data)
table_extraction = kwargs.get('table_extraction')
if table_extraction:
# Pass logger to the strategy if it doesn't have one
if not table_extraction.logger:
table_extraction.logger = self.logger
# Extract tables using the strategy
extracted_tables = table_extraction.extract_tables(body, **kwargs)
media["tables"].extend(extracted_tables)
# Handle only_text option
if kwargs.get("only_text", False):