feat: 🚀 Introduce revolutionary LLMTableExtraction with intelligent chunking for massive tables

BREAKING CHANGE: Table extraction now uses Strategy Design Pattern

This epic commit introduces a game-changing approach to table extraction in Crawl4AI:

 NEW FEATURES:
- LLMTableExtraction: AI-powered extraction for complex HTML tables with rowspan/colspan
- Smart Chunking: Automatically splits massive tables into optimal chunks at row boundaries
- Parallel Processing: Processes multiple chunks simultaneously for blazing-fast extraction
- Intelligent Merging: Seamlessly combines chunk results into complete tables
- Header Preservation: Each chunk maintains context with original headers
- Auto-retry Logic: Built-in resilience with configurable retry attempts

🏗️ ARCHITECTURE:
- Strategy Design Pattern for pluggable table extraction strategies
- ThreadPoolExecutor for concurrent chunk processing
- Token-based chunking with configurable thresholds
- Handles tables without headers gracefully

 PERFORMANCE:
- Process 1000+ row tables without timeout
- Parallel processing with up to 5 concurrent chunks
- Smart token estimation prevents LLM context overflow
- Optimized for providers like Groq for massive tables

🔧 CONFIGURATION:
- enable_chunking: Auto-handle large tables (default: True)
- chunk_token_threshold: When to split (default: 3000 tokens)
- min_rows_per_chunk: Meaningful chunk sizes (default: 10)
- max_parallel_chunks: Concurrent processing (default: 5)

📚 BACKWARD COMPATIBILITY:
- Existing code continues to work unchanged
- DefaultTableExtraction remains the default strategy
- Progressive enhancement approach

This is the future of web table extraction - handling everything from simple tables to massive, complex data grids with merged cells and nested structures. The chunking is completely transparent to users while providing unprecedented scalability.
This commit is contained in:
ntohidi
2025-08-14 18:21:24 +08:00
parent 7f48655cf1
commit 9f7fee91a9
9 changed files with 3536 additions and 116 deletions

View File

@@ -29,6 +29,12 @@ from .extraction_strategy import (
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
from .table_extraction import (
TableExtractionStrategy,
DefaultTableExtraction,
NoTableExtraction,
LLMTableExtraction,
)
from .content_filter_strategy import (
PruningContentFilter,
BM25ContentFilter,
@@ -156,6 +162,9 @@ __all__ = [
"ChunkingStrategy",
"RegexChunking",
"DefaultMarkdownGenerator",
"TableExtractionStrategy",
"DefaultTableExtraction",
"NoTableExtraction",
"RelevantContentFilter",
"PruningContentFilter",
"BM25ContentFilter",

View File

@@ -20,6 +20,7 @@ from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy
from .deep_crawling import DeepCrawlStrategy
from .table_extraction import TableExtractionStrategy, DefaultTableExtraction
from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy
@@ -982,6 +983,8 @@ class CrawlerRunConfig():
Default: False.
table_score_threshold (int): Minimum score threshold for processing a table.
Default: 7.
table_extraction (TableExtractionStrategy): Strategy to use for table extraction.
Default: DefaultTableExtraction with table_score_threshold.
# Virtual Scroll Parameters
virtual_scroll_config (VirtualScrollConfig or dict or None): Configuration for handling virtual scroll containers.
@@ -1108,6 +1111,7 @@ class CrawlerRunConfig():
image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
table_score_threshold: int = 7,
table_extraction: TableExtractionStrategy = None,
exclude_external_images: bool = False,
exclude_all_images: bool = False,
# Link and Domain Handling Parameters
@@ -1224,6 +1228,12 @@ class CrawlerRunConfig():
self.exclude_external_images = exclude_external_images
self.exclude_all_images = exclude_all_images
self.table_score_threshold = table_score_threshold
# Table extraction strategy (default to DefaultTableExtraction if not specified)
if table_extraction is None:
self.table_extraction = DefaultTableExtraction(table_score_threshold=table_score_threshold)
else:
self.table_extraction = table_extraction
# Link and Domain Handling Parameters
self.exclude_social_media_domains = (
@@ -1495,6 +1505,7 @@ class CrawlerRunConfig():
"image_score_threshold", IMAGE_SCORE_THRESHOLD
),
table_score_threshold=kwargs.get("table_score_threshold", 7),
table_extraction=kwargs.get("table_extraction", None),
exclude_all_images=kwargs.get("exclude_all_images", False),
exclude_external_images=kwargs.get("exclude_external_images", False),
# Link and Domain Handling Parameters
@@ -1603,6 +1614,7 @@ class CrawlerRunConfig():
"image_description_min_word_threshold": self.image_description_min_word_threshold,
"image_score_threshold": self.image_score_threshold,
"table_score_threshold": self.table_score_threshold,
"table_extraction": self.table_extraction,
"exclude_all_images": self.exclude_all_images,
"exclude_external_images": self.exclude_external_images,
"exclude_social_media_domains": self.exclude_social_media_domains,

View File

@@ -586,117 +586,6 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
return root
def is_data_table(self, table: etree.Element, **kwargs) -> bool:
score = 0
# Check for thead and tbody
has_thead = len(table.xpath(".//thead")) > 0
has_tbody = len(table.xpath(".//tbody")) > 0
if has_thead:
score += 2
if has_tbody:
score += 1
# Check for th elements
th_count = len(table.xpath(".//th"))
if th_count > 0:
score += 2
if has_thead or table.xpath(".//tr[1]/th"):
score += 1
# Check for nested tables
if len(table.xpath(".//table")) > 0:
score -= 3
# Role attribute check
role = table.get("role", "").lower()
if role in {"presentation", "none"}:
score -= 3
# Column consistency
rows = table.xpath(".//tr")
if not rows:
return False
col_counts = [len(row.xpath(".//td|.//th")) for row in rows]
avg_cols = sum(col_counts) / len(col_counts)
variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
if variance < 1:
score += 2
# Caption and summary
if table.xpath(".//caption"):
score += 2
if table.get("summary"):
score += 1
# Text density
total_text = sum(len(''.join(cell.itertext()).strip()) for row in rows for cell in row.xpath(".//td|.//th"))
total_tags = sum(1 for _ in table.iterdescendants())
text_ratio = total_text / (total_tags + 1e-5)
if text_ratio > 20:
score += 3
elif text_ratio > 10:
score += 2
# Data attributes
data_attrs = sum(1 for attr in table.attrib if attr.startswith('data-'))
score += data_attrs * 0.5
# Size check
if avg_cols >= 2 and len(rows) >= 2:
score += 2
threshold = kwargs.get("table_score_threshold", 7)
return score >= threshold
def extract_table_data(self, table: etree.Element) -> dict:
caption = table.xpath(".//caption/text()")
caption = caption[0].strip() if caption else ""
summary = table.get("summary", "").strip()
# Extract headers with colspan handling
headers = []
thead_rows = table.xpath(".//thead/tr")
if thead_rows:
header_cells = thead_rows[0].xpath(".//th")
for cell in header_cells:
text = cell.text_content().strip()
colspan = int(cell.get("colspan", 1))
headers.extend([text] * colspan)
else:
first_row = table.xpath(".//tr[1]")
if first_row:
for cell in first_row[0].xpath(".//th|.//td"):
text = cell.text_content().strip()
colspan = int(cell.get("colspan", 1))
headers.extend([text] * colspan)
# Extract rows with colspan handling
rows = []
for row in table.xpath(".//tr[not(ancestor::thead)]"):
row_data = []
for cell in row.xpath(".//td"):
text = cell.text_content().strip()
colspan = int(cell.get("colspan", 1))
row_data.extend([text] * colspan)
if row_data:
rows.append(row_data)
# Align rows with headers
max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
aligned_rows = []
for row in rows:
aligned = row[:max_columns] + [''] * (max_columns - len(row))
aligned_rows.append(aligned)
if not headers:
headers = [f"Column {i+1}" for i in range(max_columns)]
return {
"headers": headers,
"rows": aligned_rows,
"caption": caption,
"summary": summary,
}
def _scrap(
self,
@@ -839,12 +728,16 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
**kwargs,
)
# Extract tables using the table extraction strategy if provided
if 'table' not in excluded_tags:
tables = body.xpath(".//table")
for table in tables:
if self.is_data_table(table, **kwargs):
table_data = self.extract_table_data(table)
media["tables"].append(table_data)
table_extraction = kwargs.get('table_extraction')
if table_extraction:
# Pass logger to the strategy if it doesn't have one
if not table_extraction.logger:
table_extraction.logger = self.logger
# Extract tables using the strategy
extracted_tables = table_extraction.extract_tables(body, **kwargs)
media["tables"].extend(extracted_tables)
# Handle only_text option
if kwargs.get("only_text", False):

1396
crawl4ai/table_extraction.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,406 @@
#!/usr/bin/env python3
"""
Example demonstrating LLM-based table extraction in Crawl4AI.
This example shows how to use the LLMTableExtraction strategy to extract
complex tables from web pages, including handling rowspan, colspan, and nested tables.
"""
import os
import sys
# Get the grandparent directory
grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(grandparent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
import asyncio
import json
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
LLMConfig,
LLMTableExtraction,
CacheMode
)
import pandas as pd
# Example 1: Basic LLM Table Extraction
async def basic_llm_extraction():
"""Extract tables using LLM with default settings."""
print("\n=== Example 1: Basic LLM Table Extraction ===")
# Configure LLM (using OpenAI GPT-4o-mini for cost efficiency)
llm_config = LLMConfig(
provider="openai/gpt-4.1-mini",
api_token="env:OPENAI_API_KEY", # Uses environment variable
temperature=0.1, # Low temperature for consistency
max_tokens=2000
)
# Create LLM table extraction strategy
table_strategy = LLMTableExtraction(
llm_config=llm_config,
verbose=True
)
# Configure crawler with the strategy
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
table_extraction=table_strategy
)
async with AsyncWebCrawler() as crawler:
# Extract tables from a Wikipedia page
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
config=config
)
if result.success:
print(f"✓ Found {len(result.tables)} tables")
# Display first table
if result.tables:
first_table = result.tables[0]
print(f"\nFirst table:")
print(f" Headers: {first_table['headers'][:5]}...")
print(f" Rows: {len(first_table['rows'])}")
# Convert to pandas DataFrame
df = pd.DataFrame(
first_table['rows'],
columns=first_table['headers']
)
print(f"\nDataFrame shape: {df.shape}")
print(df.head())
else:
print(f"✗ Extraction failed: {result.error}")
# Example 2: Focused Extraction with CSS Selector
async def focused_extraction():
"""Extract tables from specific page sections using CSS selectors."""
print("\n=== Example 2: Focused Extraction with CSS Selector ===")
# HTML with multiple tables
test_html = """
<html>
<body>
<div class="sidebar">
<table role="presentation">
<tr><td>Navigation</td></tr>
</table>
</div>
<div class="main-content">
<table id="data-table">
<caption>Quarterly Sales Report</caption>
<thead>
<tr>
<th rowspan="2">Product</th>
<th colspan="3">Q1 2024</th>
</tr>
<tr>
<th>Jan</th>
<th>Feb</th>
<th>Mar</th>
</tr>
</thead>
<tbody>
<tr>
<td>Widget A</td>
<td>100</td>
<td>120</td>
<td>140</td>
</tr>
<tr>
<td>Widget B</td>
<td>200</td>
<td>180</td>
<td>220</td>
</tr>
</tbody>
</table>
</div>
</body>
</html>
"""
llm_config = LLMConfig(
provider="openai/gpt-4.1-mini",
api_token="env:OPENAI_API_KEY"
)
# Focus only on main content area
table_strategy = LLMTableExtraction(
llm_config=llm_config,
css_selector=".main-content", # Only extract from main content
verbose=True
)
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
table_extraction=table_strategy
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=f"raw:{test_html}",
config=config
)
if result.success and result.tables:
table = result.tables[0]
print(f"✓ Extracted table: {table.get('caption', 'No caption')}")
print(f" Headers: {table['headers']}")
print(f" Metadata: {table['metadata']}")
# The LLM should have handled the rowspan/colspan correctly
print("\nProcessed data (rowspan/colspan handled):")
for i, row in enumerate(table['rows']):
print(f" Row {i+1}: {row}")
# Example 3: Comparing with Default Extraction
async def compare_strategies():
"""Compare LLM extraction with default extraction on complex tables."""
print("\n=== Example 3: Comparing LLM vs Default Extraction ===")
# Complex table with nested structure
complex_html = """
<html>
<body>
<table>
<tr>
<th rowspan="3">Category</th>
<th colspan="2">2023</th>
<th colspan="2">2024</th>
</tr>
<tr>
<th>H1</th>
<th>H2</th>
<th>H1</th>
<th>H2</th>
</tr>
<tr>
<td colspan="4">All values in millions</td>
</tr>
<tr>
<td>Revenue</td>
<td>100</td>
<td>120</td>
<td>130</td>
<td>145</td>
</tr>
<tr>
<td>Profit</td>
<td>20</td>
<td>25</td>
<td>28</td>
<td>32</td>
</tr>
</table>
</body>
</html>
"""
async with AsyncWebCrawler() as crawler:
# Test with default extraction
from crawl4ai import DefaultTableExtraction
default_strategy = DefaultTableExtraction(
table_score_threshold=3,
verbose=True
)
config_default = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
table_extraction=default_strategy
)
result_default = await crawler.arun(
url=f"raw:{complex_html}",
config=config_default
)
# Test with LLM extraction
llm_strategy = LLMTableExtraction(
llm_config=LLMConfig(
provider="openai/gpt-4.1-mini",
api_token="env:OPENAI_API_KEY"
),
verbose=True
)
config_llm = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
table_extraction=llm_strategy
)
result_llm = await crawler.arun(
url=f"raw:{complex_html}",
config=config_llm
)
# Compare results
print("\nDefault Extraction:")
if result_default.tables:
table = result_default.tables[0]
print(f" Headers: {table.get('headers', [])}")
print(f" Rows: {len(table.get('rows', []))}")
for i, row in enumerate(table.get('rows', [])[:3]):
print(f" Row {i+1}: {row}")
print("\nLLM Extraction (handles complex structure better):")
if result_llm.tables:
table = result_llm.tables[0]
print(f" Headers: {table.get('headers', [])}")
print(f" Rows: {len(table.get('rows', []))}")
for i, row in enumerate(table.get('rows', [])):
print(f" Row {i+1}: {row}")
print(f" Metadata: {table.get('metadata', {})}")
# Example 4: Using Local Models (Ollama)
async def local_model_extraction():
"""Extract tables using local Ollama models for privacy/cost."""
print("\n=== Example 4: Local Model Extraction with Ollama ===")
# Configure for local Ollama
llm_config = LLMConfig(
provider="ollama/llama3.3",
api_token=None, # Not needed for Ollama
base_url="http://localhost:11434",
temperature=0.1
)
table_strategy = LLMTableExtraction(
llm_config=llm_config,
verbose=True
)
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
table_extraction=table_strategy
)
# Simple test HTML
test_html = """
<table>
<thead>
<tr><th>Product</th><th>Price</th><th>Stock</th></tr>
</thead>
<tbody>
<tr><td>Apple</td><td>$1.50</td><td>100</td></tr>
<tr><td>Banana</td><td>$0.50</td><td>200</td></tr>
<tr><td>Orange</td><td>$2.00</td><td>50</td></tr>
</tbody>
</table>
"""
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=f"raw:{test_html}",
config=config
)
if result.success and result.tables:
table = result.tables[0]
print(f"✓ Extracted with local model:")
# Create DataFrame
df = pd.DataFrame(table['rows'], columns=table['headers'])
print(df.to_string())
else:
print("✗ Make sure Ollama is running locally with llama3.3 model")
# Example 5: Batch Processing Multiple Pages
async def batch_extraction():
"""Extract tables from multiple pages efficiently."""
print("\n=== Example 5: Batch Table Extraction ===")
urls = [
"https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)",
"https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
"https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index"
]
llm_config = LLMConfig(
provider="openai/gpt-4.1-mini",
api_token="env:OPENAI_API_KEY",
temperature=0.1,
max_tokens=1500
)
table_strategy = LLMTableExtraction(
llm_config=llm_config,
css_selector="table.wikitable", # Wikipedia data tables
verbose=False
)
config = CrawlerRunConfig(
table_extraction=table_strategy,
cache_mode=CacheMode.BYPASS
)
all_tables = []
async with AsyncWebCrawler() as crawler:
for url in urls:
print(f"\nProcessing: {url.split('/')[-1][:50]}...")
result = await crawler.arun(url=url, config=config)
if result.success and result.tables:
print(f" ✓ Found {len(result.tables)} tables")
# Store first table from each page
if result.tables:
all_tables.append({
'url': url,
'table': result.tables[0]
})
# Summary
print(f"\n=== Summary ===")
print(f"Extracted {len(all_tables)} tables from {len(urls)} pages")
for item in all_tables:
table = item['table']
print(f"\nFrom {item['url'].split('/')[-1][:30]}:")
print(f" Columns: {len(table['headers'])}")
print(f" Rows: {len(table['rows'])}")
async def main():
"""Run all examples."""
print("=" * 60)
print("LLM TABLE EXTRACTION EXAMPLES")
print("=" * 60)
# Run examples (comment out ones you don't want to run)
# Basic extraction
await basic_llm_extraction()
# # Focused extraction with CSS
# await focused_extraction()
# # Compare strategies
# await compare_strategies()
# # Local model (requires Ollama)
# # await local_model_extraction()
# # Batch processing
# await batch_extraction()
print("\n" + "=" * 60)
print("ALL EXAMPLES COMPLETED")
print("=" * 60)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,276 @@
"""
Example: Using Table Extraction Strategies in Crawl4AI
This example demonstrates how to use different table extraction strategies
to extract tables from web pages.
"""
import asyncio
import pandas as pd
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
CacheMode,
DefaultTableExtraction,
NoTableExtraction,
TableExtractionStrategy
)
from typing import Dict, List, Any
async def example_default_extraction():
"""Example 1: Using default table extraction (automatic)."""
print("\n" + "="*50)
print("Example 1: Default Table Extraction")
print("="*50)
async with AsyncWebCrawler() as crawler:
# No need to specify table_extraction - uses DefaultTableExtraction automatically
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
table_score_threshold=7 # Adjust sensitivity (default: 7)
)
result = await crawler.arun(
"https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
config=config
)
if result.success and result.tables:
print(f"Found {len(result.tables)} tables")
# Convert first table to pandas DataFrame
if result.tables:
first_table = result.tables[0]
df = pd.DataFrame(
first_table['rows'],
columns=first_table['headers'] if first_table['headers'] else None
)
print(f"\nFirst table preview:")
print(df.head())
print(f"Shape: {df.shape}")
async def example_custom_configuration():
"""Example 2: Custom table extraction configuration."""
print("\n" + "="*50)
print("Example 2: Custom Table Configuration")
print("="*50)
async with AsyncWebCrawler() as crawler:
# Create custom extraction strategy with specific settings
table_strategy = DefaultTableExtraction(
table_score_threshold=5, # Lower threshold for more permissive detection
min_rows=3, # Only extract tables with at least 3 rows
min_cols=2, # Only extract tables with at least 2 columns
verbose=True
)
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
table_extraction=table_strategy,
# Target specific tables using CSS selector
css_selector="div.main-content"
)
result = await crawler.arun(
"https://example.com/data",
config=config
)
if result.success:
print(f"Found {len(result.tables)} tables matching criteria")
for i, table in enumerate(result.tables):
print(f"\nTable {i+1}:")
print(f" Caption: {table.get('caption', 'No caption')}")
print(f" Size: {table['metadata']['row_count']} rows × {table['metadata']['column_count']} columns")
print(f" Has headers: {table['metadata']['has_headers']}")
async def example_disable_extraction():
"""Example 3: Disable table extraction when not needed."""
print("\n" + "="*50)
print("Example 3: Disable Table Extraction")
print("="*50)
async with AsyncWebCrawler() as crawler:
# Use NoTableExtraction to skip table processing entirely
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
table_extraction=NoTableExtraction() # No tables will be extracted
)
result = await crawler.arun(
"https://example.com",
config=config
)
if result.success:
print(f"Tables extracted: {len(result.tables)} (should be 0)")
print("Table extraction disabled - better performance for non-table content")
class FinancialTableExtraction(TableExtractionStrategy):
"""
Custom strategy for extracting financial tables with specific requirements.
"""
def __init__(self, currency_symbols=None, **kwargs):
super().__init__(**kwargs)
self.currency_symbols = currency_symbols or ['$', '', '£', '¥']
def extract_tables(self, element, **kwargs):
"""Extract only tables that appear to contain financial data."""
tables_data = []
for table in element.xpath(".//table"):
# Check if table contains currency symbols
table_text = ''.join(table.itertext())
has_currency = any(symbol in table_text for symbol in self.currency_symbols)
if not has_currency:
continue
# Extract using base logic (could reuse DefaultTableExtraction logic)
headers = []
rows = []
# Extract headers
for th in table.xpath(".//thead//th | .//tr[1]//th"):
headers.append(th.text_content().strip())
# Extract rows
for tr in table.xpath(".//tbody//tr | .//tr[position()>1]"):
row = []
for td in tr.xpath(".//td"):
cell_text = td.text_content().strip()
# Clean currency values
for symbol in self.currency_symbols:
cell_text = cell_text.replace(symbol, '')
row.append(cell_text)
if row:
rows.append(row)
if headers or rows:
tables_data.append({
"headers": headers,
"rows": rows,
"caption": table.xpath(".//caption/text()")[0] if table.xpath(".//caption") else "",
"summary": table.get("summary", ""),
"metadata": {
"type": "financial",
"has_currency": True,
"row_count": len(rows),
"column_count": len(headers) if headers else len(rows[0]) if rows else 0
}
})
return tables_data
async def example_custom_strategy():
"""Example 4: Custom table extraction strategy."""
print("\n" + "="*50)
print("Example 4: Custom Financial Table Strategy")
print("="*50)
async with AsyncWebCrawler() as crawler:
# Use custom strategy for financial tables
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
table_extraction=FinancialTableExtraction(
currency_symbols=['$', ''],
verbose=True
)
)
result = await crawler.arun(
"https://finance.yahoo.com/",
config=config
)
if result.success:
print(f"Found {len(result.tables)} financial tables")
for table in result.tables:
if table['metadata'].get('type') == 'financial':
print(f" ✓ Financial table with {table['metadata']['row_count']} rows")
async def example_combined_extraction():
"""Example 5: Combine table extraction with other strategies."""
print("\n" + "="*50)
print("Example 5: Combined Extraction Strategies")
print("="*50)
from crawl4ai import LLMExtractionStrategy, LLMConfig
async with AsyncWebCrawler() as crawler:
# Define schema for structured extraction
schema = {
"type": "object",
"properties": {
"page_title": {"type": "string"},
"main_topic": {"type": "string"},
"key_figures": {
"type": "array",
"items": {"type": "string"}
}
}
}
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
# Table extraction
table_extraction=DefaultTableExtraction(
table_score_threshold=6,
min_rows=2
),
# LLM extraction for structured data
extraction_strategy=LLMExtractionStrategy(
llm_config=LLMConfig(provider="openai"),
schema=schema
)
)
result = await crawler.arun(
"https://en.wikipedia.org/wiki/Economy_of_the_United_States",
config=config
)
if result.success:
print(f"Tables found: {len(result.tables)}")
# Tables are in result.tables
if result.tables:
print(f"First table has {len(result.tables[0]['rows'])} rows")
# Structured data is in result.extracted_content
if result.extracted_content:
import json
structured_data = json.loads(result.extracted_content)
print(f"Page title: {structured_data.get('page_title', 'N/A')}")
print(f"Main topic: {structured_data.get('main_topic', 'N/A')}")
async def main():
"""Run all examples."""
print("\n" + "="*60)
print("CRAWL4AI TABLE EXTRACTION EXAMPLES")
print("="*60)
# Run examples
await example_default_extraction()
await example_custom_configuration()
await example_disable_extraction()
await example_custom_strategy()
# await example_combined_extraction() # Requires OpenAI API key
print("\n" + "="*60)
print("EXAMPLES COMPLETED")
print("="*60)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,807 @@
# Table Extraction Strategies
## Overview
**New in v0.7.3+**: Table extraction now follows the **Strategy Design Pattern**, providing unprecedented flexibility and power for handling different table structures. Don't worry - **your existing code still works!** We maintain full backward compatibility while offering new capabilities.
### What's Changed?
- **Architecture**: Table extraction now uses pluggable strategies
- **Backward Compatible**: Your existing code with `table_score_threshold` continues to work
- **More Power**: Choose from multiple strategies or create your own
- **Same Default Behavior**: By default, uses `DefaultTableExtraction` (same as before)
### Key Points
**Old code still works** - No breaking changes
**Same default behavior** - Uses the proven extraction algorithm
**New capabilities** - Add LLM extraction or custom strategies when needed
**Strategy pattern** - Clean, extensible architecture
## Quick Start
### The Simplest Way (Works Like Before)
If you're already using Crawl4AI, nothing changes:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
async def extract_tables():
async with AsyncWebCrawler() as crawler:
# This works exactly like before - uses DefaultTableExtraction internally
result = await crawler.arun("https://example.com/data")
# Tables are automatically extracted and available in result.tables
for table in result.tables:
print(f"Table with {len(table['rows'])} rows and {len(table['headers'])} columns")
print(f"Headers: {table['headers']}")
print(f"First row: {table['rows'][0] if table['rows'] else 'No data'}")
asyncio.run(extract_tables())
```
### Using the Old Configuration (Still Supported)
Your existing code with `table_score_threshold` continues to work:
```python
# This old approach STILL WORKS - we maintain backward compatibility
config = CrawlerRunConfig(
table_score_threshold=7 # Internally creates DefaultTableExtraction(table_score_threshold=7)
)
result = await crawler.arun(url, config)
```
## Table Extraction Strategies
### Understanding the Strategy Pattern
The strategy pattern allows you to choose different table extraction algorithms at runtime. Think of it as having different tools in a toolbox - you pick the right one for the job:
- **No explicit strategy?** → Uses `DefaultTableExtraction` automatically (same as v0.7.2 and earlier)
- **Need complex table handling?** → Choose `LLMTableExtraction` (costs money, use sparingly)
- **Want to disable tables?** → Use `NoTableExtraction`
- **Have special requirements?** → Create a custom strategy
### Available Strategies
| Strategy | Description | Use Case | Cost | When to Use |
|----------|-------------|----------|------|-------------|
| `DefaultTableExtraction` | **RECOMMENDED**: Same algorithm as before v0.7.3 | General purpose (default) | Free | **Use this first - handles 95% of cases** |
| `LLMTableExtraction` | AI-powered extraction for complex tables | Tables with complex rowspan/colspan | **$$$ Per API call** | Only when DefaultTableExtraction fails |
| `NoTableExtraction` | Disables table extraction | When tables aren't needed | Free | For text-only extraction |
| Custom strategies | User-defined extraction logic | Specialized requirements | Free | Domain-specific needs |
> **⚠️ CRITICAL COST WARNING for LLMTableExtraction**:
>
> **DO NOT USE `LLMTableExtraction` UNLESS ABSOLUTELY NECESSARY!**
>
> - **Always try `DefaultTableExtraction` first** - It's free and handles most tables perfectly
> - LLM extraction **costs money** with every API call
> - For large tables (100+ rows), LLM extraction can be **very slow**
> - **For large tables**: If you must use LLM, choose fast providers:
> - ✅ **Groq** (fastest inference)
> - ✅ **Cerebras** (optimized for speed)
> - ⚠️ Avoid: OpenAI, Anthropic for large tables (slower)
>
> **🚧 WORK IN PROGRESS**:
> We are actively developing an **advanced non-LLM algorithm** that will handle complex table structures (rowspan, colspan, nested tables) for **FREE**. This will replace the need for costly LLM extraction in most cases. Coming soon!
### DefaultTableExtraction
The default strategy uses a sophisticated scoring system to identify data tables:
```python
from crawl4ai import DefaultTableExtraction, CrawlerRunConfig
# Customize the default extraction
table_strategy = DefaultTableExtraction(
table_score_threshold=7, # Scoring threshold (default: 7)
min_rows=2, # Minimum rows required
min_cols=2, # Minimum columns required
verbose=True # Enable detailed logging
)
config = CrawlerRunConfig(
table_extraction=table_strategy
)
```
#### Scoring System
The scoring system evaluates multiple factors:
| Factor | Score Impact | Description |
|--------|--------------|-------------|
| Has `<thead>` | +2 | Semantic table structure |
| Has `<tbody>` | +1 | Organized table body |
| Has `<th>` elements | +2 | Header cells present |
| Headers in correct position | +1 | Proper semantic structure |
| Consistent column count | +2 | Regular data structure |
| Has caption | +2 | Descriptive caption |
| Has summary | +1 | Summary attribute |
| High text density | +2 to +3 | Content-rich cells |
| Data attributes | +0.5 each | Data-* attributes |
| Nested tables | -3 | Often indicates layout |
| Role="presentation" | -3 | Explicitly non-data |
| Too few rows | -2 | Insufficient data |
### LLMTableExtraction (Use Sparingly!)
**⚠️ WARNING**: Only use this when `DefaultTableExtraction` fails with complex tables!
LLMTableExtraction uses AI to understand complex table structures that traditional parsers struggle with. It automatically handles large tables through intelligent chunking and parallel processing:
```python
from crawl4ai import LLMTableExtraction, LLMConfig, CrawlerRunConfig
# Configure LLM (costs money per call!)
llm_config = LLMConfig(
provider="groq/llama-3.3-70b-versatile", # Fast provider for large tables
api_token="your_api_key",
temperature=0.1
)
# Create LLM extraction strategy with smart chunking
table_strategy = LLMTableExtraction(
llm_config=llm_config,
max_tries=3, # Retry up to 3 times if extraction fails
css_selector="table", # Optional: focus on specific tables
enable_chunking=True, # Automatically chunk large tables (default: True)
chunk_token_threshold=3000, # Split tables larger than this (default: 3000 tokens)
min_rows_per_chunk=10, # Minimum rows per chunk (default: 10)
max_parallel_chunks=5, # Process up to 5 chunks in parallel (default: 5)
verbose=True
)
config = CrawlerRunConfig(
table_extraction=table_strategy
)
result = await crawler.arun(url, config)
```
#### When to Use LLMTableExtraction
**Use ONLY when**:
- Tables have complex merged cells (rowspan/colspan) that break DefaultTableExtraction
- Nested tables that need semantic understanding
- Tables with irregular structures
- You've tried DefaultTableExtraction and it failed
**Never use when**:
- DefaultTableExtraction works (99% of cases)
- Tables are simple or well-structured
- You're processing many pages (costs add up!)
- Tables have 100+ rows (very slow)
#### How Smart Chunking Works
LLMTableExtraction automatically handles large tables through intelligent chunking:
1. **Automatic Detection**: Tables exceeding the token threshold are automatically split
2. **Smart Splitting**: Chunks are created at row boundaries, preserving table structure
3. **Header Preservation**: Each chunk includes the original headers for context
4. **Parallel Processing**: Multiple chunks are processed simultaneously for speed
5. **Intelligent Merging**: Results are merged back into a single, complete table
**Chunking Parameters**:
- `enable_chunking` (default: `True`): Automatically handle large tables
- `chunk_token_threshold` (default: `3000`): When to split tables
- `min_rows_per_chunk` (default: `10`): Ensures meaningful chunk sizes
- `max_parallel_chunks` (default: `5`): Concurrent processing for speed
The chunking is completely transparent - you get the same output format whether the table was processed in one piece or multiple chunks.
#### Performance Optimization for LLMTableExtraction
**Provider Recommendations by Table Size**:
| Table Size | Recommended Providers | Why |
|------------|----------------------|-----|
| Small (<50 rows) | Any provider | Fast enough |
| Medium (50-200 rows) | Groq, Cerebras | Optimized inference |
| Large (200+ rows) | **Groq** (best), Cerebras | Fastest inference + automatic chunking |
| Very Large (500+ rows) | Groq with chunking | Parallel processing keeps it fast |
### NoTableExtraction
Disable table extraction for better performance when tables aren't needed:
```python
from crawl4ai import NoTableExtraction, CrawlerRunConfig
config = CrawlerRunConfig(
table_extraction=NoTableExtraction()
)
# Tables won't be extracted, improving performance
result = await crawler.arun(url, config)
assert len(result.tables) == 0
```
## Extracted Table Structure
Each extracted table contains:
```python
{
"headers": ["Column 1", "Column 2", ...], # Column headers
"rows": [ # Data rows
["Row 1 Col 1", "Row 1 Col 2", ...],
["Row 2 Col 1", "Row 2 Col 2", ...],
],
"caption": "Table Caption", # If present
"summary": "Table Summary", # If present
"metadata": {
"row_count": 10, # Number of rows
"column_count": 3, # Number of columns
"has_headers": True, # Headers detected
"has_caption": True, # Caption exists
"has_summary": False, # Summary exists
"id": "data-table-1", # Table ID if present
"class": "financial-data" # Table class if present
}
}
```
## Configuration Options
### Basic Configuration
```python
config = CrawlerRunConfig(
# Table extraction settings
table_score_threshold=7, # Default threshold (backward compatible)
table_extraction=strategy, # Optional: custom strategy
# Filter what to process
css_selector="main", # Focus on specific area
excluded_tags=["nav", "aside"] # Exclude page sections
)
```
### Advanced Configuration
```python
from crawl4ai import DefaultTableExtraction, CrawlerRunConfig
# Fine-tuned extraction
strategy = DefaultTableExtraction(
table_score_threshold=5, # Lower = more permissive
min_rows=3, # Require at least 3 rows
min_cols=2, # Require at least 2 columns
verbose=True # Detailed logging
)
config = CrawlerRunConfig(
table_extraction=strategy,
css_selector="article.content", # Target specific content
exclude_domains=["ads.com"], # Exclude ad domains
cache_mode=CacheMode.BYPASS # Fresh extraction
)
```
## Working with Extracted Tables
### Convert to Pandas DataFrame
```python
import pandas as pd
async def tables_to_dataframes(url):
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url)
dataframes = []
for table_data in result.tables:
# Create DataFrame
if table_data['headers']:
df = pd.DataFrame(
table_data['rows'],
columns=table_data['headers']
)
else:
df = pd.DataFrame(table_data['rows'])
# Add metadata as DataFrame attributes
df.attrs['caption'] = table_data.get('caption', '')
df.attrs['metadata'] = table_data.get('metadata', {})
dataframes.append(df)
return dataframes
```
### Filter Tables by Criteria
```python
async def extract_large_tables(url):
async with AsyncWebCrawler() as crawler:
# Configure minimum size requirements
strategy = DefaultTableExtraction(
min_rows=10,
min_cols=3,
table_score_threshold=6
)
config = CrawlerRunConfig(
table_extraction=strategy
)
result = await crawler.arun(url, config)
# Further filter results
large_tables = [
table for table in result.tables
if table['metadata']['row_count'] > 10
and table['metadata']['column_count'] > 3
]
return large_tables
```
### Export Tables to Different Formats
```python
import json
import csv
async def export_tables(url):
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url)
for i, table in enumerate(result.tables):
# Export as JSON
with open(f'table_{i}.json', 'w') as f:
json.dump(table, f, indent=2)
# Export as CSV
with open(f'table_{i}.csv', 'w', newline='') as f:
writer = csv.writer(f)
if table['headers']:
writer.writerow(table['headers'])
writer.writerows(table['rows'])
# Export as Markdown
with open(f'table_{i}.md', 'w') as f:
# Write headers
if table['headers']:
f.write('| ' + ' | '.join(table['headers']) + ' |\n')
f.write('|' + '---|' * len(table['headers']) + '\n')
# Write rows
for row in table['rows']:
f.write('| ' + ' | '.join(str(cell) for cell in row) + ' |\n')
```
## Creating Custom Strategies
Extend `TableExtractionStrategy` to create custom extraction logic:
### Example: Financial Table Extractor
```python
from crawl4ai import TableExtractionStrategy
from typing import List, Dict, Any
import re
class FinancialTableExtractor(TableExtractionStrategy):
"""Extract tables containing financial data."""
def __init__(self, currency_symbols=None, require_numbers=True, **kwargs):
super().__init__(**kwargs)
self.currency_symbols = currency_symbols or ['$', '', '£', '¥']
self.require_numbers = require_numbers
self.number_pattern = re.compile(r'\d+[,.]?\d*')
def extract_tables(self, element, **kwargs):
tables_data = []
for table in element.xpath(".//table"):
# Check if table contains financial indicators
table_text = ''.join(table.itertext())
# Must contain currency symbols
has_currency = any(sym in table_text for sym in self.currency_symbols)
if not has_currency:
continue
# Must contain numbers if required
if self.require_numbers:
numbers = self.number_pattern.findall(table_text)
if len(numbers) < 3: # Arbitrary minimum
continue
# Extract the table data
table_data = self._extract_financial_data(table)
if table_data:
tables_data.append(table_data)
return tables_data
def _extract_financial_data(self, table):
"""Extract and clean financial data from table."""
headers = []
rows = []
# Extract headers
for th in table.xpath(".//thead//th | .//tr[1]//th"):
headers.append(th.text_content().strip())
# Extract and clean rows
for tr in table.xpath(".//tbody//tr | .//tr[position()>1]"):
row = []
for td in tr.xpath(".//td"):
text = td.text_content().strip()
# Clean currency formatting
text = re.sub(r'[$€£¥,]', '', text)
row.append(text)
if row:
rows.append(row)
return {
"headers": headers,
"rows": rows,
"caption": self._get_caption(table),
"summary": table.get("summary", ""),
"metadata": {
"type": "financial",
"row_count": len(rows),
"column_count": len(headers) or len(rows[0]) if rows else 0
}
}
def _get_caption(self, table):
caption = table.xpath(".//caption/text()")
return caption[0].strip() if caption else ""
# Usage
strategy = FinancialTableExtractor(
currency_symbols=['$', 'EUR'],
require_numbers=True
)
config = CrawlerRunConfig(
table_extraction=strategy
)
```
### Example: Specific Table Extractor
```python
class SpecificTableExtractor(TableExtractionStrategy):
"""Extract only tables matching specific criteria."""
def __init__(self,
required_headers=None,
id_pattern=None,
class_pattern=None,
**kwargs):
super().__init__(**kwargs)
self.required_headers = required_headers or []
self.id_pattern = id_pattern
self.class_pattern = class_pattern
def extract_tables(self, element, **kwargs):
tables_data = []
for table in element.xpath(".//table"):
# Check ID pattern
if self.id_pattern:
table_id = table.get('id', '')
if not re.match(self.id_pattern, table_id):
continue
# Check class pattern
if self.class_pattern:
table_class = table.get('class', '')
if not re.match(self.class_pattern, table_class):
continue
# Extract headers to check requirements
headers = self._extract_headers(table)
# Check if required headers are present
if self.required_headers:
if not all(req in headers for req in self.required_headers):
continue
# Extract full table data
table_data = self._extract_table_data(table, headers)
tables_data.append(table_data)
return tables_data
```
## Combining with Other Strategies
Table extraction works seamlessly with other Crawl4AI strategies:
```python
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
DefaultTableExtraction,
LLMExtractionStrategy,
JsonCssExtractionStrategy
)
async def combined_extraction(url):
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
# Table extraction
table_extraction=DefaultTableExtraction(
table_score_threshold=6,
min_rows=2
),
# CSS-based extraction for specific elements
extraction_strategy=JsonCssExtractionStrategy({
"title": "h1",
"summary": "p.summary",
"date": "time"
}),
# Focus on main content
css_selector="main.content"
)
result = await crawler.arun(url, config)
# Access different extraction results
tables = result.tables # Table data
structured = json.loads(result.extracted_content) # CSS extraction
return {
"tables": tables,
"structured_data": structured,
"markdown": result.markdown
}
```
## Performance Considerations
### Optimization Tips
1. **Disable when not needed**: Use `NoTableExtraction` if tables aren't required
2. **Target specific areas**: Use `css_selector` to limit processing scope
3. **Set minimum thresholds**: Filter out small/irrelevant tables early
4. **Cache results**: Use appropriate cache modes for repeated extractions
```python
# Optimized configuration for large pages
config = CrawlerRunConfig(
# Only process main content area
css_selector="article.main-content",
# Exclude navigation and sidebars
excluded_tags=["nav", "aside", "footer"],
# Higher threshold for stricter filtering
table_extraction=DefaultTableExtraction(
table_score_threshold=8,
min_rows=5,
min_cols=3
),
# Enable caching for repeated access
cache_mode=CacheMode.ENABLED
)
```
## Migration Guide
### Important: Your Code Still Works!
**No changes required!** The transition to the strategy pattern is **fully backward compatible**.
### How It Works Internally
#### v0.7.2 and Earlier
```python
# Old way - directly passing table_score_threshold
config = CrawlerRunConfig(
table_score_threshold=7
)
# Internally: No strategy pattern, direct implementation
```
#### v0.7.3+ (Current)
```python
# Old way STILL WORKS - we handle it internally
config = CrawlerRunConfig(
table_score_threshold=7
)
# Internally: Automatically creates DefaultTableExtraction(table_score_threshold=7)
```
### Taking Advantage of New Features
While your old code works, you can now use the strategy pattern for more control:
```python
# Option 1: Keep using the old way (perfectly fine!)
config = CrawlerRunConfig(
table_score_threshold=7 # Still supported
)
# Option 2: Use the new strategy pattern (more flexibility)
from crawl4ai import DefaultTableExtraction
strategy = DefaultTableExtraction(
table_score_threshold=7,
min_rows=2, # New capability!
min_cols=2 # New capability!
)
config = CrawlerRunConfig(
table_extraction=strategy
)
# Option 3: Use advanced strategies when needed
from crawl4ai import LLMTableExtraction, LLMConfig
# Only for complex tables that DefaultTableExtraction can't handle
# Automatically handles large tables with smart chunking
llm_strategy = LLMTableExtraction(
llm_config=LLMConfig(
provider="groq/llama-3.3-70b-versatile",
api_token="your_key"
),
max_tries=3,
enable_chunking=True, # Automatically chunk large tables
chunk_token_threshold=3000, # Chunk when exceeding 3000 tokens
max_parallel_chunks=5 # Process up to 5 chunks in parallel
)
config = CrawlerRunConfig(
table_extraction=llm_strategy # Advanced extraction with automatic chunking
)
```
### Summary
-**No breaking changes** - Old code works as-is
-**Same defaults** - DefaultTableExtraction is automatically used
-**Gradual adoption** - Use new features when you need them
-**Full compatibility** - result.tables structure unchanged
## Best Practices
### 1. Choose the Right Strategy (Cost-Conscious Approach)
**Decision Flow**:
```
1. Do you need tables?
→ No: Use NoTableExtraction
→ Yes: Continue to #2
2. Try DefaultTableExtraction first (FREE)
→ Works? Done! ✅
→ Fails? Continue to #3
3. Is the table critical and complex?
→ No: Accept DefaultTableExtraction results
→ Yes: Continue to #4
4. Use LLMTableExtraction (COSTS MONEY)
→ Small table (<50 rows): Any LLM provider
→ Large table (50+ rows): Use Groq or Cerebras
→ Very large (500+ rows): Reconsider - maybe chunk the page
```
**Strategy Selection Guide**:
- **DefaultTableExtraction**: Use for 99% of cases - it's free and effective
- **LLMTableExtraction**: Only for complex tables with merged cells that break DefaultTableExtraction
- **NoTableExtraction**: When you only need text/markdown content
- **Custom Strategy**: For specialized requirements (financial, scientific, etc.)
### 2. Validate Extracted Data
```python
def validate_table(table):
"""Validate table data quality."""
# Check structure
if not table.get('rows'):
return False
# Check consistency
if table.get('headers'):
expected_cols = len(table['headers'])
for row in table['rows']:
if len(row) != expected_cols:
return False
# Check minimum content
total_cells = sum(len(row) for row in table['rows'])
non_empty = sum(1 for row in table['rows']
for cell in row if cell.strip())
if non_empty / total_cells < 0.5: # Less than 50% non-empty
return False
return True
# Filter valid tables
valid_tables = [t for t in result.tables if validate_table(t)]
```
### 3. Handle Edge Cases
```python
async def robust_table_extraction(url):
"""Extract tables with error handling."""
async with AsyncWebCrawler() as crawler:
try:
config = CrawlerRunConfig(
table_extraction=DefaultTableExtraction(
table_score_threshold=6,
verbose=True
)
)
result = await crawler.arun(url, config)
if not result.success:
print(f"Crawl failed: {result.error}")
return []
# Process tables safely
processed_tables = []
for table in result.tables:
try:
# Validate and process
if validate_table(table):
processed_tables.append(table)
except Exception as e:
print(f"Error processing table: {e}")
continue
return processed_tables
except Exception as e:
print(f"Extraction error: {e}")
return []
```
## Troubleshooting
### Common Issues and Solutions
| Issue | Cause | Solution |
|-------|-------|----------|
| No tables extracted | Score too high | Lower `table_score_threshold` |
| Layout tables included | Score too low | Increase `table_score_threshold` |
| Missing tables | CSS selector too specific | Broaden or remove `css_selector` |
| Incomplete data | Complex table structure | Create custom strategy |
| Performance issues | Processing entire page | Use `css_selector` to limit scope |
### Debug Logging
Enable verbose logging to understand extraction decisions:
```python
import logging
# Configure logging
logging.basicConfig(level=logging.DEBUG)
# Enable verbose mode in strategy
strategy = DefaultTableExtraction(
table_score_threshold=7,
verbose=True # Detailed extraction logs
)
config = CrawlerRunConfig(
table_extraction=strategy,
verbose=True # General crawler logs
)
```
## See Also
- [Extraction Strategies](extraction-strategies.md) - Overview of all extraction strategies
- [Content Selection](content-selection.md) - Using CSS selectors and filters
- [Performance Optimization](../optimization/performance-tuning.md) - Speed up extraction
- [Examples](../examples/table_extraction_example.py) - Complete working examples

View File

@@ -0,0 +1,376 @@
# Migration Guide: Table Extraction v0.7.3
## Overview
Version 0.7.3 introduces the **Table Extraction Strategy Pattern**, providing a more flexible and extensible approach to table extraction while maintaining full backward compatibility.
## What's New
### Strategy Pattern Implementation
Table extraction now follows the same strategy pattern used throughout Crawl4AI:
- **Consistent Architecture**: Aligns with extraction, chunking, and markdown strategies
- **Extensibility**: Easy to create custom table extraction strategies
- **Better Separation**: Table logic moved from content scraping to dedicated module
- **Full Control**: Fine-grained control over table detection and extraction
### New Classes
```python
from crawl4ai import (
TableExtractionStrategy, # Abstract base class
DefaultTableExtraction, # Current implementation (default)
NoTableExtraction # Explicitly disable extraction
)
```
## Backward Compatibility
**✅ All existing code continues to work without changes.**
### No Changes Required
If your code looks like this, it will continue to work:
```python
# This still works exactly the same
config = CrawlerRunConfig(
table_score_threshold=7
)
result = await crawler.arun(url, config)
tables = result.tables # Same structure, same data
```
### What Happens Behind the Scenes
When you don't specify a `table_extraction` strategy:
1. `CrawlerRunConfig` automatically creates `DefaultTableExtraction`
2. It uses your `table_score_threshold` parameter
3. Tables are extracted exactly as before
4. Results appear in `result.tables` with the same structure
## New Capabilities
### 1. Explicit Strategy Configuration
You can now explicitly configure table extraction:
```python
# New: Explicit control
strategy = DefaultTableExtraction(
table_score_threshold=7,
min_rows=2, # New: minimum row filter
min_cols=2, # New: minimum column filter
verbose=True # New: detailed logging
)
config = CrawlerRunConfig(
table_extraction=strategy
)
```
### 2. Disable Table Extraction
Improve performance when tables aren't needed:
```python
# New: Skip table extraction entirely
config = CrawlerRunConfig(
table_extraction=NoTableExtraction()
)
# No CPU cycles spent on table detection/extraction
```
### 3. Custom Extraction Strategies
Create specialized extractors:
```python
class MyTableExtractor(TableExtractionStrategy):
def extract_tables(self, element, **kwargs):
# Custom extraction logic
return custom_tables
config = CrawlerRunConfig(
table_extraction=MyTableExtractor()
)
```
## Migration Scenarios
### Scenario 1: Basic Usage (No Changes Needed)
**Before (v0.7.2):**
```python
config = CrawlerRunConfig()
result = await crawler.arun(url, config)
for table in result.tables:
print(table['headers'])
```
**After (v0.7.3):**
```python
# Exactly the same - no changes required
config = CrawlerRunConfig()
result = await crawler.arun(url, config)
for table in result.tables:
print(table['headers'])
```
### Scenario 2: Custom Threshold (No Changes Needed)
**Before (v0.7.2):**
```python
config = CrawlerRunConfig(
table_score_threshold=5
)
```
**After (v0.7.3):**
```python
# Still works the same
config = CrawlerRunConfig(
table_score_threshold=5
)
# Or use new explicit approach for more control
strategy = DefaultTableExtraction(
table_score_threshold=5,
min_rows=2 # Additional filtering
)
config = CrawlerRunConfig(
table_extraction=strategy
)
```
### Scenario 3: Advanced Filtering (New Feature)
**Before (v0.7.2):**
```python
# Had to filter after extraction
config = CrawlerRunConfig(
table_score_threshold=5
)
result = await crawler.arun(url, config)
# Manual filtering
large_tables = [
t for t in result.tables
if len(t['rows']) >= 5 and len(t['headers']) >= 3
]
```
**After (v0.7.3):**
```python
# Filter during extraction (more efficient)
strategy = DefaultTableExtraction(
table_score_threshold=5,
min_rows=5,
min_cols=3
)
config = CrawlerRunConfig(
table_extraction=strategy
)
result = await crawler.arun(url, config)
# result.tables already filtered
```
## Code Organization Changes
### Module Structure
**Before (v0.7.2):**
```
crawl4ai/
content_scraping_strategy.py
- LXMLWebScrapingStrategy
- is_data_table() # Table detection
- extract_table_data() # Table extraction
```
**After (v0.7.3):**
```
crawl4ai/
content_scraping_strategy.py
- LXMLWebScrapingStrategy
# Table methods removed, uses strategy
table_extraction.py (NEW)
- TableExtractionStrategy # Base class
- DefaultTableExtraction # Moved logic here
- NoTableExtraction # New option
```
### Import Changes
**New imports available (optional):**
```python
# These are now available but not required for existing code
from crawl4ai import (
TableExtractionStrategy,
DefaultTableExtraction,
NoTableExtraction
)
```
## Performance Implications
### No Performance Impact
For existing code, performance remains identical:
- Same extraction logic
- Same scoring algorithm
- Same processing time
### Performance Improvements Available
New options for better performance:
```python
# Skip tables entirely (faster)
config = CrawlerRunConfig(
table_extraction=NoTableExtraction()
)
# Process only specific areas (faster)
config = CrawlerRunConfig(
css_selector="main.content",
table_extraction=DefaultTableExtraction(
min_rows=5, # Skip small tables
min_cols=3
)
)
```
## Testing Your Migration
### Verification Script
Run this to verify your extraction still works:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
async def verify_extraction():
url = "your_url_here"
async with AsyncWebCrawler() as crawler:
# Test 1: Old approach
config_old = CrawlerRunConfig(
table_score_threshold=7
)
result_old = await crawler.arun(url, config_old)
# Test 2: New explicit approach
from crawl4ai import DefaultTableExtraction
config_new = CrawlerRunConfig(
table_extraction=DefaultTableExtraction(
table_score_threshold=7
)
)
result_new = await crawler.arun(url, config_new)
# Compare results
assert len(result_old.tables) == len(result_new.tables)
print(f"✓ Both approaches extracted {len(result_old.tables)} tables")
# Verify structure
for old, new in zip(result_old.tables, result_new.tables):
assert old['headers'] == new['headers']
assert old['rows'] == new['rows']
print("✓ Table content identical")
asyncio.run(verify_extraction())
```
## Deprecation Notes
### No Deprecations
- All existing parameters continue to work
- `table_score_threshold` in `CrawlerRunConfig` is still supported
- No breaking changes
### Internal Changes (Transparent to Users)
- `LXMLWebScrapingStrategy.is_data_table()` - Moved to `DefaultTableExtraction`
- `LXMLWebScrapingStrategy.extract_table_data()` - Moved to `DefaultTableExtraction`
These methods were internal and not part of the public API.
## Benefits of Upgrading
While not required, using the new pattern provides:
1. **Better Control**: Filter tables during extraction, not after
2. **Performance Options**: Skip extraction when not needed
3. **Extensibility**: Create custom extractors for specific needs
4. **Consistency**: Same pattern as other Crawl4AI strategies
5. **Future-Proof**: Ready for upcoming advanced strategies
## Troubleshooting
### Issue: Different Number of Tables
**Cause**: Threshold or filtering differences
**Solution**:
```python
# Ensure same threshold
strategy = DefaultTableExtraction(
table_score_threshold=7, # Match your old setting
min_rows=0, # No filtering (default)
min_cols=0 # No filtering (default)
)
```
### Issue: Import Errors
**Cause**: Using new classes without importing
**Solution**:
```python
# Add imports if using new features
from crawl4ai import (
DefaultTableExtraction,
NoTableExtraction,
TableExtractionStrategy
)
```
### Issue: Custom Strategy Not Working
**Cause**: Incorrect method signature
**Solution**:
```python
class CustomExtractor(TableExtractionStrategy):
def extract_tables(self, element, **kwargs): # Correct signature
# Not: extract_tables(self, html)
# Not: extract(self, element)
return tables_list
```
## Getting Help
If you encounter issues:
1. Check your `table_score_threshold` matches previous settings
2. Verify imports if using new classes
3. Enable verbose logging: `DefaultTableExtraction(verbose=True)`
4. Review the [Table Extraction Documentation](../core/table_extraction.md)
5. Check [examples](../examples/table_extraction_example.py)
## Summary
-**Full backward compatibility** - No code changes required
-**Same results** - Identical extraction behavior by default
-**New options** - Additional control when needed
-**Better architecture** - Consistent with Crawl4AI patterns
-**Ready for future** - Foundation for advanced strategies
The migration to v0.7.3 is seamless with no required changes while providing new capabilities for those who need them.

View File

@@ -0,0 +1,245 @@
#!/usr/bin/env python3
"""
Test LLMTableExtraction with controlled HTML
"""
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import asyncio
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
LLMConfig,
LLMTableExtraction,
DefaultTableExtraction,
CacheMode
)
async def test_controlled_html():
"""Test with controlled HTML content."""
print("\n" + "=" * 60)
print("LLM TABLE EXTRACTION TEST")
print("=" * 60)
# Create test HTML with complex tables
test_html = """
<!DOCTYPE html>
<html>
<head><title>Test Tables</title></head>
<body>
<h1>Sales Data</h1>
<table border="1">
<caption>Q1 2024 Sales Report</caption>
<thead>
<tr>
<th rowspan="2">Product</th>
<th colspan="3">January</th>
<th colspan="3">February</th>
</tr>
<tr>
<th>Week 1</th>
<th>Week 2</th>
<th>Week 3</th>
<th>Week 1</th>
<th>Week 2</th>
<th>Week 3</th>
</tr>
</thead>
<tbody>
<tr>
<td>Widget A</td>
<td>100</td>
<td>120</td>
<td>110</td>
<td>130</td>
<td>140</td>
<td>150</td>
</tr>
<tr>
<td>Widget B</td>
<td>200</td>
<td>180</td>
<td>190</td>
<td>210</td>
<td>220</td>
<td>230</td>
</tr>
<tr>
<td colspan="7">Note: All values in thousands USD</td>
</tr>
</tbody>
</table>
<br>
<table>
<tr>
<th>Country</th>
<th>Population</th>
<th>GDP</th>
</tr>
<tr>
<td>USA</td>
<td>331M</td>
<td>$21T</td>
</tr>
<tr>
<td>China</td>
<td>1.4B</td>
<td>$14T</td>
</tr>
</table>
</body>
</html>
"""
# url = "https://www.w3schools.com/html/html_tables.asp"
url = "https://en.wikipedia.org/wiki/List_of_chemical_elements"
# url = "https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India"
# Configure LLM
llm_config = LLMConfig(
provider="openai/gpt-4.1-mini",
# provider="groq/llama-3.3-70b-versatile",
api_token=os.getenv("OPENAI_API_KEY"),
# api_token=os.getenv("GROQ_API_KEY"),
# api_token="os.getenv("GROQ_API_KEY")",
temperature=0.1,
max_tokens=32000
)
print("\n1. Testing LLMTableExtraction:")
# Create LLM extraction strategy
llm_strategy = LLMTableExtraction(
llm_config=llm_config,
verbose=True,
# css_selector="div.w3-example"
css_selector="div.mw-content-ltr",
# css_selector="table.wikitable",
max_tries=2,
enable_chunking=True,
chunk_token_threshold=5000, # Lower threshold to force chunking
min_rows_per_chunk=10,
max_parallel_chunks=3
)
config_llm = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
table_extraction=llm_strategy
)
async with AsyncWebCrawler() as crawler:
# Test with LLM extraction
result_llm = await crawler.arun(
# url=f"raw:{test_html}",
url=url,
config=config_llm
)
if result_llm.success:
print(f"\n ✓ LLM Extraction: Found {len(result_llm.tables)} table(s)")
for i, table in enumerate(result_llm.tables, 1):
print(f"\n Table {i}:")
print(f" - Caption: {table.get('caption', 'No caption')}")
print(f" - Headers: {table['headers']}")
print(f" - Rows: {len(table['rows'])}")
# Show how colspan/rowspan were handled
print(f" - Sample rows:")
for j, row in enumerate(table['rows'][:2], 1):
print(f" Row {j}: {row}")
metadata = table.get('metadata', {})
print(f" - Metadata:")
print(f" • Has merged cells: {metadata.get('has_merged_cells', False)}")
print(f" • Table type: {metadata.get('table_type', 'unknown')}")
# # Compare with default extraction
# print("\n2. Comparing with DefaultTableExtraction:")
# default_strategy = DefaultTableExtraction(
# table_score_threshold=3,
# verbose=False
# )
# config_default = CrawlerRunConfig(
# cache_mode=CacheMode.BYPASS,
# table_extraction=default_strategy
# )
# result_default = await crawler.arun(
# # url=f"raw:{test_html}",
# url=url,
# config=config_default
# )
# if result_default.success:
# print(f" ✓ Default Extraction: Found {len(result_default.tables)} table(s)")
# # Compare handling of complex structures
# print("\n3. Comparison Summary:")
# print(f" LLM found: {len(result_llm.tables)} tables")
# print(f" Default found: {len(result_default.tables)} tables")
# if result_llm.tables and result_default.tables:
# llm_first = result_llm.tables[0]
# default_first = result_default.tables[0]
# print(f"\n First table comparison:")
# print(f" LLM headers: {len(llm_first['headers'])} columns")
# print(f" Default headers: {len(default_first['headers'])} columns")
# # Check if LLM better handled the complex structure
# if llm_first.get('metadata', {}).get('has_merged_cells'):
# print(" ✓ LLM correctly identified merged cells")
# # Test pandas compatibility
# try:
# import pandas as pd
# print("\n4. Testing Pandas compatibility:")
# # Create DataFrame from LLM extraction
# df_llm = pd.DataFrame(
# llm_first['rows'],
# columns=llm_first['headers']
# )
# print(f" ✓ LLM table -> DataFrame: Shape {df_llm.shape}")
# # Create DataFrame from default extraction
# df_default = pd.DataFrame(
# default_first['rows'],
# columns=default_first['headers']
# )
# print(f" ✓ Default table -> DataFrame: Shape {df_default.shape}")
# print("\n LLM DataFrame preview:")
# print(df_llm.head(2).to_string())
# except ImportError:
# print("\n4. Pandas not installed, skipping DataFrame test")
print("\n✅ Test completed successfully!")
async def main():
"""Run the test."""
# Check for API key
if not os.getenv("OPENAI_API_KEY"):
print("⚠️ OPENAI_API_KEY not set. Please set it to test LLM extraction.")
print(" You can set it with: export OPENAI_API_KEY='your-key-here'")
return
await test_controlled_html()
if __name__ == "__main__":
asyncio.run(main())