feat: 🚀 Introduce revolutionary LLMTableExtraction with intelligent chunking for massive tables
BREAKING CHANGE: Table extraction now uses the Strategy Design Pattern. This epic commit introduces a game-changing approach to table extraction in Crawl4AI: ✨ NEW FEATURES: - LLMTableExtraction: AI-powered extraction for complex HTML tables with rowspan/colspan - Smart Chunking: Automatically splits massive tables into optimal chunks at row boundaries - Parallel Processing: Processes multiple chunks simultaneously for blazing-fast extraction - Intelligent Merging: Seamlessly combines chunk results into complete tables - Header Preservation: Each chunk maintains context with original headers - Auto-retry Logic: Built-in resilience with configurable retry attempts 🏗️ ARCHITECTURE: - Strategy Design Pattern for pluggable table extraction strategies - ThreadPoolExecutor for concurrent chunk processing - Token-based chunking with configurable thresholds - Handles tables without headers gracefully ⚡ PERFORMANCE: - Process 1000+ row tables without timeout - Parallel processing with up to 5 concurrent chunks - Smart token estimation prevents LLM context overflow - Optimized for providers like Groq for massive tables 🔧 CONFIGURATION: - enable_chunking: Auto-handle large tables (default: True) - chunk_token_threshold: When to split (default: 3000 tokens) - min_rows_per_chunk: Meaningful chunk sizes (default: 10) - max_parallel_chunks: Concurrent processing (default: 5) 📚 BACKWARD COMPATIBILITY: - Existing code continues to work unchanged - DefaultTableExtraction remains the default strategy - Progressive enhancement approach This is the future of web table extraction - handling everything from simple tables to massive, complex data grids with merged cells and nested structures. The chunking is completely transparent to users while providing unprecedented scalability.
This commit is contained in:
245
tests/test_llm_simple_url.py
Normal file
245
tests/test_llm_simple_url.py
Normal file
@@ -0,0 +1,245 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test LLMTableExtraction with controlled HTML
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
LLMConfig,
|
||||
LLMTableExtraction,
|
||||
DefaultTableExtraction,
|
||||
CacheMode
|
||||
)
|
||||
|
||||
# Offline HTML fixture: one table with rowspan/colspan headers plus a
# full-width note row, and one plain three-column table. It is not crawled by
# default; pass ``url=f"raw:{_TEST_HTML}"`` to ``crawler.arun`` in
# ``test_controlled_html`` to run against it instead of the live URL.
_TEST_HTML = """
<!DOCTYPE html>
<html>
<head><title>Test Tables</title></head>
<body>
    <h1>Sales Data</h1>

    <table border="1">
        <caption>Q1 2024 Sales Report</caption>
        <thead>
            <tr>
                <th rowspan="2">Product</th>
                <th colspan="3">January</th>
                <th colspan="3">February</th>
            </tr>
            <tr>
                <th>Week 1</th>
                <th>Week 2</th>
                <th>Week 3</th>
                <th>Week 1</th>
                <th>Week 2</th>
                <th>Week 3</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>Widget A</td>
                <td>100</td>
                <td>120</td>
                <td>110</td>
                <td>130</td>
                <td>140</td>
                <td>150</td>
            </tr>
            <tr>
                <td>Widget B</td>
                <td>200</td>
                <td>180</td>
                <td>190</td>
                <td>210</td>
                <td>220</td>
                <td>230</td>
            </tr>
            <tr>
                <td colspan="7">Note: All values in thousands USD</td>
            </tr>
        </tbody>
    </table>

    <br>

    <table>
        <tr>
            <th>Country</th>
            <th>Population</th>
            <th>GDP</th>
        </tr>
        <tr>
            <td>USA</td>
            <td>331M</td>
            <td>$21T</td>
        </tr>
        <tr>
            <td>China</td>
            <td>1.4B</td>
            <td>$14T</td>
        </tr>
    </table>
</body>
</html>
"""


def _print_table_summary(index, table):
    """Print caption, headers, row count, two sample rows and metadata of one table.

    Args:
        index: 1-based position of the table, used only in the heading.
        table: Extracted table mapping. ``headers`` and ``rows`` are read
            directly (a missing key raises ``KeyError``); ``caption`` and
            ``metadata`` are optional.
    """
    print(f"\n Table {index}:")
    print(f" - Caption: {table.get('caption', 'No caption')}")
    print(f" - Headers: {table['headers']}")
    print(f" - Rows: {len(table['rows'])}")

    # The first rows are enough to eyeball how colspan/rowspan were flattened.
    print(" - Sample rows:")
    for j, row in enumerate(table['rows'][:2], 1):
        print(f" Row {j}: {row}")

    metadata = table.get('metadata', {})
    print(" - Metadata:")
    print(f" • Has merged cells: {metadata.get('has_merged_cells', False)}")
    print(f" • Table type: {metadata.get('table_type', 'unknown')}")


async def test_controlled_html():
    """Run LLMTableExtraction against a page and print a summary of each table.

    By default this crawls a live Wikipedia page (not the bundled HTML
    fixture) with chunking enabled, using the OpenAI provider configured
    below; the API token is read from the ``OPENAI_API_KEY`` environment
    variable. Results are printed rather than asserted.

    If the crawl reports failure, the error is printed and the function
    returns without printing the success banner.
    """
    print("\n" + "=" * 60)
    print("LLM TABLE EXTRACTION TEST")
    print("=" * 60)

    # Target page. Alternatives are kept for manual experiments.
    # url = "https://www.w3schools.com/html/html_tables.asp"
    url = "https://en.wikipedia.org/wiki/List_of_chemical_elements"
    # url = "https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India"

    # Configure LLM
    llm_config = LLMConfig(
        provider="openai/gpt-4.1-mini",
        # provider="groq/llama-3.3-70b-versatile",
        api_token=os.getenv("OPENAI_API_KEY"),
        # api_token=os.getenv("GROQ_API_KEY"),
        temperature=0.1,
        max_tokens=32000,
    )

    print("\n1. Testing LLMTableExtraction:")

    # Create LLM extraction strategy
    llm_strategy = LLMTableExtraction(
        llm_config=llm_config,
        verbose=True,
        # css_selector="div.w3-example"
        css_selector="div.mw-content-ltr",
        # css_selector="table.wikitable",
        max_tries=2,
        enable_chunking=True,
        # Token threshold at which a table is split into chunks.
        # NOTE(review): the original comment said "lower threshold to force
        # chunking", but the commit notes list the default as 3000 - confirm.
        chunk_token_threshold=5000,
        min_rows_per_chunk=10,
        max_parallel_chunks=3,
    )

    config_llm = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        table_extraction=llm_strategy,
    )

    async with AsyncWebCrawler() as crawler:
        # Test with LLM extraction
        result_llm = await crawler.arun(
            # url=f"raw:{_TEST_HTML}",
            url=url,
            config=config_llm,
        )

        if not result_llm.success:
            # Report the failure instead of silently falling through to the
            # success banner.
            error = getattr(result_llm, "error_message", None) or "unknown error"
            print(f"\n ✗ LLM Extraction failed: {error}")
            print("\n❌ Test failed!")
            return

        print(f"\n ✓ LLM Extraction: Found {len(result_llm.tables)} table(s)")
        for i, table in enumerate(result_llm.tables, 1):
            _print_table_summary(i, table)

        # --- Disabled: comparison against DefaultTableExtraction -----------
        # # Compare with default extraction
        # print("\n2. Comparing with DefaultTableExtraction:")
        #
        # default_strategy = DefaultTableExtraction(
        #     table_score_threshold=3,
        #     verbose=False
        # )
        #
        # config_default = CrawlerRunConfig(
        #     cache_mode=CacheMode.BYPASS,
        #     table_extraction=default_strategy
        # )
        #
        # result_default = await crawler.arun(
        #     # url=f"raw:{_TEST_HTML}",
        #     url=url,
        #     config=config_default
        # )
        #
        # if result_default.success:
        #     print(f" ✓ Default Extraction: Found {len(result_default.tables)} table(s)")
        #
        #     # Compare handling of complex structures
        #     print("\n3. Comparison Summary:")
        #     print(f" LLM found: {len(result_llm.tables)} tables")
        #     print(f" Default found: {len(result_default.tables)} tables")
        #
        #     if result_llm.tables and result_default.tables:
        #         llm_first = result_llm.tables[0]
        #         default_first = result_default.tables[0]
        #
        #         print(f"\n First table comparison:")
        #         print(f" LLM headers: {len(llm_first['headers'])} columns")
        #         print(f" Default headers: {len(default_first['headers'])} columns")
        #
        #         # Check if LLM better handled the complex structure
        #         if llm_first.get('metadata', {}).get('has_merged_cells'):
        #             print(" ✓ LLM correctly identified merged cells")
        #
        #         # Test pandas compatibility
        #         try:
        #             import pandas as pd
        #
        #             print("\n4. Testing Pandas compatibility:")
        #
        #             # Create DataFrame from LLM extraction
        #             df_llm = pd.DataFrame(
        #                 llm_first['rows'],
        #                 columns=llm_first['headers']
        #             )
        #             print(f" ✓ LLM table -> DataFrame: Shape {df_llm.shape}")
        #
        #             # Create DataFrame from default extraction
        #             df_default = pd.DataFrame(
        #                 default_first['rows'],
        #                 columns=default_first['headers']
        #             )
        #             print(f" ✓ Default table -> DataFrame: Shape {df_default.shape}")
        #
        #             print("\n LLM DataFrame preview:")
        #             print(df_llm.head(2).to_string())
        #
        #         except ImportError:
        #             print("\n4. Pandas not installed, skipping DataFrame test")

    print("\n✅ Test completed successfully!")
|
||||
|
||||
async def main():
    """Run the LLM table-extraction test if an OpenAI API key is configured.

    When the ``OPENAI_API_KEY`` environment variable is unset or empty, a
    short hint on how to set it is printed and the test is skipped.
    """
    has_api_key = bool(os.getenv("OPENAI_API_KEY"))

    if has_api_key:
        await test_controlled_html()
    else:
        # Without a key the LLM call cannot work, so explain and skip.
        print("⚠️ OPENAI_API_KEY not set. Please set it to test LLM extraction.")
        print(" You can set it with: export OPENAI_API_KEY='your-key-here'")
|
||||
|
||||
# Script entry point: drive the async main() coroutine to completion when the
# file is executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user