feat: 🚀 Introduce revolutionary LLMTableExtraction with intelligent chunking for massive tables
BREAKING CHANGE: Table extraction now uses the Strategy design pattern.

This epic commit introduces a game-changing approach to table extraction in Crawl4AI:

✨ NEW FEATURES:
- LLMTableExtraction: AI-powered extraction for complex HTML tables with rowspan/colspan
- Smart Chunking: Automatically splits massive tables into optimal chunks at row boundaries
- Parallel Processing: Processes multiple chunks simultaneously for blazing-fast extraction
- Intelligent Merging: Seamlessly combines chunk results into complete tables
- Header Preservation: Each chunk maintains context with the original headers
- Auto-retry Logic: Built-in resilience with configurable retry attempts

🏗️ ARCHITECTURE:
- Strategy design pattern for pluggable table extraction strategies
- ThreadPoolExecutor for concurrent chunk processing
- Token-based chunking with configurable thresholds
- Handles tables without headers gracefully

⚡ PERFORMANCE:
- Process 1000+ row tables without timeout
- Parallel processing with up to 5 concurrent chunks
- Smart token estimation prevents LLM context overflow
- Optimized for providers like Groq for massive tables

🔧 CONFIGURATION:
- enable_chunking: Auto-handle large tables (default: True)
- chunk_token_threshold: When to split (default: 3000 tokens)
- min_rows_per_chunk: Meaningful chunk sizes (default: 10)
- max_parallel_chunks: Concurrent processing (default: 5)

📚 BACKWARD COMPATIBILITY:
- Existing code continues to work unchanged
- DefaultTableExtraction remains the default strategy
- Progressive enhancement approach

This is the future of web table extraction - handling everything from simple tables to massive, complex data grids with merged cells and nested structures. The chunking is completely transparent to users while providing unprecedented scalability.
This commit is contained in:
406
docs/examples/llm_table_extraction_example.py
Normal file
406
docs/examples/llm_table_extraction_example.py
Normal file
@@ -0,0 +1,406 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Example demonstrating LLM-based table extraction in Crawl4AI.
|
||||
|
||||
This example shows how to use the LLMTableExtraction strategy to extract
|
||||
complex tables from web pages, including handling rowspan, colspan, and nested tables.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Make the repository root importable so the example can run from a source
# checkout. This file lives in docs/examples/, so three dirname() calls climb
# from the file to examples/, then docs/, then the repository root.
grandparent_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
sys.path.append(grandparent_dir)

# Resolved directory that contains this script.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
LLMConfig,
|
||||
LLMTableExtraction,
|
||||
CacheMode
|
||||
)
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# Example 1: Basic LLM Table Extraction
|
||||
async def basic_llm_extraction():
    """Extract tables from a live Wikipedia page with the LLM strategy.

    Builds an ``LLMTableExtraction`` strategy from an OpenAI ``LLMConfig``,
    crawls the "List of countries by GDP (nominal)" page, and prints a short
    summary of the first table plus a pandas preview of it.
    """
    print("\n=== Example 1: Basic LLM Table Extraction ===")

    # Configure the LLM (OpenAI gpt-4.1-mini keeps the cost low)
    llm_config = LLMConfig(
        provider="openai/gpt-4.1-mini",
        api_token="env:OPENAI_API_KEY",  # Uses environment variable
        temperature=0.1,  # Low temperature for consistency
        max_tokens=2000,
    )

    # Create LLM table extraction strategy
    table_strategy = LLMTableExtraction(
        llm_config=llm_config,
        verbose=True,
    )

    # Configure crawler with the strategy
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        table_extraction=table_strategy,
    )

    async with AsyncWebCrawler() as crawler:
        # Extract tables from a Wikipedia page
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
            config=config,
        )

        if not result.success:
            # CrawlResult exposes the failure reason as ``error_message``
            print(f"✗ Extraction failed: {result.error_message}")
            return

        print(f"✓ Found {len(result.tables)} tables")
        if not result.tables:
            return

        # Display first table
        first_table = result.tables[0]
        headers = first_table['headers']
        rows = first_table['rows']
        print("\nFirst table:")
        print(f"  Headers: {headers[:5]}...")
        print(f"  Rows: {len(rows)}")

        # Convert to pandas DataFrame; fall back to positional column labels
        # when the table has no headers (an empty list would not match rows)
        df = pd.DataFrame(rows, columns=headers or None)
        print(f"\nDataFrame shape: {df.shape}")
        print(df.head())
|
||||
|
||||
|
||||
# Example 2: Focused Extraction with CSS Selector
|
||||
async def focused_extraction():
    """Extract tables from one page section selected by a CSS selector.

    Crawls an inline HTML document that contains a sidebar layout table and a
    data table with ``rowspan``/``colspan`` headers. The strategy is given
    ``css_selector=".main-content"`` and the first extracted table is printed.
    """
    print("\n=== Example 2: Focused Extraction with CSS Selector ===")

    # HTML with multiple tables
    test_html = """
    <html>
    <body>
        <div class="sidebar">
            <table role="presentation">
                <tr><td>Navigation</td></tr>
            </table>
        </div>

        <div class="main-content">
            <table id="data-table">
                <caption>Quarterly Sales Report</caption>
                <thead>
                    <tr>
                        <th rowspan="2">Product</th>
                        <th colspan="3">Q1 2024</th>
                    </tr>
                    <tr>
                        <th>Jan</th>
                        <th>Feb</th>
                        <th>Mar</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>Widget A</td>
                        <td>100</td>
                        <td>120</td>
                        <td>140</td>
                    </tr>
                    <tr>
                        <td>Widget B</td>
                        <td>200</td>
                        <td>180</td>
                        <td>220</td>
                    </tr>
                </tbody>
            </table>
        </div>
    </body>
    </html>
    """

    llm_config = LLMConfig(
        provider="openai/gpt-4.1-mini",
        api_token="env:OPENAI_API_KEY",
    )

    # Focus only on main content area
    table_strategy = LLMTableExtraction(
        llm_config=llm_config,
        css_selector=".main-content",  # Only extract from main content
        verbose=True,
    )

    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        table_extraction=table_strategy,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=f"raw:{test_html}",
            config=config,
        )

        if not (result.success and result.tables):
            # Report the miss instead of finishing silently
            print("✗ No table was extracted from the main content area")
            return

        table = result.tables[0]
        print(f"✓ Extracted table: {table.get('caption', 'No caption')}")
        print(f"  Headers: {table['headers']}")
        print(f"  Metadata: {table.get('metadata', {})}")

        # The LLM should have handled the rowspan/colspan correctly
        print("\nProcessed data (rowspan/colspan handled):")
        for i, row in enumerate(table['rows']):
            print(f"  Row {i+1}: {row}")
|
||||
|
||||
|
||||
# Example 3: Comparing with Default Extraction
|
||||
async def compare_strategies():
    """Compare LLM extraction with default extraction on a complex table.

    The same inline HTML table (multi-row header with ``rowspan``/``colspan``
    and a full-width note row) is crawled twice, once with
    ``DefaultTableExtraction`` and once with ``LLMTableExtraction``, and the
    first table from each run is printed for side-by-side inspection.
    """
    # Imported here because the module-level import list does not include it
    from crawl4ai import DefaultTableExtraction

    print("\n=== Example 3: Comparing LLM vs Default Extraction ===")

    # Complex table with nested structure
    complex_html = """
    <html>
    <body>
        <table>
            <tr>
                <th rowspan="3">Category</th>
                <th colspan="2">2023</th>
                <th colspan="2">2024</th>
            </tr>
            <tr>
                <th>H1</th>
                <th>H2</th>
                <th>H1</th>
                <th>H2</th>
            </tr>
            <tr>
                <td colspan="4">All values in millions</td>
            </tr>
            <tr>
                <td>Revenue</td>
                <td>100</td>
                <td>120</td>
                <td>130</td>
                <td>145</td>
            </tr>
            <tr>
                <td>Profit</td>
                <td>20</td>
                <td>25</td>
                <td>28</td>
                <td>32</td>
            </tr>
        </table>
    </body>
    </html>
    """

    async with AsyncWebCrawler() as crawler:
        # Test with default extraction
        default_strategy = DefaultTableExtraction(
            table_score_threshold=3,
            verbose=True,
        )
        config_default = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            table_extraction=default_strategy,
        )
        result_default = await crawler.arun(
            url=f"raw:{complex_html}",
            config=config_default,
        )

        # Test with LLM extraction
        llm_strategy = LLMTableExtraction(
            llm_config=LLMConfig(
                provider="openai/gpt-4.1-mini",
                api_token="env:OPENAI_API_KEY",
            ),
            verbose=True,
        )
        config_llm = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            table_extraction=llm_strategy,
        )
        result_llm = await crawler.arun(
            url=f"raw:{complex_html}",
            config=config_llm,
        )

        # Compare results
        print("\nDefault Extraction:")
        if result_default.tables:
            table = result_default.tables[0]
            rows = table.get('rows', [])
            print(f"  Headers: {table.get('headers', [])}")
            print(f"  Rows: {len(rows)}")
            # Only the first three rows are shown for the baseline
            for i, row in enumerate(rows[:3]):
                print(f"    Row {i+1}: {row}")
        else:
            print("  (no tables extracted)")

        print("\nLLM Extraction (handles complex structure better):")
        if result_llm.tables:
            table = result_llm.tables[0]
            rows = table.get('rows', [])
            print(f"  Headers: {table.get('headers', [])}")
            print(f"  Rows: {len(rows)}")
            for i, row in enumerate(rows):
                print(f"    Row {i+1}: {row}")
            print(f"  Metadata: {table.get('metadata', {})}")
        else:
            print("  (no tables extracted)")
|
||||
|
||||
|
||||
# Example 4: Using Local Models (Ollama)
|
||||
async def local_model_extraction():
    """Extract tables using a local Ollama model for privacy/cost.

    Points ``LLMConfig`` at ``http://localhost:11434`` with the
    ``ollama/llama3.3`` provider, crawls a small inline HTML table, and prints
    the result as a pandas DataFrame.
    """
    print("\n=== Example 4: Local Model Extraction with Ollama ===")

    # Configure for local Ollama
    llm_config = LLMConfig(
        provider="ollama/llama3.3",
        api_token=None,  # Not needed for Ollama
        base_url="http://localhost:11434",
        temperature=0.1,
    )

    table_strategy = LLMTableExtraction(
        llm_config=llm_config,
        verbose=True,
    )

    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        table_extraction=table_strategy,
    )

    # Simple test HTML
    test_html = """
    <table>
        <thead>
            <tr><th>Product</th><th>Price</th><th>Stock</th></tr>
        </thead>
        <tbody>
            <tr><td>Apple</td><td>$1.50</td><td>100</td></tr>
            <tr><td>Banana</td><td>$0.50</td><td>200</td></tr>
            <tr><td>Orange</td><td>$2.00</td><td>50</td></tr>
        </tbody>
    </table>
    """

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=f"raw:{test_html}",
            config=config,
        )

        if result.success and result.tables:
            table = result.tables[0]
            print("✓ Extracted with local model:")

            # Create DataFrame; use positional labels if no headers came back
            df = pd.DataFrame(table['rows'], columns=table['headers'] or None)
            print(df.to_string())
        else:
            print("✗ Make sure Ollama is running locally with llama3.3 model")
|
||||
|
||||
|
||||
# Example 5: Batch Processing Multiple Pages
|
||||
async def batch_extraction():
    """Extract tables from several Wikipedia pages with one shared strategy.

    The pages are crawled one after another through a single
    ``AsyncWebCrawler``. The first table from each successful page is kept
    and a per-page column/row summary is printed at the end.
    """
    print("\n=== Example 5: Batch Table Extraction ===")

    urls = [
        "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)",
        "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
        "https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index",
    ]

    llm_config = LLMConfig(
        provider="openai/gpt-4.1-mini",
        api_token="env:OPENAI_API_KEY",
        temperature=0.1,
        max_tokens=1500,
    )

    table_strategy = LLMTableExtraction(
        llm_config=llm_config,
        css_selector="table.wikitable",  # Wikipedia data tables
        verbose=False,
    )

    config = CrawlerRunConfig(
        table_extraction=table_strategy,
        cache_mode=CacheMode.BYPASS,
    )

    all_tables = []

    async with AsyncWebCrawler() as crawler:
        for url in urls:
            print(f"\nProcessing: {url.split('/')[-1][:50]}...")
            result = await crawler.arun(url=url, config=config)

            if result.success and result.tables:
                print(f"  ✓ Found {len(result.tables)} tables")
                # Store first table from each page
                all_tables.append({
                    'url': url,
                    'table': result.tables[0],
                })
            else:
                # Make skipped pages visible instead of dropping them silently
                print("  ✗ No tables extracted")

    # Summary
    print("\n=== Summary ===")
    print(f"Extracted {len(all_tables)} tables from {len(urls)} pages")
    for item in all_tables:
        table = item['table']
        print(f"\nFrom {item['url'].split('/')[-1][:30]}:")
        print(f"  Columns: {len(table['headers'])}")
        print(f"  Rows: {len(table['rows'])}")
|
||||
|
||||
|
||||
async def main():
    """Run the enabled examples and print a banner before and after.

    Only the basic example is enabled; uncomment the other calls to run them.
    """
    print("=" * 60)
    print("LLM TABLE EXTRACTION EXAMPLES")
    print("=" * 60)

    # Run examples (comment out ones you don't want to run)

    # Basic extraction
    await basic_llm_extraction()

    # Focused extraction with CSS
    # await focused_extraction()

    # Compare strategies
    # await compare_strategies()

    # Local model (requires Ollama)
    # await local_model_extraction()

    # Batch processing
    # await batch_extraction()

    print("\n" + "=" * 60)
    print("ALL EXAMPLES COMPLETED")
    print("=" * 60)
|
||||
|
||||
|
||||
# Run the examples only when this file is executed as a script
if __name__ == "__main__":
    asyncio.run(main())
|
||||
276
docs/examples/table_extraction_example.py
Normal file
276
docs/examples/table_extraction_example.py
Normal file
@@ -0,0 +1,276 @@
|
||||
"""
|
||||
Example: Using Table Extraction Strategies in Crawl4AI
|
||||
|
||||
This example demonstrates how to use different table extraction strategies
|
||||
to extract tables from web pages.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import pandas as pd
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
CacheMode,
|
||||
DefaultTableExtraction,
|
||||
NoTableExtraction,
|
||||
TableExtractionStrategy
|
||||
)
|
||||
from typing import Dict, List, Any
|
||||
|
||||
|
||||
async def example_default_extraction():
    """Example 1: Using default table extraction (automatic).

    No ``table_extraction`` strategy is passed; only ``table_score_threshold``
    is set on the run config. The first extracted table is previewed as a
    pandas DataFrame.
    """
    print("\n" + "="*50)
    print("Example 1: Default Table Extraction")
    print("="*50)

    async with AsyncWebCrawler() as crawler:
        # No need to specify table_extraction - uses DefaultTableExtraction automatically
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            table_score_threshold=7,  # Adjust sensitivity (default: 7)
        )

        result = await crawler.arun(
            "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
            config=config,
        )

        if not (result.success and result.tables):
            return

        print(f"Found {len(result.tables)} tables")

        # Convert first table to pandas DataFrame
        first_table = result.tables[0]
        df = pd.DataFrame(
            first_table['rows'],
            # Empty headers would not match the row width, so let pandas number them
            columns=first_table['headers'] if first_table['headers'] else None,
        )
        print("\nFirst table preview:")
        print(df.head())
        print(f"Shape: {df.shape}")
|
||||
|
||||
|
||||
async def example_custom_configuration():
    """Example 2: Custom table extraction configuration.

    Configures ``DefaultTableExtraction`` with a lower score threshold and
    minimum row/column counts, sets a CSS selector on the run config, and
    prints caption, size and header information for every table found.
    """
    print("\n" + "="*50)
    print("Example 2: Custom Table Configuration")
    print("="*50)

    async with AsyncWebCrawler() as crawler:
        # Create custom extraction strategy with specific settings
        table_strategy = DefaultTableExtraction(
            table_score_threshold=5,  # Lower threshold for more permissive detection
            min_rows=3,               # Only extract tables with at least 3 rows
            min_cols=2,               # Only extract tables with at least 2 columns
            verbose=True,
        )

        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            table_extraction=table_strategy,
            # Target specific tables using CSS selector
            css_selector="div.main-content",
        )

        result = await crawler.arun(
            "https://example.com/data",
            config=config,
        )

        if not result.success:
            return

        print(f"Found {len(result.tables)} tables matching criteria")

        for i, table in enumerate(result.tables):
            metadata = table['metadata']
            print(f"\nTable {i+1}:")
            print(f"  Caption: {table.get('caption', 'No caption')}")
            print(f"  Size: {metadata['row_count']} rows × {metadata['column_count']} columns")
            print(f"  Has headers: {metadata['has_headers']}")
|
||||
|
||||
|
||||
async def example_disable_extraction():
    """Example 3: Disable table extraction when not needed.

    Passes ``NoTableExtraction()`` as the strategy and prints how many tables
    the crawl result contains.
    """
    print("\n" + "="*50)
    print("Example 3: Disable Table Extraction")
    print("="*50)

    async with AsyncWebCrawler() as crawler:
        # Use NoTableExtraction to skip table processing entirely
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            table_extraction=NoTableExtraction(),  # No tables will be extracted
        )

        result = await crawler.arun(
            "https://example.com",
            config=config,
        )

        if result.success:
            print(f"Tables extracted: {len(result.tables)} (should be 0)")
            print("Table extraction disabled - better performance for non-table content")
|
||||
|
||||
|
||||
class FinancialTableExtraction(TableExtractionStrategy):
    """
    Custom strategy that keeps only tables whose text contains a currency symbol.

    Matching tables are returned as dictionaries with ``headers``, ``rows``,
    ``caption``, ``summary`` and ``metadata`` keys. Currency symbols are
    removed from the cell values.
    """

    def __init__(self, currency_symbols=None, **kwargs):
        """
        Args:
            currency_symbols: Symbols that mark a table as financial.
                Defaults to ``['$', '€', '£', '¥']`` when omitted or empty.
            **kwargs: Forwarded unchanged to ``TableExtractionStrategy``.
        """
        super().__init__(**kwargs)
        self.currency_symbols = currency_symbols or ['$', '€', '£', '¥']

    def _strip_currency(self, text):
        """Remove every configured currency symbol from *text* and trim it."""
        for symbol in self.currency_symbols:
            text = text.replace(symbol, '')
        # Trim after removal so "$ 100" becomes "100" rather than " 100"
        return text.strip()

    def extract_tables(self, element, **kwargs):
        """Extract only tables that appear to contain financial data.

        Args:
            element: Root node to search. NOTE(review): presumably an
                ``lxml.html`` element, since ``xpath``, ``itertext`` and
                ``text_content`` are called on it and its descendants.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            A list with one dictionary per table that contains at least one
            configured currency symbol and at least one header or data row.
        """
        tables_data = []

        for table in element.xpath(".//table"):
            # Check if table contains currency symbols
            table_text = ''.join(table.itertext())
            if not any(symbol in table_text for symbol in self.currency_symbols):
                continue

            # Extract headers
            headers = [
                th.text_content().strip()
                for th in table.xpath(".//thead//th | .//tr[1]//th")
            ]

            # Extract rows; rows without any <td> (e.g. header rows) are skipped
            rows = []
            for tr in table.xpath(".//tbody//tr | .//tr[position()>1]"):
                row = [
                    self._strip_currency(td.text_content())
                    for td in tr.xpath(".//td")
                ]
                if row:
                    rows.append(row)

            if not (headers or rows):
                continue

            # Read the caption through text_content() so an empty caption, or
            # one whose text sits inside a child tag, cannot raise IndexError
            caption_nodes = table.xpath(".//caption")
            caption = caption_nodes[0].text_content().strip() if caption_nodes else ""

            tables_data.append({
                "headers": headers,
                "rows": rows,
                "caption": caption,
                "summary": table.get("summary", ""),
                "metadata": {
                    "type": "financial",
                    "has_currency": True,
                    "has_headers": bool(headers),
                    "row_count": len(rows),
                    # headers or rows is non-empty here, so rows[0] is safe
                    "column_count": len(headers) if headers else len(rows[0]),
                },
            })

        return tables_data
|
||||
|
||||
|
||||
async def example_custom_strategy():
    """Example 4: Custom table extraction strategy.

    Crawls a finance page with ``FinancialTableExtraction`` restricted to the
    ``$`` and ``€`` symbols and prints the row count of each table tagged as
    financial.
    """
    print("\n" + "="*50)
    print("Example 4: Custom Financial Table Strategy")
    print("="*50)

    async with AsyncWebCrawler() as crawler:
        # Use custom strategy for financial tables
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            table_extraction=FinancialTableExtraction(
                currency_symbols=['$', '€'],
                verbose=True,
            ),
        )

        result = await crawler.arun(
            "https://finance.yahoo.com/",
            config=config,
        )

        if not result.success:
            return

        print(f"Found {len(result.tables)} financial tables")

        for table in result.tables:
            metadata = table.get('metadata', {})
            if metadata.get('type') == 'financial':
                print(f"  ✓ Financial table with {metadata['row_count']} rows")
|
||||
|
||||
|
||||
async def example_combined_extraction():
    """Example 5: Combine table extraction with other strategies.

    Runs ``DefaultTableExtraction`` and a schema-driven
    ``LLMExtractionStrategy`` in the same crawl, then prints the table count
    and two fields from the structured LLM output. Requires an OpenAI API key
    in the ``OPENAI_API_KEY`` environment variable.
    """
    import json

    from crawl4ai import LLMExtractionStrategy, LLMConfig

    print("\n" + "="*50)
    print("Example 5: Combined Extraction Strategies")
    print("="*50)

    async with AsyncWebCrawler() as crawler:
        # Define schema for structured extraction
        schema = {
            "type": "object",
            "properties": {
                "page_title": {"type": "string"},
                "main_topic": {"type": "string"},
                "key_figures": {
                    "type": "array",
                    "items": {"type": "string"},
                },
            },
        }

        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            # Table extraction
            table_extraction=DefaultTableExtraction(
                table_score_threshold=6,
                min_rows=2,
            ),
            # LLM extraction for structured data
            extraction_strategy=LLMExtractionStrategy(
                # Providers are written as "<provider>/<model>"; a bare
                # "openai" does not name a model
                llm_config=LLMConfig(
                    provider="openai/gpt-4.1-mini",
                    api_token="env:OPENAI_API_KEY",
                ),
                schema=schema,
            ),
        )

        result = await crawler.arun(
            "https://en.wikipedia.org/wiki/Economy_of_the_United_States",
            config=config,
        )

        if not result.success:
            return

        print(f"Tables found: {len(result.tables)}")

        # Tables are in result.tables
        if result.tables:
            print(f"First table has {len(result.tables[0]['rows'])} rows")

        # Structured data is in result.extracted_content
        if result.extracted_content:
            structured_data = json.loads(result.extracted_content)
            # The LLM strategy normally serialises a list of extracted blocks;
            # pick the first dict so .get() below cannot fail on a list
            if isinstance(structured_data, list):
                structured_data = next(
                    (item for item in structured_data if isinstance(item, dict)),
                    {},
                )
            elif not isinstance(structured_data, dict):
                structured_data = {}
            print(f"Page title: {structured_data.get('page_title', 'N/A')}")
            print(f"Main topic: {structured_data.get('main_topic', 'N/A')}")
|
||||
|
||||
|
||||
async def main():
    """Run the table extraction examples in order.

    The combined-extraction example is left commented out because it needs an
    OpenAI API key.
    """
    print("\n" + "="*60)
    print("CRAWL4AI TABLE EXTRACTION EXAMPLES")
    print("="*60)

    # Run examples
    await example_default_extraction()
    await example_custom_configuration()
    await example_disable_extraction()
    await example_custom_strategy()
    # await example_combined_extraction()  # Requires OpenAI API key

    print("\n" + "="*60)
    print("EXAMPLES COMPLETED")
    print("="*60)
|
||||
|
||||
|
||||
# Run the examples only when this file is executed as a script
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user