refactor: Update LLMTableExtraction examples and tests

This commit is contained in:
ntohidi
2025-08-15 18:47:31 +08:00
parent 9f7fee91a9
commit ada7441bd1
2 changed files with 23 additions and 148 deletions

View File

@@ -17,7 +17,6 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file
import asyncio
import json
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
@@ -38,13 +37,19 @@ async def basic_llm_extraction():
provider="openai/gpt-4.1-mini",
api_token="env:OPENAI_API_KEY", # Uses environment variable
temperature=0.1, # Low temperature for consistency
max_tokens=2000
max_tokens=32000
)
# Create LLM table extraction strategy
table_strategy = LLMTableExtraction(
llm_config=llm_config,
verbose=True
verbose=True,
# css_selector="div.mw-content-ltr",
max_tries=2,
enable_chunking=True,
chunk_token_threshold=5000, # Lower threshold to force chunking
min_rows_per_chunk=10,
max_parallel_chunks=3
)
# Configure crawler with the strategy
@@ -56,7 +61,7 @@ async def basic_llm_extraction():
async with AsyncWebCrawler() as crawler:
# Extract tables from a Wikipedia page
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
url="https://en.wikipedia.org/wiki/List_of_chemical_elements",
config=config
)
@@ -264,70 +269,14 @@ async def compare_strategies():
print(f" Row {i+1}: {row}")
print(f" Metadata: {table.get('metadata', {})}")
# Example 4: Using Local Models (Ollama)
async def local_model_extraction():
"""Extract tables using local Ollama models for privacy/cost."""
print("\n=== Example 4: Local Model Extraction with Ollama ===")
# Configure for local Ollama
llm_config = LLMConfig(
provider="ollama/llama3.3",
api_token=None, # Not needed for Ollama
base_url="http://localhost:11434",
temperature=0.1
)
table_strategy = LLMTableExtraction(
llm_config=llm_config,
verbose=True
)
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
table_extraction=table_strategy
)
# Simple test HTML
test_html = """
<table>
<thead>
<tr><th>Product</th><th>Price</th><th>Stock</th></tr>
</thead>
<tbody>
<tr><td>Apple</td><td>$1.50</td><td>100</td></tr>
<tr><td>Banana</td><td>$0.50</td><td>200</td></tr>
<tr><td>Orange</td><td>$2.00</td><td>50</td></tr>
</tbody>
</table>
"""
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=f"raw:{test_html}",
config=config
)
if result.success and result.tables:
table = result.tables[0]
print(f"✓ Extracted with local model:")
# Create DataFrame
df = pd.DataFrame(table['rows'], columns=table['headers'])
print(df.to_string())
else:
print("✗ Make sure Ollama is running locally with llama3.3 model")
# Example 5: Batch Processing Multiple Pages
# Example 4: Batch Processing Multiple Pages
async def batch_extraction():
"""Extract tables from multiple pages efficiently."""
print("\n=== Example 5: Batch Table Extraction ===")
print("\n=== Example 4: Batch Table Extraction ===")
urls = [
"https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)",
"https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
"https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index"
"https://www.worldometers.info/geography/alphabetical-list-of-countries/",
# "https://en.wikipedia.org/wiki/List_of_chemical_elements",
]
llm_config = LLMConfig(
@@ -339,8 +288,12 @@ async def batch_extraction():
table_strategy = LLMTableExtraction(
llm_config=llm_config,
css_selector="table.wikitable", # Wikipedia data tables
verbose=False
css_selector="div.datatable-container", # Worldometers table container
verbose=False,
enable_chunking=True,
chunk_token_threshold=5000, # Lower threshold to force chunking
min_rows_per_chunk=10,
max_parallel_chunks=3
)
config = CrawlerRunConfig(
@@ -391,9 +344,6 @@ async def main():
# # Compare strategies
# await compare_strategies()
# # Local model (requires Ollama)
# # await local_model_extraction()
# # Batch processing
# await batch_extraction()

View File

@@ -23,90 +23,15 @@ async def test_controlled_html():
print("LLM TABLE EXTRACTION TEST")
print("=" * 60)
# Create test HTML with complex tables
test_html = """
<!DOCTYPE html>
<html>
<head><title>Test Tables</title></head>
<body>
<h1>Sales Data</h1>
<table border="1">
<caption>Q1 2024 Sales Report</caption>
<thead>
<tr>
<th rowspan="2">Product</th>
<th colspan="3">January</th>
<th colspan="3">February</th>
</tr>
<tr>
<th>Week 1</th>
<th>Week 2</th>
<th>Week 3</th>
<th>Week 1</th>
<th>Week 2</th>
<th>Week 3</th>
</tr>
</thead>
<tbody>
<tr>
<td>Widget A</td>
<td>100</td>
<td>120</td>
<td>110</td>
<td>130</td>
<td>140</td>
<td>150</td>
</tr>
<tr>
<td>Widget B</td>
<td>200</td>
<td>180</td>
<td>190</td>
<td>210</td>
<td>220</td>
<td>230</td>
</tr>
<tr>
<td colspan="7">Note: All values in thousands USD</td>
</tr>
</tbody>
</table>
<br>
<table>
<tr>
<th>Country</th>
<th>Population</th>
<th>GDP</th>
</tr>
<tr>
<td>USA</td>
<td>331M</td>
<td>$21T</td>
</tr>
<tr>
<td>China</td>
<td>1.4B</td>
<td>$14T</td>
</tr>
</table>
</body>
</html>
"""
# url = "https://www.w3schools.com/html/html_tables.asp"
url = "https://en.wikipedia.org/wiki/List_of_chemical_elements"
# url = "https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India"
# Configure LLM
llm_config = LLMConfig(
provider="openai/gpt-4.1-mini",
# provider="groq/llama-3.3-70b-versatile",
api_token=os.getenv("OPENAI_API_KEY"),
# api_token=os.getenv("GROQ_API_KEY"),
# api_token=os.getenv("GROQ_API_KEY"),
# provider="openai/gpt-4.1-mini",
# api_token=os.getenv("OPENAI_API_KEY"),
provider="groq/llama-3.3-70b-versatile",
api_token="GROQ_API_TOKEN",
temperature=0.1,
max_tokens=32000
)