refactor: Update LLMTableExtraction examples and tests
This commit is contained in:
@@ -17,7 +17,6 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file
|
||||
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
@@ -38,13 +37,19 @@ async def basic_llm_extraction():
|
||||
provider="openai/gpt-4.1-mini",
|
||||
api_token="env:OPENAI_API_KEY", # Uses environment variable
|
||||
temperature=0.1, # Low temperature for consistency
|
||||
max_tokens=2000
|
||||
max_tokens=32000
|
||||
)
|
||||
|
||||
# Create LLM table extraction strategy
|
||||
table_strategy = LLMTableExtraction(
|
||||
llm_config=llm_config,
|
||||
verbose=True
|
||||
verbose=True,
|
||||
# css_selector="div.mw-content-ltr",
|
||||
max_tries=2,
|
||||
enable_chunking=True,
|
||||
chunk_token_threshold=5000, # Lower threshold to force chunking
|
||||
min_rows_per_chunk=10,
|
||||
max_parallel_chunks=3
|
||||
)
|
||||
|
||||
# Configure crawler with the strategy
|
||||
@@ -56,7 +61,7 @@ async def basic_llm_extraction():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Extract tables from a Wikipedia page
|
||||
result = await crawler.arun(
|
||||
url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
|
||||
url="https://en.wikipedia.org/wiki/List_of_chemical_elements",
|
||||
config=config
|
||||
)
|
||||
|
||||
@@ -264,70 +269,14 @@ async def compare_strategies():
|
||||
print(f" Row {i+1}: {row}")
|
||||
print(f" Metadata: {table.get('metadata', {})}")
|
||||
|
||||
|
||||
# Example 4: Using Local Models (Ollama)
|
||||
async def local_model_extraction():
|
||||
"""Extract tables using local Ollama models for privacy/cost."""
|
||||
print("\n=== Example 4: Local Model Extraction with Ollama ===")
|
||||
|
||||
# Configure for local Ollama
|
||||
llm_config = LLMConfig(
|
||||
provider="ollama/llama3.3",
|
||||
api_token=None, # Not needed for Ollama
|
||||
base_url="http://localhost:11434",
|
||||
temperature=0.1
|
||||
)
|
||||
|
||||
table_strategy = LLMTableExtraction(
|
||||
llm_config=llm_config,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
table_extraction=table_strategy
|
||||
)
|
||||
|
||||
# Simple test HTML
|
||||
test_html = """
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Product</th><th>Price</th><th>Stock</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr><td>Apple</td><td>$1.50</td><td>100</td></tr>
|
||||
<tr><td>Banana</td><td>$0.50</td><td>200</td></tr>
|
||||
<tr><td>Orange</td><td>$2.00</td><td>50</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
"""
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url=f"raw:{test_html}",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success and result.tables:
|
||||
table = result.tables[0]
|
||||
print(f"✓ Extracted with local model:")
|
||||
|
||||
# Create DataFrame
|
||||
df = pd.DataFrame(table['rows'], columns=table['headers'])
|
||||
print(df.to_string())
|
||||
else:
|
||||
print("✗ Make sure Ollama is running locally with llama3.3 model")
|
||||
|
||||
|
||||
# Example 5: Batch Processing Multiple Pages
|
||||
# Example 4: Batch Processing Multiple Pages
|
||||
async def batch_extraction():
|
||||
"""Extract tables from multiple pages efficiently."""
|
||||
print("\n=== Example 5: Batch Table Extraction ===")
|
||||
print("\n=== Example 4: Batch Table Extraction ===")
|
||||
|
||||
urls = [
|
||||
"https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)",
|
||||
"https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
|
||||
"https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index"
|
||||
"https://www.worldometers.info/geography/alphabetical-list-of-countries/",
|
||||
# "https://en.wikipedia.org/wiki/List_of_chemical_elements",
|
||||
]
|
||||
|
||||
llm_config = LLMConfig(
|
||||
@@ -339,8 +288,12 @@ async def batch_extraction():
|
||||
|
||||
table_strategy = LLMTableExtraction(
|
||||
llm_config=llm_config,
|
||||
css_selector="table.wikitable", # Wikipedia data tables
|
||||
verbose=False
|
||||
css_selector="div.datatable-container", # Wikipedia data tables
|
||||
verbose=False,
|
||||
enable_chunking=True,
|
||||
chunk_token_threshold=5000, # Lower threshold to force chunking
|
||||
min_rows_per_chunk=10,
|
||||
max_parallel_chunks=3
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
@@ -391,9 +344,6 @@ async def main():
|
||||
# # Compare strategies
|
||||
# await compare_strategies()
|
||||
|
||||
# # Local model (requires Ollama)
|
||||
# # await local_model_extraction()
|
||||
|
||||
# # Batch processing
|
||||
# await batch_extraction()
|
||||
|
||||
|
||||
@@ -23,90 +23,15 @@ async def test_controlled_html():
|
||||
print("LLM TABLE EXTRACTION TEST")
|
||||
print("=" * 60)
|
||||
|
||||
# Create test HTML with complex tables
|
||||
test_html = """
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Test Tables</title></head>
|
||||
<body>
|
||||
<h1>Sales Data</h1>
|
||||
|
||||
<table border="1">
|
||||
<caption>Q1 2024 Sales Report</caption>
|
||||
<thead>
|
||||
<tr>
|
||||
<th rowspan="2">Product</th>
|
||||
<th colspan="3">January</th>
|
||||
<th colspan="3">February</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Week 1</th>
|
||||
<th>Week 2</th>
|
||||
<th>Week 3</th>
|
||||
<th>Week 1</th>
|
||||
<th>Week 2</th>
|
||||
<th>Week 3</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Widget A</td>
|
||||
<td>100</td>
|
||||
<td>120</td>
|
||||
<td>110</td>
|
||||
<td>130</td>
|
||||
<td>140</td>
|
||||
<td>150</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Widget B</td>
|
||||
<td>200</td>
|
||||
<td>180</td>
|
||||
<td>190</td>
|
||||
<td>210</td>
|
||||
<td>220</td>
|
||||
<td>230</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="7">Note: All values in thousands USD</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<br>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>Country</th>
|
||||
<th>Population</th>
|
||||
<th>GDP</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>USA</td>
|
||||
<td>331M</td>
|
||||
<td>$21T</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>China</td>
|
||||
<td>1.4B</td>
|
||||
<td>$14T</td>
|
||||
</tr>
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
# url = "https://www.w3schools.com/html/html_tables.asp"
|
||||
url = "https://en.wikipedia.org/wiki/List_of_chemical_elements"
|
||||
# url = "https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India"
|
||||
|
||||
# Configure LLM
|
||||
llm_config = LLMConfig(
|
||||
provider="openai/gpt-4.1-mini",
|
||||
# provider="groq/llama-3.3-70b-versatile",
|
||||
api_token=os.getenv("OPENAI_API_KEY"),
|
||||
# api_token=os.getenv("GROQ_API_KEY"),
|
||||
# api_token="os.getenv("GROQ_API_KEY")",
|
||||
# provider="openai/gpt-4.1-mini",
|
||||
# api_token=os.getenv("OPENAI_API_KEY"),
|
||||
provider="groq/llama-3.3-70b-versatile",
|
||||
api_token="GROQ_API_TOKEN",
|
||||
temperature=0.1,
|
||||
max_tokens=32000
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user