refactor: Update LLMTableExtraction examples and tests
This commit is contained in:
@@ -17,7 +17,6 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file
|
|||||||
|
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
|
||||||
from crawl4ai import (
|
from crawl4ai import (
|
||||||
AsyncWebCrawler,
|
AsyncWebCrawler,
|
||||||
CrawlerRunConfig,
|
CrawlerRunConfig,
|
||||||
@@ -38,13 +37,19 @@ async def basic_llm_extraction():
|
|||||||
provider="openai/gpt-4.1-mini",
|
provider="openai/gpt-4.1-mini",
|
||||||
api_token="env:OPENAI_API_KEY", # Uses environment variable
|
api_token="env:OPENAI_API_KEY", # Uses environment variable
|
||||||
temperature=0.1, # Low temperature for consistency
|
temperature=0.1, # Low temperature for consistency
|
||||||
max_tokens=2000
|
max_tokens=32000
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create LLM table extraction strategy
|
# Create LLM table extraction strategy
|
||||||
table_strategy = LLMTableExtraction(
|
table_strategy = LLMTableExtraction(
|
||||||
llm_config=llm_config,
|
llm_config=llm_config,
|
||||||
verbose=True
|
verbose=True,
|
||||||
|
# css_selector="div.mw-content-ltr",
|
||||||
|
max_tries=2,
|
||||||
|
enable_chunking=True,
|
||||||
|
chunk_token_threshold=5000, # Lower threshold to force chunking
|
||||||
|
min_rows_per_chunk=10,
|
||||||
|
max_parallel_chunks=3
|
||||||
)
|
)
|
||||||
|
|
||||||
# Configure crawler with the strategy
|
# Configure crawler with the strategy
|
||||||
@@ -56,7 +61,7 @@ async def basic_llm_extraction():
|
|||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
# Extract tables from a Wikipedia page
|
# Extract tables from a Wikipedia page
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
|
url="https://en.wikipedia.org/wiki/List_of_chemical_elements",
|
||||||
config=config
|
config=config
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -264,70 +269,14 @@ async def compare_strategies():
|
|||||||
print(f" Row {i+1}: {row}")
|
print(f" Row {i+1}: {row}")
|
||||||
print(f" Metadata: {table.get('metadata', {})}")
|
print(f" Metadata: {table.get('metadata', {})}")
|
||||||
|
|
||||||
|
# Example 4: Batch Processing Multiple Pages
|
||||||
# Example 4: Using Local Models (Ollama)
|
|
||||||
async def local_model_extraction():
|
|
||||||
"""Extract tables using local Ollama models for privacy/cost."""
|
|
||||||
print("\n=== Example 4: Local Model Extraction with Ollama ===")
|
|
||||||
|
|
||||||
# Configure for local Ollama
|
|
||||||
llm_config = LLMConfig(
|
|
||||||
provider="ollama/llama3.3",
|
|
||||||
api_token=None, # Not needed for Ollama
|
|
||||||
base_url="http://localhost:11434",
|
|
||||||
temperature=0.1
|
|
||||||
)
|
|
||||||
|
|
||||||
table_strategy = LLMTableExtraction(
|
|
||||||
llm_config=llm_config,
|
|
||||||
verbose=True
|
|
||||||
)
|
|
||||||
|
|
||||||
config = CrawlerRunConfig(
|
|
||||||
cache_mode=CacheMode.BYPASS,
|
|
||||||
table_extraction=table_strategy
|
|
||||||
)
|
|
||||||
|
|
||||||
# Simple test HTML
|
|
||||||
test_html = """
|
|
||||||
<table>
|
|
||||||
<thead>
|
|
||||||
<tr><th>Product</th><th>Price</th><th>Stock</th></tr>
|
|
||||||
</thead>
|
|
||||||
<tbody>
|
|
||||||
<tr><td>Apple</td><td>$1.50</td><td>100</td></tr>
|
|
||||||
<tr><td>Banana</td><td>$0.50</td><td>200</td></tr>
|
|
||||||
<tr><td>Orange</td><td>$2.00</td><td>50</td></tr>
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
"""
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
result = await crawler.arun(
|
|
||||||
url=f"raw:{test_html}",
|
|
||||||
config=config
|
|
||||||
)
|
|
||||||
|
|
||||||
if result.success and result.tables:
|
|
||||||
table = result.tables[0]
|
|
||||||
print(f"✓ Extracted with local model:")
|
|
||||||
|
|
||||||
# Create DataFrame
|
|
||||||
df = pd.DataFrame(table['rows'], columns=table['headers'])
|
|
||||||
print(df.to_string())
|
|
||||||
else:
|
|
||||||
print("✗ Make sure Ollama is running locally with llama3.3 model")
|
|
||||||
|
|
||||||
|
|
||||||
# Example 5: Batch Processing Multiple Pages
|
|
||||||
async def batch_extraction():
|
async def batch_extraction():
|
||||||
"""Extract tables from multiple pages efficiently."""
|
"""Extract tables from multiple pages efficiently."""
|
||||||
print("\n=== Example 5: Batch Table Extraction ===")
|
print("\n=== Example 4: Batch Table Extraction ===")
|
||||||
|
|
||||||
urls = [
|
urls = [
|
||||||
"https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)",
|
"https://www.worldometers.info/geography/alphabetical-list-of-countries/",
|
||||||
"https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
|
# "https://en.wikipedia.org/wiki/List_of_chemical_elements",
|
||||||
"https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index"
|
|
||||||
]
|
]
|
||||||
|
|
||||||
llm_config = LLMConfig(
|
llm_config = LLMConfig(
|
||||||
@@ -339,8 +288,12 @@ async def batch_extraction():
|
|||||||
|
|
||||||
table_strategy = LLMTableExtraction(
|
table_strategy = LLMTableExtraction(
|
||||||
llm_config=llm_config,
|
llm_config=llm_config,
|
||||||
css_selector="table.wikitable", # Wikipedia data tables
|
css_selector="div.datatable-container", # Data table container on the worldometers page (no longer Wikipedia's table.wikitable)
|
||||||
verbose=False
|
verbose=False,
|
||||||
|
enable_chunking=True,
|
||||||
|
chunk_token_threshold=5000, # Lower threshold to force chunking
|
||||||
|
min_rows_per_chunk=10,
|
||||||
|
max_parallel_chunks=3
|
||||||
)
|
)
|
||||||
|
|
||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
@@ -391,9 +344,6 @@ async def main():
|
|||||||
# # Compare strategies
|
# # Compare strategies
|
||||||
# await compare_strategies()
|
# await compare_strategies()
|
||||||
|
|
||||||
# # Local model (requires Ollama)
|
|
||||||
# # await local_model_extraction()
|
|
||||||
|
|
||||||
# # Batch processing
|
# # Batch processing
|
||||||
# await batch_extraction()
|
# await batch_extraction()
|
||||||
|
|
||||||
|
|||||||
@@ -23,90 +23,15 @@ async def test_controlled_html():
|
|||||||
print("LLM TABLE EXTRACTION TEST")
|
print("LLM TABLE EXTRACTION TEST")
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
|
|
||||||
# Create test HTML with complex tables
|
|
||||||
test_html = """
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head><title>Test Tables</title></head>
|
|
||||||
<body>
|
|
||||||
<h1>Sales Data</h1>
|
|
||||||
|
|
||||||
<table border="1">
|
|
||||||
<caption>Q1 2024 Sales Report</caption>
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th rowspan="2">Product</th>
|
|
||||||
<th colspan="3">January</th>
|
|
||||||
<th colspan="3">February</th>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<th>Week 1</th>
|
|
||||||
<th>Week 2</th>
|
|
||||||
<th>Week 3</th>
|
|
||||||
<th>Week 1</th>
|
|
||||||
<th>Week 2</th>
|
|
||||||
<th>Week 3</th>
|
|
||||||
</tr>
|
|
||||||
</thead>
|
|
||||||
<tbody>
|
|
||||||
<tr>
|
|
||||||
<td>Widget A</td>
|
|
||||||
<td>100</td>
|
|
||||||
<td>120</td>
|
|
||||||
<td>110</td>
|
|
||||||
<td>130</td>
|
|
||||||
<td>140</td>
|
|
||||||
<td>150</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>Widget B</td>
|
|
||||||
<td>200</td>
|
|
||||||
<td>180</td>
|
|
||||||
<td>190</td>
|
|
||||||
<td>210</td>
|
|
||||||
<td>220</td>
|
|
||||||
<td>230</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td colspan="7">Note: All values in thousands USD</td>
|
|
||||||
</tr>
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<br>
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr>
|
|
||||||
<th>Country</th>
|
|
||||||
<th>Population</th>
|
|
||||||
<th>GDP</th>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>USA</td>
|
|
||||||
<td>331M</td>
|
|
||||||
<td>$21T</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>China</td>
|
|
||||||
<td>1.4B</td>
|
|
||||||
<td>$14T</td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
"""
|
|
||||||
|
|
||||||
# url = "https://www.w3schools.com/html/html_tables.asp"
|
|
||||||
url = "https://en.wikipedia.org/wiki/List_of_chemical_elements"
|
url = "https://en.wikipedia.org/wiki/List_of_chemical_elements"
|
||||||
# url = "https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India"
|
# url = "https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India"
|
||||||
|
|
||||||
# Configure LLM
|
# Configure LLM
|
||||||
llm_config = LLMConfig(
|
llm_config = LLMConfig(
|
||||||
provider="openai/gpt-4.1-mini",
|
# provider="openai/gpt-4.1-mini",
|
||||||
# provider="groq/llama-3.3-70b-versatile",
|
# api_token=os.getenv("OPENAI_API_KEY"),
|
||||||
api_token=os.getenv("OPENAI_API_KEY"),
|
provider="groq/llama-3.3-70b-versatile",
|
||||||
# api_token=os.getenv("GROQ_API_KEY"),
|
api_token="GROQ_API_TOKEN",
|
||||||
# api_token=os.getenv("GROQ_API_KEY"),
|
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
max_tokens=32000
|
max_tokens=32000
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user