diff --git a/docs/examples/llm_table_extraction_example.py b/docs/examples/llm_table_extraction_example.py
index 845f7eb9..b97d2bbe 100644
--- a/docs/examples/llm_table_extraction_example.py
+++ b/docs/examples/llm_table_extraction_example.py
@@ -17,7 +17,6 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file
import asyncio
-import json
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
@@ -38,13 +37,19 @@ async def basic_llm_extraction():
provider="openai/gpt-4.1-mini",
api_token="env:OPENAI_API_KEY", # Uses environment variable
temperature=0.1, # Low temperature for consistency
- max_tokens=2000
+ max_tokens=32000
)
# Create LLM table extraction strategy
table_strategy = LLMTableExtraction(
llm_config=llm_config,
- verbose=True
+ verbose=True,
+ # css_selector="div.mw-content-ltr",
+ max_tries=2,
+ enable_chunking=True,
+ chunk_token_threshold=5000, # Lower threshold to force chunking
+ min_rows_per_chunk=10,
+ max_parallel_chunks=3
)
# Configure crawler with the strategy
@@ -56,7 +61,7 @@ async def basic_llm_extraction():
async with AsyncWebCrawler() as crawler:
# Extract tables from a Wikipedia page
result = await crawler.arun(
- url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
+ url="https://en.wikipedia.org/wiki/List_of_chemical_elements",
config=config
)
@@ -264,70 +269,14 @@ async def compare_strategies():
print(f" Row {i+1}: {row}")
print(f" Metadata: {table.get('metadata', {})}")
-
-# Example 4: Using Local Models (Ollama)
-async def local_model_extraction():
- """Extract tables using local Ollama models for privacy/cost."""
- print("\n=== Example 4: Local Model Extraction with Ollama ===")
-
- # Configure for local Ollama
- llm_config = LLMConfig(
- provider="ollama/llama3.3",
- api_token=None, # Not needed for Ollama
- base_url="http://localhost:11434",
- temperature=0.1
- )
-
- table_strategy = LLMTableExtraction(
- llm_config=llm_config,
- verbose=True
- )
-
- config = CrawlerRunConfig(
- cache_mode=CacheMode.BYPASS,
- table_extraction=table_strategy
- )
-
- # Simple test HTML
- test_html = """
-
-
- | Product | Price | Stock |
-
-
- | Apple | $1.50 | 100 |
- | Banana | $0.50 | 200 |
- | Orange | $2.00 | 50 |
-
-
- """
-
- async with AsyncWebCrawler() as crawler:
- result = await crawler.arun(
- url=f"raw:{test_html}",
- config=config
- )
-
- if result.success and result.tables:
- table = result.tables[0]
- print(f"✓ Extracted with local model:")
-
- # Create DataFrame
- df = pd.DataFrame(table['rows'], columns=table['headers'])
- print(df.to_string())
- else:
- print("✗ Make sure Ollama is running locally with llama3.3 model")
-
-
-# Example 5: Batch Processing Multiple Pages
+# Example 4: Batch Processing Multiple Pages
async def batch_extraction():
"""Extract tables from multiple pages efficiently."""
- print("\n=== Example 5: Batch Table Extraction ===")
+ print("\n=== Example 4: Batch Table Extraction ===")
urls = [
- "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)",
- "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
- "https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index"
+ "https://www.worldometers.info/geography/alphabetical-list-of-countries/",
+ # "https://en.wikipedia.org/wiki/List_of_chemical_elements",
]
llm_config = LLMConfig(
@@ -339,8 +288,12 @@ async def batch_extraction():
table_strategy = LLMTableExtraction(
llm_config=llm_config,
- css_selector="table.wikitable", # Wikipedia data tables
- verbose=False
+        css_selector="div.datatable-container",  # Worldometers data table container
+ verbose=False,
+ enable_chunking=True,
+ chunk_token_threshold=5000, # Lower threshold to force chunking
+ min_rows_per_chunk=10,
+ max_parallel_chunks=3
)
config = CrawlerRunConfig(
@@ -391,9 +344,6 @@ async def main():
# # Compare strategies
# await compare_strategies()
- # # Local model (requires Ollama)
- # # await local_model_extraction()
-
# # Batch processing
# await batch_extraction()
diff --git a/tests/test_llm_simple_url.py b/tests/test_llm_simple_url.py
index c5f4068a..bb31434c 100644
--- a/tests/test_llm_simple_url.py
+++ b/tests/test_llm_simple_url.py
@@ -23,90 +23,15 @@ async def test_controlled_html():
print("LLM TABLE EXTRACTION TEST")
print("=" * 60)
- # Create test HTML with complex tables
- test_html = """
-
-
- Test Tables
-
- Sales Data
-
-
- Q1 2024 Sales Report
-
-
- | Product |
- January |
- February |
-
-
- | Week 1 |
- Week 2 |
- Week 3 |
- Week 1 |
- Week 2 |
- Week 3 |
-
-
-
-
- | Widget A |
- 100 |
- 120 |
- 110 |
- 130 |
- 140 |
- 150 |
-
-
- | Widget B |
- 200 |
- 180 |
- 190 |
- 210 |
- 220 |
- 230 |
-
-
- | Note: All values in thousands USD |
-
-
-
-
-
-
-
-
- | Country |
- Population |
- GDP |
-
-
- | USA |
- 331M |
- $21T |
-
-
- | China |
- 1.4B |
- $14T |
-
-
-
-
- """
-
- # url = "https://www.w3schools.com/html/html_tables.asp"
url = "https://en.wikipedia.org/wiki/List_of_chemical_elements"
# url = "https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India"
# Configure LLM
llm_config = LLMConfig(
- provider="openai/gpt-4.1-mini",
- # provider="groq/llama-3.3-70b-versatile",
- api_token=os.getenv("OPENAI_API_KEY"),
- # api_token=os.getenv("GROQ_API_KEY"),
- # api_token="os.getenv("GROQ_API_KEY")",
+ # provider="openai/gpt-4.1-mini",
+ # api_token=os.getenv("OPENAI_API_KEY"),
+ provider="groq/llama-3.3-70b-versatile",
+        api_token="env:GROQ_API_TOKEN",
temperature=0.1,
max_tokens=32000
)