refactor: Update LLMTableExtraction examples and tests

2025-08-15 18:47:31 +08:00
parent 9f7fee91a9
commit ada7441bd1
2 changed files with 23 additions and 148 deletions
--- a/docs/examples/llm_table_extraction_example.py
+++ b/docs/examples/llm_table_extraction_example.py
@@ -17,7 +17,6 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file


 import asyncio
-import json
 from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
@@ -38,13 +37,19 @@ async def basic_llm_extraction():
        provider="openai/gpt-4.1-mini",
        api_token="env:OPENAI_API_KEY",  # Uses environment variable
        temperature=0.1,  # Low temperature for consistency
-        max_tokens=2000
+        max_tokens=32000
    )
    
    # Create LLM table extraction strategy
    table_strategy = LLMTableExtraction(
        llm_config=llm_config,
-        verbose=True
+        verbose=True,
+        # css_selector="div.mw-content-ltr",
+        max_tries=2,
+        enable_chunking=True,
+        chunk_token_threshold=5000,  # Lower threshold to force chunking
+        min_rows_per_chunk=10,
+        max_parallel_chunks=3
    )
    
    # Configure crawler with the strategy
@@ -56,7 +61,7 @@ async def basic_llm_extraction():
    async with AsyncWebCrawler() as crawler:
        # Extract tables from a Wikipedia page
        result = await crawler.arun(
-            url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
+            url="https://en.wikipedia.org/wiki/List_of_chemical_elements",
            config=config
        )
        
@@ -264,70 +269,14 @@ async def compare_strategies():
                print(f"    Row {i+1}: {row}")
            print(f"  Metadata: {table.get('metadata', {})}")

-
-# Example 4: Using Local Models (Ollama)
-async def local_model_extraction():
-    """Extract tables using local Ollama models for privacy/cost."""
-    print("\n=== Example 4: Local Model Extraction with Ollama ===")
-    
-    # Configure for local Ollama
-    llm_config = LLMConfig(
-        provider="ollama/llama3.3",
-        api_token=None,  # Not needed for Ollama
-        base_url="http://localhost:11434",
-        temperature=0.1
-    )
-    
-    table_strategy = LLMTableExtraction(
-        llm_config=llm_config,
-        verbose=True
-    )
-    
-    config = CrawlerRunConfig(
-        cache_mode=CacheMode.BYPASS,
-        table_extraction=table_strategy
-    )
-    
-    # Simple test HTML
-    test_html = """
-    <table>
-        <thead>
-            <tr><th>Product</th><th>Price</th><th>Stock</th></tr>
-        </thead>
-        <tbody>
-            <tr><td>Apple</td><td>$1.50</td><td>100</td></tr>
-            <tr><td>Banana</td><td>$0.50</td><td>200</td></tr>
-            <tr><td>Orange</td><td>$2.00</td><td>50</td></tr>
-        </tbody>
-    </table>
-    """
-    
-    async with AsyncWebCrawler() as crawler:
-        result = await crawler.arun(
-            url=f"raw:{test_html}",
-            config=config
-        )
-        
-        if result.success and result.tables:
-            table = result.tables[0]
-            print(f"✓ Extracted with local model:")
-            
-            # Create DataFrame
-            df = pd.DataFrame(table['rows'], columns=table['headers'])
-            print(df.to_string())
-        else:
-            print("✗ Make sure Ollama is running locally with llama3.3 model")
-
-
-# Example 5: Batch Processing Multiple Pages
+# Example 4: Batch Processing Multiple Pages
 async def batch_extraction():
    """Extract tables from multiple pages efficiently."""
-    print("\n=== Example 5: Batch Table Extraction ===")
+    print("\n=== Example 4: Batch Table Extraction ===")
    
    urls = [
-        "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)",
-        "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
-        "https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index"
+        "https://www.worldometers.info/geography/alphabetical-list-of-countries/",
+        # "https://en.wikipedia.org/wiki/List_of_chemical_elements",
    ]
    
    llm_config = LLMConfig(
@@ -339,8 +288,12 @@ async def batch_extraction():
    
    table_strategy = LLMTableExtraction(
        llm_config=llm_config,
-        css_selector="table.wikitable",  # Wikipedia data tables
-        verbose=False
+        css_selector="div.datatable-container",  # Container wrapping the data table on the listed page(s)
+        verbose=False,
+        enable_chunking=True,
+        chunk_token_threshold=5000,  # Lower threshold to force chunking
+        min_rows_per_chunk=10,
+        max_parallel_chunks=3
    )
    
    config = CrawlerRunConfig(
@@ -391,9 +344,6 @@ async def main():
    # # Compare strategies
    # await compare_strategies()
    
-    # # Local model (requires Ollama)
-    # # await local_model_extraction()
-    
    # # Batch processing
    # await batch_extraction()
    
--- a/tests/test_llm_simple_url.py
+++ b/tests/test_llm_simple_url.py
@@ -23,90 +23,15 @@ async def test_controlled_html():
    print("LLM TABLE EXTRACTION TEST")
    print("=" * 60)
    
-    # Create test HTML with complex tables
-    test_html = """
-    <!DOCTYPE html>
-    <html>
-    <head><title>Test Tables</title></head>
-    <body>
-        <h1>Sales Data</h1>
-        
-        <table border="1">
-            <caption>Q1 2024 Sales Report</caption>
-            <thead>
-                <tr>
-                    <th rowspan="2">Product</th>
-                    <th colspan="3">January</th>
-                    <th colspan="3">February</th>
-                </tr>
-                <tr>
-                    <th>Week 1</th>
-                    <th>Week 2</th>
-                    <th>Week 3</th>
-                    <th>Week 1</th>
-                    <th>Week 2</th>
-                    <th>Week 3</th>
-                </tr>
-            </thead>
-            <tbody>
-                <tr>
-                    <td>Widget A</td>
-                    <td>100</td>
-                    <td>120</td>
-                    <td>110</td>
-                    <td>130</td>
-                    <td>140</td>
-                    <td>150</td>
-                </tr>
-                <tr>
-                    <td>Widget B</td>
-                    <td>200</td>
-                    <td>180</td>
-                    <td>190</td>
-                    <td>210</td>
-                    <td>220</td>
-                    <td>230</td>
-                </tr>
-                <tr>
-                    <td colspan="7">Note: All values in thousands USD</td>
-                </tr>
-            </tbody>
-        </table>
-        
-        <br>
-        
-        <table>
-            <tr>
-                <th>Country</th>
-                <th>Population</th>
-                <th>GDP</th>
-            </tr>
-            <tr>
-                <td>USA</td>
-                <td>331M</td>
-                <td>$21T</td>
-            </tr>
-            <tr>
-                <td>China</td>
-                <td>1.4B</td>
-                <td>$14T</td>
-            </tr>
-        </table>
-    </body>
-    </html>
-    """
-    
-    # url = "https://www.w3schools.com/html/html_tables.asp"
    url = "https://en.wikipedia.org/wiki/List_of_chemical_elements"
    # url = "https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India"
    
    # Configure LLM
    llm_config = LLMConfig(
-        provider="openai/gpt-4.1-mini",
-        # provider="groq/llama-3.3-70b-versatile",
-        api_token=os.getenv("OPENAI_API_KEY"),
-        # api_token=os.getenv("GROQ_API_KEY"),
-        # api_token="os.getenv("GROQ_API_KEY")",
+        # provider="openai/gpt-4.1-mini",
+        # api_token=os.getenv("OPENAI_API_KEY"),
+        provider="groq/llama-3.3-70b-versatile",
+        api_token="GROQ_API_TOKEN",
        temperature=0.1,
        max_tokens=32000
    )