refactor: Update LLMTableExtraction examples and tests

2025-08-15 18:47:31 +08:00
parent 9f7fee91a9
commit ada7441bd1
2 changed files with 23 additions and 148 deletions
--- a/docs/examples/llm_table_extraction_example.py
+++ b/docs/examples/llm_table_extraction_example.py
@@ -17,7 +17,6 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file
 import asyncio
 import json
 from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
@@ -38,13 +37,19 @@ async def basic_llm_extraction():
        provider="openai/gpt-4.1-mini",
        api_token="env:OPENAI_API_KEY",  # Uses environment variable
        temperature=0.1,  # Low temperature for consistency
-        max_tokens=2000
+        max_tokens=32000
    )
    # Create LLM table extraction strategy
    table_strategy = LLMTableExtraction(
        llm_config=llm_config,
-        verbose=True
+        verbose=True,
        # css_selector="div.mw-content-ltr",
        max_tries=2,
        enable_chunking=True,
        chunk_token_threshold=5000,  # Lower threshold to force chunking
        min_rows_per_chunk=10,
        max_parallel_chunks=3
    )
    # Configure crawler with the strategy
@@ -56,7 +61,7 @@ async def basic_llm_extraction():
    async with AsyncWebCrawler() as crawler:
        # Extract tables from a Wikipedia page
        result = await crawler.arun(
-            url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
+            url="https://en.wikipedia.org/wiki/List_of_chemical_elements",
            config=config
        )
@@ -264,70 +269,14 @@ async def compare_strategies():
                print(f"    Row {i+1}: {row}")
            print(f"  Metadata: {table.get('metadata', {})}")
-
+# Example 4: Batch Processing Multiple Pages
 # Example 4: Using Local Models (Ollama)
 async def local_model_extraction():
    """Extract tables using local Ollama models for privacy/cost.

    Builds an ``LLMTableExtraction`` strategy configured with the
    ``ollama/llama3.3`` provider at ``http://localhost:11434``, crawls a
    small inline HTML table passed as a ``raw:``-prefixed URL, and prints
    the first extracted table as a DataFrame. If the crawl does not
    succeed or yields no tables, a hint about running Ollama is printed
    instead.
    """
    print("\n=== Example 4: Local Model Extraction with Ollama ===")
    # Configure for local Ollama
    llm_config = LLMConfig(
        provider="ollama/llama3.3",
        api_token=None,  # Not needed for Ollama
        base_url="http://localhost:11434",
        temperature=0.1
    )
    # Table extraction strategy driven by the local model configured above
    table_strategy = LLMTableExtraction(
        llm_config=llm_config,
        verbose=True
    )
    # Run configuration: BYPASS cache mode plus the LLM table strategy
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        table_extraction=table_strategy
    )
    # Simple test HTML
    test_html = """
    <table>
        <thead>
            <tr><th>Product</th><th>Price</th><th>Stock</th></tr>
        </thead>
        <tbody>
            <tr><td>Apple</td><td>$1.50</td><td>100</td></tr>
            <tr><td>Banana</td><td>$0.50</td><td>200</td></tr>
            <tr><td>Orange</td><td>$2.00</td><td>50</td></tr>
        </tbody>
    </table>
    """
    async with AsyncWebCrawler() as crawler:
        # The inline HTML is handed to the crawler via a "raw:" prefixed URL
        result = await crawler.arun(
            url=f"raw:{test_html}",
            config=config
        )
        if result.success and result.tables:
            # Only the first extracted table is displayed
            table = result.tables[0]
            print(f"✓ Extracted with local model:")
            # Create DataFrame
            # NOTE(review): relies on `pd` (presumably pandas) being imported
            # at module level; that import is not visible here -- confirm.
            df = pd.DataFrame(table['rows'], columns=table['headers'])
            print(df.to_string())
        else:
            print("✗ Make sure Ollama is running locally with llama3.3 model")
 # Example 5: Batch Processing Multiple Pages
 async def batch_extraction():
    """Extract tables from multiple pages efficiently."""
-    print("\n=== Example 5: Batch Table Extraction ===")
+    print("\n=== Example 4: Batch Table Extraction ===")
    urls = [
-        "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)",
+        "https://www.worldometers.info/geography/alphabetical-list-of-countries/",
-        "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)",
+        # "https://en.wikipedia.org/wiki/List_of_chemical_elements",
        "https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index"
    ]
    llm_config = LLMConfig(
@@ -339,8 +288,12 @@ async def batch_extraction():
    table_strategy = LLMTableExtraction(
        llm_config=llm_config,
-        css_selector="table.wikitable",  # Wikipedia data tables
+        css_selector="div.datatable-container",  # Wikipedia data tables
-        verbose=False
+        verbose=False,
        enable_chunking=True,
        chunk_token_threshold=5000,  # Lower threshold to force chunking
        min_rows_per_chunk=10,
        max_parallel_chunks=3
    )
    config = CrawlerRunConfig(
@@ -391,9 +344,6 @@ async def main():
    # # Compare strategies
    # await compare_strategies()
    # # Local model (requires Ollama)
    # # await local_model_extraction()
    # # Batch processing
    # await batch_extraction()
--- a/tests/test_llm_simple_url.py
+++ b/tests/test_llm_simple_url.py
@@ -23,90 +23,15 @@ async def test_controlled_html():
    print("LLM TABLE EXTRACTION TEST")
    print("=" * 60)
    # Create test HTML with complex tables
    test_html = """
    <!DOCTYPE html>
    <html>
    <head><title>Test Tables</title></head>
    <body>
        <h1>Sales Data</h1>
        <table border="1">
            <caption>Q1 2024 Sales Report</caption>
            <thead>
                <tr>
                    <th rowspan="2">Product</th>
                    <th colspan="3">January</th>
                    <th colspan="3">February</th>
                </tr>
                <tr>
                    <th>Week 1</th>
                    <th>Week 2</th>
                    <th>Week 3</th>
                    <th>Week 1</th>
                    <th>Week 2</th>
                    <th>Week 3</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Widget A</td>
                    <td>100</td>
                    <td>120</td>
                    <td>110</td>
                    <td>130</td>
                    <td>140</td>
                    <td>150</td>
                </tr>
                <tr>
                    <td>Widget B</td>
                    <td>200</td>
                    <td>180</td>
                    <td>190</td>
                    <td>210</td>
                    <td>220</td>
                    <td>230</td>
                </tr>
                <tr>
                    <td colspan="7">Note: All values in thousands USD</td>
                </tr>
            </tbody>
        </table>
        <br>
        <table>
            <tr>
                <th>Country</th>
                <th>Population</th>
                <th>GDP</th>
            </tr>
            <tr>
                <td>USA</td>
                <td>331M</td>
                <td>$21T</td>
            </tr>
            <tr>
                <td>China</td>
                <td>1.4B</td>
                <td>$14T</td>
            </tr>
        </table>
    </body>
    </html>
    """
    # url = "https://www.w3schools.com/html/html_tables.asp"
    url = "https://en.wikipedia.org/wiki/List_of_chemical_elements"
    # url = "https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India"
    # Configure LLM
    llm_config = LLMConfig(
-        provider="openai/gpt-4.1-mini",
+        # provider="openai/gpt-4.1-mini",
-        # provider="groq/llama-3.3-70b-versatile",
+        # api_token=os.getenv("OPENAI_API_KEY"),
-        api_token=os.getenv("OPENAI_API_KEY"),
+        provider="groq/llama-3.3-70b-versatile",
-        # api_token=os.getenv("GROQ_API_KEY"),
+        api_token="GROQ_API_TOKEN",
        # api_token=os.getenv("GROQ_API_KEY"),
        temperature=0.1,
        max_tokens=32000
    )