feat: Add table extraction strategies and API documentation

- Implemented table extraction strategies (default, LLM, financial, and none) in utils.py.
- Created new API documentation for table extraction endpoints and strategies.
- Added integration tests for table extraction functionality covering various strategies and error handling.
- Developed a quick test script for rapid validation of table extraction features.
2025-10-17 12:30:37 +08:00
parent 3877335d89
commit 00e9904609
8 changed files with 1979 additions and 3 deletions
--- a/tests/docker/test_table_extraction.py
+++ b/tests/docker/test_table_extraction.py
@@ -0,0 +1,458 @@
+"""
+Integration tests for Table Extraction functionality in Crawl4AI Docker Server
+
+Tests cover:
+1. Integrated table extraction during crawls
+2. Dedicated /tables endpoints
+3. All extraction strategies (default, LLM, financial, none)
+4. Batch processing
+5. Error handling
+
+Note: These tests require the Docker server to be running on localhost:11235
+Run: python deploy/docker/server.py
+"""
+
+import pytest
+import requests
+import time
+from typing import Dict, Any
+
+
+# Base URL for the Docker API server (must match the port named in the module docstring)
+BASE_URL = "http://localhost:11235"
+
+# Sample HTML fixture with three tables: simple, financial, and one using rowspan/colspan headers
+SAMPLE_HTML_WITH_TABLES = """
+<!DOCTYPE html>
+<html>
+<head><title>Test Page with Tables</title></head>
+<body>
+    <h1>Financial Data</h1>
+    
+    <!-- Simple table -->
+    <table id="simple">
+        <tr><th>Name</th><th>Age</th></tr>
+        <tr><td>Alice</td><td>25</td></tr>
+        <tr><td>Bob</td><td>30</td></tr>
+    </table>
+    
+    <!-- Financial table -->
+    <table id="financial">
+        <thead>
+            <tr><th>Quarter</th><th>Revenue</th><th>Expenses</th><th>Profit</th></tr>
+        </thead>
+        <tbody>
+            <tr><td>Q1 2024</td><td>$1,250,000.00</td><td>$850,000.00</td><td>$400,000.00</td></tr>
+            <tr><td>Q2 2024</td><td>$1,500,000.00</td><td>$900,000.00</td><td>$600,000.00</td></tr>
+        </tbody>
+    </table>
+    
+    <!-- Complex nested table -->
+    <table id="complex">
+        <tr>
+            <th rowspan="2">Product</th>
+            <th colspan="2">Sales</th>
+        </tr>
+        <tr>
+            <th>Units</th>
+            <th>Revenue</th>
+        </tr>
+        <tr><td>Widget A</td><td>100</td><td>$5,000</td></tr>
+        <tr><td>Widget B</td><td>200</td><td>$10,000</td></tr>
+    </table>
+</body>
+</html>
+"""
+
+
+@pytest.fixture(scope="module")
+def server_url():
+    """Provide the module-level BASE_URL of the Docker API server to tests."""
+    return BASE_URL
+
+
+@pytest.fixture(scope="module")
+def wait_for_server():
+    """Poll GET {BASE_URL}/health up to 5 times (1s apart); return True on 200, else skip the module's tests."""
+    max_retries = 5
+    for attempt in range(max_retries):
+        try:
+            if requests.get(f"{BASE_URL}/health", timeout=2).status_code == 200:
+                return True
+        except requests.exceptions.RequestException:
+            pass  # server not reachable yet; fall through to the shared back-off below
+        if attempt < max_retries - 1:
+            time.sleep(1)  # back off after a connection error AND after a non-200 answer
+    pytest.skip(f"Server not running at {BASE_URL}. Start with: python deploy/docker/server.py")
+
+
+class TestIntegratedTableExtraction:
+    """Test table extraction integrated with the /crawl endpoint (every request is time-bounded)"""
+
+    def test_crawl_with_default_table_extraction(self, server_url, wait_for_server):
+        """Test crawling with default table extraction strategy"""
+        response = requests.post(f"{server_url}/crawl", json={
+            "urls": ["https://example.com/tables"],
+            "browser_config": {"headless": True},
+            "crawler_config": {},
+            "table_extraction": {
+                "strategy": "default"
+            }
+        }, timeout=120)
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["success"] is True
+        assert "results" in data
+
+        # A first result must carry a "tables" key unless that crawl itself reported failure
+        if data["results"]:
+            result = data["results"][0]
+            assert "tables" in result or result.get("success") is False
+
+    def test_crawl_with_llm_table_extraction(self, server_url, wait_for_server):
+        """Test crawling with LLM table extraction strategy"""
+        response = requests.post(f"{server_url}/crawl", json={
+            "urls": ["https://example.com/financial"],
+            "browser_config": {"headless": True},
+            "crawler_config": {},
+            "table_extraction": {
+                "strategy": "llm",
+                "llm_provider": "openai",
+                "llm_model": "gpt-4",
+                "llm_api_key": "test-key",
+                "llm_prompt": "Extract financial data from tables"
+            }
+        }, timeout=120)
+
+        # Should fail without valid API key, but structure should be correct
+        # In real scenario with valid key, this would succeed
+        assert response.status_code in [200, 500]  # May fail on auth
+
+    def test_crawl_with_financial_table_extraction(self, server_url, wait_for_server):
+        """Test crawling with financial table extraction strategy"""
+        response = requests.post(f"{server_url}/crawl", json={
+            "urls": ["https://example.com/stocks"],
+            "browser_config": {"headless": True},
+            "crawler_config": {},
+            "table_extraction": {
+                "strategy": "financial",
+                "preserve_formatting": True,
+                "extract_metadata": True
+            }
+        }, timeout=120)
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["success"] is True
+
+    def test_crawl_without_table_extraction(self, server_url, wait_for_server):
+        """Test crawling without table extraction (should work normally)"""
+        response = requests.post(f"{server_url}/crawl", json={
+            "urls": ["https://example.com"],
+            "browser_config": {"headless": True},
+            "crawler_config": {}
+        }, timeout=120)
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["success"] is True
+
+
+class TestDedicatedTableEndpoints:
+    """Test the dedicated /tables/extract endpoint (every request is time-bounded)"""
+
+    def test_extract_tables_from_html(self, server_url, wait_for_server):
+        """Test extracting tables from provided HTML"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": SAMPLE_HTML_WITH_TABLES,
+            "config": {
+                "strategy": "default"
+            }
+        }, timeout=60)
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["success"] is True
+        assert data["table_count"] >= 3  # Should find at least 3 tables
+        assert "tables" in data
+        assert data["strategy"] == "default"
+
+        # Verify table structure
+        if data["tables"]:
+            table = data["tables"][0]
+            assert "headers" in table or "rows" in table
+
+    def test_extract_tables_from_url(self, server_url, wait_for_server):
+        """Test extracting tables by fetching URL"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "url": "https://example.com/tables",
+            "config": {
+                "strategy": "default"
+            }
+        }, timeout=120)
+
+        # May fail if URL doesn't exist, but structure should be correct
+        assert response.status_code in [200, 500]
+
+        if response.status_code == 200:
+            data = response.json()
+            assert "success" in data
+            assert "tables" in data
+
+    def test_extract_tables_invalid_input(self, server_url, wait_for_server):
+        """Test error handling for invalid input"""
+        # No html or url provided
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "config": {"strategy": "default"}
+        }, timeout=60)
+
+        assert response.status_code == 400
+        assert "html" in response.text.lower() or "url" in response.text.lower()
+
+    def test_extract_tables_both_html_and_url(self, server_url, wait_for_server):
+        """Test error when both html and url are provided"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": "<table></table>",
+            "url": "https://example.com",
+            "config": {"strategy": "default"}
+        }, timeout=60)
+
+        assert response.status_code == 400
+        assert "both" in response.text.lower()
+
+
+class TestBatchTableExtraction:
+    """Test the /tables/extract/batch endpoint (every request is time-bounded)"""
+
+    def test_batch_extract_html_list(self, server_url, wait_for_server):
+        """Test batch extraction from multiple HTML contents"""
+        response = requests.post(f"{server_url}/tables/extract/batch", json={
+            "html_list": [
+                SAMPLE_HTML_WITH_TABLES,
+                "<table><tr><th>A</th></tr><tr><td>1</td></tr></table>",
+            ],
+            "config": {"strategy": "default"}
+        }, timeout=60)
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["success"] is True
+        assert "summary" in data
+        assert data["summary"]["total_processed"] == 2
+        assert 0 <= data["summary"]["successful"] <= data["summary"]["total_processed"]
+        assert "results" in data
+        assert len(data["results"]) == 2
+
+    def test_batch_extract_url_list(self, server_url, wait_for_server):
+        """Test batch extraction from multiple URLs"""
+        response = requests.post(f"{server_url}/tables/extract/batch", json={
+            "url_list": [
+                "https://example.com/page1",
+                "https://example.com/page2",
+            ],
+            "config": {"strategy": "default"}
+        }, timeout=120)
+
+        # May have mixed success/failure depending on URLs
+        assert response.status_code in [200, 500]
+
+        if response.status_code == 200:
+            data = response.json()
+            assert "summary" in data
+            assert "results" in data
+
+    def test_batch_extract_mixed(self, server_url, wait_for_server):
+        """Test batch extraction from both HTML and URLs"""
+        response = requests.post(f"{server_url}/tables/extract/batch", json={
+            "html_list": [SAMPLE_HTML_WITH_TABLES],
+            "url_list": ["https://example.com/tables"],
+            "config": {"strategy": "default"}
+        }, timeout=120)
+
+        # May fail on URL crawling but should handle mixed input
+        assert response.status_code in [200, 500]
+        if response.status_code == 200:
+            data = response.json()
+            assert data["success"] is True
+            assert data["summary"]["total_processed"] == 2
+
+    def test_batch_extract_empty_list(self, server_url, wait_for_server):
+        """Test error when no items provided for batch"""
+        response = requests.post(f"{server_url}/tables/extract/batch", json={
+            "config": {"strategy": "default"}
+        }, timeout=60)
+
+        assert response.status_code == 400
+
+    def test_batch_extract_exceeds_limit(self, server_url, wait_for_server):
+        """Test error when batch size exceeds limit"""
+        response = requests.post(f"{server_url}/tables/extract/batch", json={
+            "html_list": ["<table></table>"] * 100,  # 100 items (limit is 50)
+            "config": {"strategy": "default"}
+        }, timeout=60)
+
+        assert response.status_code == 400
+        assert "50" in response.text or "limit" in response.text.lower()
+
+
+class TestTableExtractionStrategies:
+    """Test each table extraction strategy via /tables/extract (every request is time-bounded)"""
+
+    def test_default_strategy(self, server_url, wait_for_server):
+        """Test default (regex-based) extraction strategy"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": SAMPLE_HTML_WITH_TABLES,
+            "config": {
+                "strategy": "default"
+            }
+        }, timeout=60)
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["strategy"] == "default"
+        assert data["table_count"] >= 1
+
+    def test_llm_strategy_without_config(self, server_url, wait_for_server):
+        """Test LLM strategy without proper config (should use defaults or work)"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": SAMPLE_HTML_WITH_TABLES,
+            "config": {
+                "strategy": "llm"
+                # Missing required LLM config
+            }
+        }, timeout=120)
+
+        # May succeed with defaults or fail - both are acceptable
+        assert response.status_code in [200, 400, 500]
+
+    def test_financial_strategy(self, server_url, wait_for_server):
+        """Test financial extraction strategy"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": SAMPLE_HTML_WITH_TABLES,
+            "config": {
+                "strategy": "financial",
+                "preserve_formatting": True,
+                "extract_metadata": True
+            }
+        }, timeout=60)
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["strategy"] == "financial"
+
+        # Financial tables should be extracted
+        if data["tables"]:
+            # Should find the financial table in our sample HTML
+            assert data["table_count"] >= 1
+
+    def test_none_strategy(self, server_url, wait_for_server):
+        """Test with 'none' strategy (no extraction)"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": SAMPLE_HTML_WITH_TABLES,
+            "config": {
+                "strategy": "none"
+            }
+        }, timeout=60)
+
+        assert response.status_code == 200
+        data = response.json()
+        # Should return 0 tables
+        assert data["table_count"] == 0
+
+
+class TestTableExtractionConfig:
+    """Test table extraction configuration options (every request is time-bounded)"""
+
+    def test_preserve_formatting_option(self, server_url, wait_for_server):
+        """Test preserve_formatting option"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": SAMPLE_HTML_WITH_TABLES,
+            "config": {
+                "strategy": "financial",
+                "preserve_formatting": True
+            }
+        }, timeout=60)
+
+        assert response.status_code == 200
+
+    def test_extract_metadata_option(self, server_url, wait_for_server):
+        """Test extract_metadata option"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": SAMPLE_HTML_WITH_TABLES,
+            "config": {
+                "strategy": "financial",
+                "extract_metadata": True
+            }
+        }, timeout=60)
+
+        assert response.status_code == 200
+        data = response.json()
+
+        # Only the container type is checked here; metadata keys are not asserted
+        if data["tables"]:
+            table = data["tables"][0]
+            assert isinstance(table, dict)
+
+
+class TestErrorHandling:
+    """Test error handling for table extraction (every request is time-bounded)"""
+
+    def test_malformed_html(self, server_url, wait_for_server):
+        """Test handling of malformed HTML"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": "<table><tr><td>incomplete",
+            "config": {"strategy": "default"}
+        }, timeout=60)
+
+        # Should handle gracefully (either return empty or partial results)
+        assert response.status_code in [200, 400, 500]
+
+    def test_empty_html(self, server_url, wait_for_server):
+        """Test handling of empty HTML"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": "",
+            "config": {"strategy": "default"}
+        }, timeout=60)
+
+        # May be rejected as invalid or processed as empty
+        assert response.status_code in [200, 400]
+        if response.status_code == 200:
+            data = response.json()
+            assert data["table_count"] == 0
+
+    def test_html_without_tables(self, server_url, wait_for_server):
+        """Test HTML with no tables"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": "<html><body><p>No tables here</p></body></html>",
+            "config": {"strategy": "default"}
+        }, timeout=60)
+
+        assert response.status_code == 200
+        data = response.json()
+        assert data["table_count"] == 0
+
+    def test_invalid_strategy(self, server_url, wait_for_server):
+        """Test invalid strategy name"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": SAMPLE_HTML_WITH_TABLES,
+            "config": {"strategy": "invalid_strategy"}
+        }, timeout=60)
+
+        # Should return validation error (400 or 422 from Pydantic)
+        assert response.status_code in [400, 422]
+
+    def test_missing_config(self, server_url, wait_for_server):
+        """Test missing configuration"""
+        response = requests.post(f"{server_url}/tables/extract", json={
+            "html": SAMPLE_HTML_WITH_TABLES
+            # Missing config
+        }, timeout=60)
+
+        # Should use default config or return error
+        assert response.status_code in [200, 400]
+
+
+# Run tests
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
--- a/tests/docker/test_table_extraction_quick.py
+++ b/tests/docker/test_table_extraction_quick.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+"""
+Quick test script for Table Extraction feature
+Tests the /tables/extract endpoint with sample HTML
+
+Usage:
+1. Start the server: python deploy/docker/server.py
+2. Run this script: python tests/docker/test_table_extraction_quick.py
+"""
+
+import requests
+import json
+import sys
+
+# Sample HTML with tables
+SAMPLE_HTML = """
+<!DOCTYPE html>
+<html>
+<body>
+    <h1>Test Tables</h1>
+    
+    <table id="simple">
+        <tr><th>Name</th><th>Age</th><th>City</th></tr>
+        <tr><td>Alice</td><td>25</td><td>New York</td></tr>
+        <tr><td>Bob</td><td>30</td><td>San Francisco</td></tr>
+        <tr><td>Charlie</td><td>35</td><td>Los Angeles</td></tr>
+    </table>
+    
+    <table id="financial">
+        <thead>
+            <tr><th>Quarter</th><th>Revenue</th><th>Profit</th></tr>
+        </thead>
+        <tbody>
+            <tr><td>Q1 2024</td><td>$1,250,000.00</td><td>$400,000.00</td></tr>
+            <tr><td>Q2 2024</td><td>$1,500,000.00</td><td>$600,000.00</td></tr>
+            <tr><td>Q3 2024</td><td>$1,750,000.00</td><td>$700,000.00</td></tr>
+        </tbody>
+    </table>
+</body>
+</html>
+"""
+
+BASE_URL = "http://localhost:11235"
+
+
+def test_server_health():
+    """Return True if GET {BASE_URL}/health answers 200 within 2s; otherwise print the reason and return False."""
+    try:
+        response = requests.get(f"{BASE_URL}/health", timeout=2)
+        if response.status_code == 200:
+            print("✅ Server is running")
+            return True
+        else:
+            print(f"❌ Server health check failed: {response.status_code}")
+            return False
+    except requests.exceptions.RequestException as e:
+        print(f"❌ Server not reachable: {e}")
+        print("\n💡 Start the server with: python deploy/docker/server.py")
+        return False
+
+
+def test_default_strategy():
+    """POST SAMPLE_HTML with the default strategy; print a per-table summary and return True on HTTP 200."""
+    print("\n📊 Testing DEFAULT strategy...")
+
+    response = requests.post(f"{BASE_URL}/tables/extract", json={
+        "html": SAMPLE_HTML,
+        "config": {
+            "strategy": "default"
+        }
+    }, timeout=60)
+
+    if response.status_code == 200:
+        data = response.json()
+        print("✅ Default strategy works!")
+        print(f"   - Table count: {data['table_count']}")
+        print(f"   - Strategy: {data['strategy']}")
+
+        if data['tables']:
+            for idx, table in enumerate(data['tables']):
+                print(f"   - Table {idx + 1}: {len(table.get('rows', []))} rows")
+
+        return True
+    else:
+        print(f"❌ Failed: {response.status_code}")
+        print(f"   Error: {response.text}")
+        return False
+
+
+def test_financial_strategy():
+    """POST SAMPLE_HTML with the financial strategy; print the summary and return True on HTTP 200."""
+    print("\n💰 Testing FINANCIAL strategy...")
+
+    response = requests.post(f"{BASE_URL}/tables/extract", json={
+        "html": SAMPLE_HTML,
+        "config": {
+            "strategy": "financial",
+            "preserve_formatting": True,
+            "extract_metadata": True
+        }
+    }, timeout=60)
+
+    if response.status_code == 200:
+        data = response.json()
+        print("✅ Financial strategy works!")
+        print(f"   - Table count: {data['table_count']}")
+        print(f"   - Strategy: {data['strategy']}")
+        return True
+    else:
+        print(f"❌ Failed: {response.status_code}")
+        print(f"   Error: {response.text}")
+        return False
+
+
+def test_none_strategy():
+    """POST SAMPLE_HTML with the 'none' strategy; return True only for HTTP 200 with table_count == 0."""
+    print("\n🚫 Testing NONE strategy...")
+
+    response = requests.post(f"{BASE_URL}/tables/extract", json={
+        "html": SAMPLE_HTML,
+        "config": {
+            "strategy": "none"
+        }
+    }, timeout=60)
+
+    if response.status_code == 200:
+        data = response.json()
+        if data['table_count'] == 0:
+            print("✅ None strategy works (correctly extracted 0 tables)")
+            return True
+        print(f"❌ None strategy returned {data['table_count']} tables (expected 0)")
+        return False
+    else:
+        print(f"❌ Failed: {response.status_code}")
+        print(f"   Error: {response.text}")
+        return False
+
+
+def test_batch_extraction():
+    """POST two HTML documents to the batch endpoint; print the summary and return True on HTTP 200."""
+    print("\n📦 Testing BATCH extraction...")
+
+    response = requests.post(f"{BASE_URL}/tables/extract/batch", json={
+        "html_list": [
+            SAMPLE_HTML,
+            "<table><tr><th>Col1</th></tr><tr><td>Val1</td></tr></table>"
+        ],
+        "config": {
+            "strategy": "default"
+        }
+    }, timeout=60)
+
+    if response.status_code == 200:
+        data = response.json()
+        print("✅ Batch extraction works!")
+        print(f"   - Total processed: {data['summary']['total_processed']}")
+        print(f"   - Successful: {data['summary']['successful']}")
+        print(f"   - Total tables: {data['summary']['total_tables_extracted']}")
+        return True
+    else:
+        print(f"❌ Failed: {response.status_code}")
+        print(f"   Error: {response.text}")
+        return False
+
+
+def test_error_handling():
+    """Send both 'html' and 'url' in one request; return True only if the server rejects it with HTTP 400."""
+    print("\n⚠️  Testing ERROR handling...")
+
+    # Test with both html and url (should fail)
+    response = requests.post(f"{BASE_URL}/tables/extract", json={
+        "html": "<table></table>",
+        "url": "https://example.com",
+        "config": {"strategy": "default"}
+    }, timeout=60)
+
+    if response.status_code == 400:
+        print("✅ Error handling works (correctly rejected invalid input)")
+        return True
+    else:
+        print(f"❌ Expected 400 error, got: {response.status_code}")
+        return False
+
+
+def main():
+    """Run the health check and all five quick checks, print a summary, and exit 0 only if every check passed."""
+    print("=" * 60)
+    print("Table Extraction Feature - Quick Test")
+    print("=" * 60)
+    # Abort early (exit code 1) when the server is unreachable
+    if not test_server_health():
+        sys.exit(1)
+    
+    # Run every check in order; each returns True on success
+    results = []
+    results.append(("Default Strategy", test_default_strategy()))
+    results.append(("Financial Strategy", test_financial_strategy()))
+    results.append(("None Strategy", test_none_strategy()))
+    results.append(("Batch Extraction", test_batch_extraction()))
+    results.append(("Error Handling", test_error_handling()))
+    
+    # Summary
+    print("\n" + "=" * 60)
+    print("Test Summary")
+    print("=" * 60)
+    
+    passed = sum(1 for _, result in results if result)
+    total = len(results)
+    
+    for name, result in results:
+        status = "✅ PASS" if result else "❌ FAIL"
+        print(f"{status}: {name}")
+    
+    print(f"\nTotal: {passed}/{total} tests passed")
+    
+    if passed == total:
+        print("\n🎉 All tests passed! Table extraction is working correctly!")
+        sys.exit(0)
+    else:
+        print(f"\n⚠️  {total - passed} test(s) failed")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()