feat: Add table extraction strategies and API documentation
- Implemented table extraction strategies: default, LLM, financial, and none in utils.py. - Created new API documentation for table extraction endpoints and strategies. - Added integration tests for table extraction functionality covering various strategies and error handling. - Developed quick test script for rapid validation of table extraction features.
This commit is contained in:
225
tests/docker/test_table_extraction_quick.py
Normal file
225
tests/docker/test_table_extraction_quick.py
Normal file
@@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick test script for Table Extraction feature
|
||||
Tests the /tables/extract endpoint with sample HTML
|
||||
|
||||
Usage:
|
||||
1. Start the server: python deploy/docker/server.py
|
||||
2. Run this script: python tests/docker/test_table_extraction_quick.py
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import sys
|
||||
|
||||
# Sample HTML with tables
|
||||
SAMPLE_HTML = """
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
<h1>Test Tables</h1>
|
||||
|
||||
<table id="simple">
|
||||
<tr><th>Name</th><th>Age</th><th>City</th></tr>
|
||||
<tr><td>Alice</td><td>25</td><td>New York</td></tr>
|
||||
<tr><td>Bob</td><td>30</td><td>San Francisco</td></tr>
|
||||
<tr><td>Charlie</td><td>35</td><td>Los Angeles</td></tr>
|
||||
</table>
|
||||
|
||||
<table id="financial">
|
||||
<thead>
|
||||
<tr><th>Quarter</th><th>Revenue</th><th>Profit</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr><td>Q1 2024</td><td>$1,250,000.00</td><td>$400,000.00</td></tr>
|
||||
<tr><td>Q2 2024</td><td>$1,500,000.00</td><td>$600,000.00</td></tr>
|
||||
<tr><td>Q3 2024</td><td>$1,750,000.00</td><td>$700,000.00</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
BASE_URL = "http://localhost:11234"
|
||||
|
||||
|
||||
def test_server_health():
|
||||
"""Check if server is running"""
|
||||
try:
|
||||
response = requests.get(f"{BASE_URL}/health", timeout=2)
|
||||
if response.status_code == 200:
|
||||
print("✅ Server is running")
|
||||
return True
|
||||
else:
|
||||
print(f"❌ Server health check failed: {response.status_code}")
|
||||
return False
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"❌ Server not reachable: {e}")
|
||||
print("\n💡 Start the server with: python deploy/docker/server.py")
|
||||
return False
|
||||
|
||||
|
||||
def test_default_strategy():
|
||||
"""Test default table extraction strategy"""
|
||||
print("\n📊 Testing DEFAULT strategy...")
|
||||
|
||||
response = requests.post(f"{BASE_URL}/tables/extract", json={
|
||||
"html": SAMPLE_HTML,
|
||||
"config": {
|
||||
"strategy": "default"
|
||||
}
|
||||
})
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
print(f"✅ Default strategy works!")
|
||||
print(f" - Table count: {data['table_count']}")
|
||||
print(f" - Strategy: {data['strategy']}")
|
||||
|
||||
if data['tables']:
|
||||
for idx, table in enumerate(data['tables']):
|
||||
print(f" - Table {idx + 1}: {len(table.get('rows', []))} rows")
|
||||
|
||||
return True
|
||||
else:
|
||||
print(f"❌ Failed: {response.status_code}")
|
||||
print(f" Error: {response.text}")
|
||||
return False
|
||||
|
||||
|
||||
def test_financial_strategy():
|
||||
"""Test financial table extraction strategy"""
|
||||
print("\n💰 Testing FINANCIAL strategy...")
|
||||
|
||||
response = requests.post(f"{BASE_URL}/tables/extract", json={
|
||||
"html": SAMPLE_HTML,
|
||||
"config": {
|
||||
"strategy": "financial",
|
||||
"preserve_formatting": True,
|
||||
"extract_metadata": True
|
||||
}
|
||||
})
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
print(f"✅ Financial strategy works!")
|
||||
print(f" - Table count: {data['table_count']}")
|
||||
print(f" - Strategy: {data['strategy']}")
|
||||
return True
|
||||
else:
|
||||
print(f"❌ Failed: {response.status_code}")
|
||||
print(f" Error: {response.text}")
|
||||
return False
|
||||
|
||||
|
||||
def test_none_strategy():
|
||||
"""Test none strategy (no extraction)"""
|
||||
print("\n🚫 Testing NONE strategy...")
|
||||
|
||||
response = requests.post(f"{BASE_URL}/tables/extract", json={
|
||||
"html": SAMPLE_HTML,
|
||||
"config": {
|
||||
"strategy": "none"
|
||||
}
|
||||
})
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
if data['table_count'] == 0:
|
||||
print(f"✅ None strategy works (correctly extracted 0 tables)")
|
||||
return True
|
||||
else:
|
||||
print(f"❌ None strategy returned {data['table_count']} tables (expected 0)")
|
||||
return False
|
||||
else:
|
||||
print(f"❌ Failed: {response.status_code}")
|
||||
return False
|
||||
|
||||
|
||||
def test_batch_extraction():
|
||||
"""Test batch extraction"""
|
||||
print("\n📦 Testing BATCH extraction...")
|
||||
|
||||
response = requests.post(f"{BASE_URL}/tables/extract/batch", json={
|
||||
"html_list": [
|
||||
SAMPLE_HTML,
|
||||
"<table><tr><th>Col1</th></tr><tr><td>Val1</td></tr></table>"
|
||||
],
|
||||
"config": {
|
||||
"strategy": "default"
|
||||
}
|
||||
})
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
print(f"✅ Batch extraction works!")
|
||||
print(f" - Total processed: {data['summary']['total_processed']}")
|
||||
print(f" - Successful: {data['summary']['successful']}")
|
||||
print(f" - Total tables: {data['summary']['total_tables_extracted']}")
|
||||
return True
|
||||
else:
|
||||
print(f"❌ Failed: {response.status_code}")
|
||||
print(f" Error: {response.text}")
|
||||
return False
|
||||
|
||||
|
||||
def test_error_handling():
|
||||
"""Test error handling"""
|
||||
print("\n⚠️ Testing ERROR handling...")
|
||||
|
||||
# Test with both html and url (should fail)
|
||||
response = requests.post(f"{BASE_URL}/tables/extract", json={
|
||||
"html": "<table></table>",
|
||||
"url": "https://example.com",
|
||||
"config": {"strategy": "default"}
|
||||
})
|
||||
|
||||
if response.status_code == 400:
|
||||
print(f"✅ Error handling works (correctly rejected invalid input)")
|
||||
return True
|
||||
else:
|
||||
print(f"❌ Expected 400 error, got: {response.status_code}")
|
||||
return False
|
||||
|
||||
|
||||
def main():
|
||||
print("=" * 60)
|
||||
print("Table Extraction Feature - Quick Test")
|
||||
print("=" * 60)
|
||||
|
||||
# Check server
|
||||
if not test_server_health():
|
||||
sys.exit(1)
|
||||
|
||||
# Run tests
|
||||
results = []
|
||||
results.append(("Default Strategy", test_default_strategy()))
|
||||
results.append(("Financial Strategy", test_financial_strategy()))
|
||||
results.append(("None Strategy", test_none_strategy()))
|
||||
results.append(("Batch Extraction", test_batch_extraction()))
|
||||
results.append(("Error Handling", test_error_handling()))
|
||||
|
||||
# Summary
|
||||
print("\n" + "=" * 60)
|
||||
print("Test Summary")
|
||||
print("=" * 60)
|
||||
|
||||
passed = sum(1 for _, result in results if result)
|
||||
total = len(results)
|
||||
|
||||
for name, result in results:
|
||||
status = "✅ PASS" if result else "❌ FAIL"
|
||||
print(f"{status}: {name}")
|
||||
|
||||
print(f"\nTotal: {passed}/{total} tests passed")
|
||||
|
||||
if passed == total:
|
||||
print("\n🎉 All tests passed! Table extraction is working correctly!")
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(f"\n⚠️ {total - passed} test(s) failed")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user