feat: Add table extraction strategies and API documentation

- Implemented table extraction strategies: default, LLM, financial, and none in utils.py.
- Created new API documentation for table extraction endpoints and strategies.
- Added integration tests for table extraction functionality covering various strategies and error handling.
- Developed quick test script for rapid validation of table extraction features.
This commit is contained in:
AHMET YILMAZ
2025-10-17 12:30:37 +08:00
parent 3877335d89
commit 00e9904609
8 changed files with 1979 additions and 3 deletions

View File

@@ -0,0 +1,458 @@
"""
Integration tests for Table Extraction functionality in Crawl4AI Docker Server
Tests cover:
1. Integrated table extraction during crawls
2. Dedicated /tables endpoints
3. All extraction strategies (default, LLM, financial)
4. Batch processing
5. Error handling
Note: These tests require the Docker server to be running at BASE_URL (http://localhost:11234 by default)
Run: python deploy/docker/server.py
"""
import pytest
import requests
import time
from typing import Dict, Any
# Base URL of the API server under test; the fixtures below hand this value to the tests.
# NOTE(review): confirm this port matches the one your server instance actually listens on.
BASE_URL = "http://localhost:11234"
# Sample HTML document sent as the "html" payload in the /tables tests.
# It contains three <table> elements:
#   - "simple":    one header row and two data rows
#   - "financial": thead/tbody layout with currency-formatted cells
#   - "complex":   two header rows that use rowspan/colspan
SAMPLE_HTML_WITH_TABLES = """
<!DOCTYPE html>
<html>
<head><title>Test Page with Tables</title></head>
<body>
<h1>Financial Data</h1>
<!-- Simple table -->
<table id="simple">
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>25</td></tr>
<tr><td>Bob</td><td>30</td></tr>
</table>
<!-- Financial table -->
<table id="financial">
<thead>
<tr><th>Quarter</th><th>Revenue</th><th>Expenses</th><th>Profit</th></tr>
</thead>
<tbody>
<tr><td>Q1 2024</td><td>$1,250,000.00</td><td>$850,000.00</td><td>$400,000.00</td></tr>
<tr><td>Q2 2024</td><td>$1,500,000.00</td><td>$900,000.00</td><td>$600,000.00</td></tr>
</tbody>
</table>
<!-- Complex nested table -->
<table id="complex">
<tr>
<th rowspan="2">Product</th>
<th colspan="2">Sales</th>
</tr>
<tr>
<th>Units</th>
<th>Revenue</th>
</tr>
<tr><td>Widget A</td><td>100</td><td>$5,000</td></tr>
<tr><td>Widget B</td><td>200</td><td>$10,000</td></tr>
</table>
</body>
</html>
"""
@pytest.fixture(scope="module")
def server_url():
    """Provide the base URL of the server under test (module-scoped)."""
    url = BASE_URL
    return url
@pytest.fixture(scope="module")
def wait_for_server():
    """Poll the server's /health endpoint until it answers 200, or skip.

    Up to five attempts are made, one second apart. A connection error and a
    non-200 reply are both treated as "not ready yet", so a server that is
    still starting up is given time instead of being polled in a tight loop.

    Returns:
        True as soon as /health responds with status 200.

    Raises:
        pytest.skip's exception when the server never becomes healthy, which
        skips every test that depends on this fixture.
    """
    max_retries = 5
    for attempt in range(max_retries):
        try:
            response = requests.get(f"{BASE_URL}/health", timeout=2)
        except requests.exceptions.RequestException:
            response = None  # not reachable yet; fall through to the retry delay

        if response is not None and response.status_code == 200:
            return True

        # Pause between attempts, but not after the final one.
        if attempt < max_retries - 1:
            time.sleep(1)

    # Build the message from BASE_URL so it cannot disagree with the address
    # that was actually probed.
    pytest.skip(f"Server not running at {BASE_URL}. Start with: python deploy/docker/server.py")
class TestIntegratedTableExtraction:
    """Checks for table extraction requested through the /crawl endpoint."""

    def test_crawl_with_default_table_extraction(self, server_url, wait_for_server):
        """POST /crawl with the "default" strategy and expect a successful reply."""
        payload = {
            "urls": ["https://example.com/tables"],
            "browser_config": {"headless": True},
            "crawler_config": {},
            "table_extraction": {"strategy": "default"},
        }
        reply = requests.post(f"{server_url}/crawl", json=payload)

        assert reply.status_code == 200
        body = reply.json()
        assert body["success"] is True
        assert "results" in body

        # Only the first result is inspected; a result flagged as failed is accepted.
        results = body["results"]
        if results:
            first = results[0]
            assert "tables" in first or first.get("success") is False

    def test_crawl_with_llm_table_extraction(self, server_url, wait_for_server):
        """POST /crawl with the "llm" strategy using a placeholder API key."""
        llm_settings = {
            "strategy": "llm",
            "llm_provider": "openai",
            "llm_model": "gpt-4",
            "llm_api_key": "test-key",
            "llm_prompt": "Extract financial data from tables",
        }
        payload = {
            "urls": ["https://example.com/financial"],
            "browser_config": {"headless": True},
            "crawler_config": {},
            "table_extraction": llm_settings,
        }
        reply = requests.post(f"{server_url}/crawl", json=payload)

        # The key above is a dummy, so an authentication failure (500) is
        # tolerated alongside success (200).
        assert reply.status_code in (200, 500)

    def test_crawl_with_financial_table_extraction(self, server_url, wait_for_server):
        """POST /crawl with the "financial" strategy and both of its options enabled."""
        payload = {
            "urls": ["https://example.com/stocks"],
            "browser_config": {"headless": True},
            "crawler_config": {},
            "table_extraction": {
                "strategy": "financial",
                "preserve_formatting": True,
                "extract_metadata": True,
            },
        }
        reply = requests.post(f"{server_url}/crawl", json=payload)

        assert reply.status_code == 200
        body = reply.json()
        assert body["success"] is True

    def test_crawl_without_table_extraction(self, server_url, wait_for_server):
        """POST /crawl with no "table_extraction" key; the crawl must still succeed."""
        payload = {
            "urls": ["https://example.com"],
            "browser_config": {"headless": True},
            "crawler_config": {},
        }
        reply = requests.post(f"{server_url}/crawl", json=payload)

        assert reply.status_code == 200
        body = reply.json()
        assert body["success"] is True
class TestDedicatedTableEndpoints:
    """Checks for the dedicated /tables/extract endpoint."""

    def test_extract_tables_from_html(self, server_url, wait_for_server):
        """Submit the sample HTML and expect at least its three tables back."""
        payload = {
            "html": SAMPLE_HTML_WITH_TABLES,
            "config": {"strategy": "default"},
        }
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code == 200
        body = reply.json()
        assert body["success"] is True
        assert body["table_count"] >= 3  # the sample document holds three tables
        assert "tables" in body
        assert body["strategy"] == "default"

        # Shape check on the first table only: it must expose headers or rows.
        tables = body["tables"]
        if tables:
            first_table = tables[0]
            assert "headers" in first_table or "rows" in first_table

    def test_extract_tables_from_url(self, server_url, wait_for_server):
        """Submit a URL instead of HTML; a 500 is tolerated if the fetch fails."""
        payload = {
            "url": "https://example.com/tables",
            "config": {"strategy": "default"},
        }
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code in (200, 500)
        if reply.status_code != 200:
            return

        # On success the body must carry both the flag and the table list.
        body = reply.json()
        assert "success" in body
        assert "tables" in body

    def test_extract_tables_invalid_input(self, server_url, wait_for_server):
        """Omit both "html" and "url" and expect a 400 that names one of them."""
        payload = {"config": {"strategy": "default"}}
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code == 400
        message = reply.text.lower()
        assert "html" in message or "url" in message

    def test_extract_tables_both_html_and_url(self, server_url, wait_for_server):
        """Send "html" and "url" together and expect a 400 mentioning "both"."""
        payload = {
            "html": "<table></table>",
            "url": "https://example.com",
            "config": {"strategy": "default"},
        }
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code == 400
        assert "both" in reply.text.lower()
class TestBatchTableExtraction:
    """Checks for the /tables/extract/batch endpoint."""

    def test_batch_extract_html_list(self, server_url, wait_for_server):
        """Send two HTML documents and expect one result entry per document."""
        documents = [
            SAMPLE_HTML_WITH_TABLES,
            "<table><tr><th>A</th></tr><tr><td>1</td></tr></table>",
        ]
        payload = {"html_list": documents, "config": {"strategy": "default"}}
        reply = requests.post(f"{server_url}/tables/extract/batch", json=payload)

        assert reply.status_code == 200
        body = reply.json()
        assert body["success"] is True
        assert "summary" in body
        summary = body["summary"]
        assert summary["total_processed"] == 2
        assert summary["successful"] >= 0
        assert "results" in body
        assert len(body["results"]) == 2

    def test_batch_extract_url_list(self, server_url, wait_for_server):
        """Send two URLs; a 500 is tolerated because the pages may not be fetchable."""
        payload = {
            "url_list": [
                "https://example.com/page1",
                "https://example.com/page2",
            ],
            "config": {"strategy": "default"},
        }
        reply = requests.post(f"{server_url}/tables/extract/batch", json=payload)

        assert reply.status_code in (200, 500)
        if reply.status_code != 200:
            return

        body = reply.json()
        assert "summary" in body
        assert "results" in body

    def test_batch_extract_mixed(self, server_url, wait_for_server):
        """Send one HTML document plus one URL in the same batch request."""
        payload = {
            "html_list": [SAMPLE_HTML_WITH_TABLES],
            "url_list": ["https://example.com/tables"],
            "config": {"strategy": "default"},
        }
        reply = requests.post(f"{server_url}/tables/extract/batch", json=payload)

        # Fetching the URL may fail server-side, hence 500 is accepted too.
        assert reply.status_code in (200, 500)
        if reply.status_code != 200:
            return

        body = reply.json()
        assert body["success"] is True
        assert body["summary"]["total_processed"] == 2

    def test_batch_extract_empty_list(self, server_url, wait_for_server):
        """Send neither "html_list" nor "url_list" and expect a 400."""
        payload = {"config": {"strategy": "default"}}
        reply = requests.post(f"{server_url}/tables/extract/batch", json=payload)

        assert reply.status_code == 400

    def test_batch_extract_exceeds_limit(self, server_url, wait_for_server):
        """Send 100 documents and expect a 400 that mentions the limit."""
        oversized = ["<table></table>"] * 100  # original test notes the limit as 50
        payload = {"html_list": oversized, "config": {"strategy": "default"}}
        reply = requests.post(f"{server_url}/tables/extract/batch", json=payload)

        assert reply.status_code == 400
        text = reply.text
        assert "50" in text or "limit" in text.lower()
class TestTableExtractionStrategies:
    """Checks each strategy name accepted by /tables/extract."""

    def test_default_strategy(self, server_url, wait_for_server):
        """The "default" strategy echoes its name and finds at least one table."""
        payload = {
            "html": SAMPLE_HTML_WITH_TABLES,
            "config": {"strategy": "default"},
        }
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code == 200
        body = reply.json()
        assert body["strategy"] == "default"
        assert body["table_count"] >= 1

    def test_llm_strategy_without_config(self, server_url, wait_for_server):
        """The "llm" strategy with no LLM settings may succeed or be rejected."""
        # Deliberately no provider/model/key: only the strategy name is sent.
        payload = {
            "html": SAMPLE_HTML_WITH_TABLES,
            "config": {"strategy": "llm"},
        }
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        # Success with defaults, a validation error and a server error are all accepted.
        assert reply.status_code in (200, 400, 500)

    def test_financial_strategy(self, server_url, wait_for_server):
        """The "financial" strategy echoes its name in the response."""
        payload = {
            "html": SAMPLE_HTML_WITH_TABLES,
            "config": {
                "strategy": "financial",
                "preserve_formatting": True,
                "extract_metadata": True,
            },
        }
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code == 200
        body = reply.json()
        assert body["strategy"] == "financial"

        # When tables are returned, the reported count must be positive.
        if body["tables"]:
            assert body["table_count"] >= 1

    def test_none_strategy(self, server_url, wait_for_server):
        """The "none" strategy reports zero extracted tables."""
        payload = {
            "html": SAMPLE_HTML_WITH_TABLES,
            "config": {"strategy": "none"},
        }
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code == 200
        body = reply.json()
        assert body["table_count"] == 0
class TestTableExtractionConfig:
    """Checks individual configuration flags of the "financial" strategy."""

    def test_preserve_formatting_option(self, server_url, wait_for_server):
        """A request with preserve_formatting=True is accepted (status 200)."""
        options = {"strategy": "financial", "preserve_formatting": True}
        payload = {"html": SAMPLE_HTML_WITH_TABLES, "config": options}
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code == 200

    def test_extract_metadata_option(self, server_url, wait_for_server):
        """A request with extract_metadata=True returns tables as dicts."""
        options = {"strategy": "financial", "extract_metadata": True}
        payload = {"html": SAMPLE_HTML_WITH_TABLES, "config": options}
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code == 200
        body = reply.json()

        # Only the container type of the first table is verified here.
        tables = body["tables"]
        if tables:
            first_table = tables[0]
            assert isinstance(first_table, dict)
class TestErrorHandling:
    """Checks how /tables/extract reacts to unusual or invalid input."""

    def test_malformed_html(self, server_url, wait_for_server):
        """Unclosed markup must produce one of the accepted status codes."""
        payload = {
            "html": "<table><tr><td>incomplete",
            "config": {"strategy": "default"},
        }
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        # Empty/partial results, a rejection, or a server error are all accepted.
        assert reply.status_code in (200, 400, 500)

    def test_empty_html(self, server_url, wait_for_server):
        """An empty HTML string is either rejected or yields zero tables."""
        payload = {"html": "", "config": {"strategy": "default"}}
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code in (200, 400)
        if reply.status_code != 200:
            return

        body = reply.json()
        assert body["table_count"] == 0

    def test_html_without_tables(self, server_url, wait_for_server):
        """A document containing no <table> element yields zero tables."""
        payload = {
            "html": "<html><body><p>No tables here</p></body></html>",
            "config": {"strategy": "default"},
        }
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code == 200
        body = reply.json()
        assert body["table_count"] == 0

    def test_invalid_strategy(self, server_url, wait_for_server):
        """An unknown strategy name is rejected with 400 or 422."""
        payload = {
            "html": SAMPLE_HTML_WITH_TABLES,
            "config": {"strategy": "invalid_strategy"},
        }
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code in (400, 422)

    def test_missing_config(self, server_url, wait_for_server):
        """A request without "config" either uses defaults (200) or is rejected (400)."""
        payload = {"html": SAMPLE_HTML_WITH_TABLES}
        reply = requests.post(f"{server_url}/tables/extract", json=payload)

        assert reply.status_code in (200, 400)
# Run this file's tests when executed directly. pytest.main() returns the
# session exit code; re-raise it so a failing run exits non-zero instead of
# always reporting success to the calling shell or CI job.
if __name__ == "__main__":
    raise SystemExit(pytest.main([__file__, "-v"]))

View File

@@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""
Quick test script for Table Extraction feature
Tests the /tables/extract endpoint with sample HTML
Usage:
1. Start the server: python deploy/docker/server.py
2. Run this script: python tests/docker/test_table_extraction_quick.py
"""
import requests
import json
import sys
# Sample HTML document sent as the "html" payload by the checks below.
# It contains two <table> elements: "simple" (one header row, three data rows)
# and "financial" (thead/tbody layout with currency-formatted cells).
SAMPLE_HTML = """
<!DOCTYPE html>
<html>
<body>
<h1>Test Tables</h1>
<table id="simple">
<tr><th>Name</th><th>Age</th><th>City</th></tr>
<tr><td>Alice</td><td>25</td><td>New York</td></tr>
<tr><td>Bob</td><td>30</td><td>San Francisco</td></tr>
<tr><td>Charlie</td><td>35</td><td>Los Angeles</td></tr>
</table>
<table id="financial">
<thead>
<tr><th>Quarter</th><th>Revenue</th><th>Profit</th></tr>
</thead>
<tbody>
<tr><td>Q1 2024</td><td>$1,250,000.00</td><td>$400,000.00</td></tr>
<tr><td>Q2 2024</td><td>$1,500,000.00</td><td>$600,000.00</td></tr>
<tr><td>Q3 2024</td><td>$1,750,000.00</td><td>$700,000.00</td></tr>
</tbody>
</table>
</body>
</html>
"""
# Base URL of the server that every check in this script talks to.
# NOTE(review): confirm this port matches the one your server instance listens on.
BASE_URL = "http://localhost:11234"
def test_server_health():
    """Probe GET /health and report whether the server answered with 200.

    Returns:
        True when the endpoint replies with status 200; False on any other
        status or when the request itself fails.
    """
    try:
        reply = requests.get(f"{BASE_URL}/health", timeout=2)
    except requests.exceptions.RequestException as e:
        print(f"❌ Server not reachable: {e}")
        print("\n💡 Start the server with: python deploy/docker/server.py")
        return False

    healthy = reply.status_code == 200
    if healthy:
        print("✅ Server is running")
    else:
        print(f"❌ Server health check failed: {reply.status_code}")
    return healthy
def test_default_strategy():
    """Run the "default" strategy against SAMPLE_HTML and print what came back.

    Returns:
        True when the endpoint answers 200 with the expected fields; False on
        a non-200 status, a failed or timed-out request, or a response body
        that lacks the expected fields. Failures are reported instead of
        raised so the caller can still print its summary.
    """
    print("\n📊 Testing DEFAULT strategy...")
    payload = {
        "html": SAMPLE_HTML,
        "config": {
            "strategy": "default"
        }
    }
    try:
        # Bounded wait: without a timeout a stalled server would hang the script.
        response = requests.post(f"{BASE_URL}/tables/extract", json=payload, timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"❌ Request failed: {exc}")
        return False

    if response.status_code != 200:
        print(f"❌ Failed: {response.status_code}")
        print(f" Error: {response.text}")
        return False

    try:
        data = response.json()
        table_count = data["table_count"]
        strategy = data["strategy"]
        tables = data["tables"]
    except (ValueError, KeyError, TypeError) as exc:
        print(f"❌ Unexpected response body: {exc!r}")
        return False

    print("✅ Default strategy works!")
    print(f" - Table count: {table_count}")
    print(f" - Strategy: {strategy}")
    for idx, table in enumerate(tables or [], start=1):
        print(f" - Table {idx}: {len(table.get('rows', []))} rows")
    return True
def test_financial_strategy():
    """Run the "financial" strategy (both options enabled) against SAMPLE_HTML.

    Returns:
        True when the endpoint answers 200 with the expected fields; False on
        a non-200 status, a failed or timed-out request, or a response body
        that lacks the expected fields. Failures are reported instead of
        raised so the caller can still print its summary.
    """
    print("\n💰 Testing FINANCIAL strategy...")
    payload = {
        "html": SAMPLE_HTML,
        "config": {
            "strategy": "financial",
            "preserve_formatting": True,
            "extract_metadata": True
        }
    }
    try:
        # Bounded wait: without a timeout a stalled server would hang the script.
        response = requests.post(f"{BASE_URL}/tables/extract", json=payload, timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"❌ Request failed: {exc}")
        return False

    if response.status_code != 200:
        print(f"❌ Failed: {response.status_code}")
        print(f" Error: {response.text}")
        return False

    try:
        data = response.json()
        table_count = data["table_count"]
        strategy = data["strategy"]
    except (ValueError, KeyError, TypeError) as exc:
        print(f"❌ Unexpected response body: {exc!r}")
        return False

    print("✅ Financial strategy works!")
    print(f" - Table count: {table_count}")
    print(f" - Strategy: {strategy}")
    return True
def test_none_strategy():
    """Run the "none" strategy and verify that zero tables are reported.

    Returns:
        True when the endpoint answers 200 with table_count == 0; False on a
        non-zero count, a non-200 status, a failed or timed-out request, or a
        response body without "table_count". Failures are reported instead of
        raised so the caller can still print its summary.
    """
    print("\n🚫 Testing NONE strategy...")
    payload = {
        "html": SAMPLE_HTML,
        "config": {
            "strategy": "none"
        }
    }
    try:
        # Bounded wait: without a timeout a stalled server would hang the script.
        response = requests.post(f"{BASE_URL}/tables/extract", json=payload, timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"❌ Request failed: {exc}")
        return False

    if response.status_code != 200:
        print(f"❌ Failed: {response.status_code}")
        # Show the server's explanation, as the sibling checks do.
        print(f" Error: {response.text}")
        return False

    try:
        table_count = response.json()["table_count"]
    except (ValueError, KeyError, TypeError) as exc:
        print(f"❌ Unexpected response body: {exc!r}")
        return False

    if table_count == 0:
        print("✅ None strategy works (correctly extracted 0 tables)")
        return True

    print(f"❌ None strategy returned {table_count} tables (expected 0)")
    return False
def test_batch_extraction():
    """Send two HTML documents to the batch endpoint and print its summary.

    Returns:
        True when the endpoint answers 200 with the expected summary fields;
        False on a non-200 status, a failed or timed-out request, or a
        response body that lacks those fields. Failures are reported instead
        of raised so the caller can still print its summary.
    """
    print("\n📦 Testing BATCH extraction...")
    payload = {
        "html_list": [
            SAMPLE_HTML,
            "<table><tr><th>Col1</th></tr><tr><td>Val1</td></tr></table>"
        ],
        "config": {
            "strategy": "default"
        }
    }
    try:
        # Bounded wait: without a timeout a stalled server would hang the script.
        response = requests.post(f"{BASE_URL}/tables/extract/batch", json=payload, timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"❌ Request failed: {exc}")
        return False

    if response.status_code != 200:
        print(f"❌ Failed: {response.status_code}")
        print(f" Error: {response.text}")
        return False

    try:
        summary = response.json()["summary"]
        total_processed = summary["total_processed"]
        successful = summary["successful"]
        total_tables = summary["total_tables_extracted"]
    except (ValueError, KeyError, TypeError) as exc:
        print(f"❌ Unexpected response body: {exc!r}")
        return False

    print("✅ Batch extraction works!")
    print(f" - Total processed: {total_processed}")
    print(f" - Successful: {successful}")
    print(f" - Total tables: {total_tables}")
    return True
def test_error_handling():
    """Send "html" and "url" together and verify the server rejects it with 400.

    Returns:
        True when the endpoint answers 400; False on any other status or when
        the request itself fails or times out. Failures are reported instead
        of raised so the caller can still print its summary.
    """
    print("\n⚠️ Testing ERROR handling...")
    # Supplying both sources at once is the invalid input under test.
    payload = {
        "html": "<table></table>",
        "url": "https://example.com",
        "config": {"strategy": "default"}
    }
    try:
        # Bounded wait: without a timeout a stalled server would hang the script.
        response = requests.post(f"{BASE_URL}/tables/extract", json=payload, timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"❌ Request failed: {exc}")
        return False

    if response.status_code == 400:
        print("✅ Error handling works (correctly rejected invalid input)")
        return True

    print(f"❌ Expected 400 error, got: {response.status_code}")
    return False
def main():
    """Run the health check and every quick check, print a summary, then exit.

    Exits with status 1 when the server is unhealthy or any check fails, and
    with status 0 when every check passes.
    """
    rule = "=" * 60
    print(rule)
    print("Table Extraction Feature - Quick Test")
    print(rule)

    # Nothing else is worth running when the server does not respond.
    if not test_server_health():
        sys.exit(1)

    # Checks run in this order; each yields a truthy value on success.
    checks = (
        ("Default Strategy", test_default_strategy),
        ("Financial Strategy", test_financial_strategy),
        ("None Strategy", test_none_strategy),
        ("Batch Extraction", test_batch_extraction),
        ("Error Handling", test_error_handling),
    )
    results = [(label, check()) for label, check in checks]

    print("\n" + rule)
    print("Test Summary")
    print(rule)

    total = len(results)
    passed = len([outcome for _, outcome in results if outcome])
    for label, outcome in results:
        if outcome:
            status = "✅ PASS"
        else:
            status = "❌ FAIL"
        print(f"{status}: {label}")
    print(f"\nTotal: {passed}/{total} tests passed")

    if passed != total:
        print(f"\n⚠️ {total - passed} test(s) failed")
        sys.exit(1)

    print("\n🎉 All tests passed! Table extraction is working correctly!")
    sys.exit(0)
# Script entry point: run the quick checks only when this file is executed directly.
if __name__ == "__main__":
    main()