feat: Add table extraction strategies and API documentation
- Implemented table extraction strategies: default, LLM, financial, and none in utils.py. - Created new API documentation for table extraction endpoints and strategies. - Added integration tests for table extraction functionality covering various strategies and error handling. - Developed quick test script for rapid validation of table extraction features.
This commit is contained in:
458
tests/docker/test_table_extraction.py
Normal file
458
tests/docker/test_table_extraction.py
Normal file
@@ -0,0 +1,458 @@
|
||||
"""
|
||||
Integration tests for Table Extraction functionality in Crawl4AI Docker Server
|
||||
|
||||
Tests cover:
|
||||
1. Integrated table extraction during crawls
|
||||
2. Dedicated /tables endpoints
|
||||
3. All extraction strategies (default, LLM, financial)
|
||||
4. Batch processing
|
||||
5. Error handling
|
||||
|
||||
Note: These tests require the Docker server to be running on localhost:11235
|
||||
Run: python deploy/docker/server.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
# Base URL for the Docker API server
|
||||
BASE_URL = "http://localhost:11234"
|
||||
|
||||
# Sample HTML with tables for testing
|
||||
SAMPLE_HTML_WITH_TABLES = """
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Test Page with Tables</title></head>
|
||||
<body>
|
||||
<h1>Financial Data</h1>
|
||||
|
||||
<!-- Simple table -->
|
||||
<table id="simple">
|
||||
<tr><th>Name</th><th>Age</th></tr>
|
||||
<tr><td>Alice</td><td>25</td></tr>
|
||||
<tr><td>Bob</td><td>30</td></tr>
|
||||
</table>
|
||||
|
||||
<!-- Financial table -->
|
||||
<table id="financial">
|
||||
<thead>
|
||||
<tr><th>Quarter</th><th>Revenue</th><th>Expenses</th><th>Profit</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr><td>Q1 2024</td><td>$1,250,000.00</td><td>$850,000.00</td><td>$400,000.00</td></tr>
|
||||
<tr><td>Q2 2024</td><td>$1,500,000.00</td><td>$900,000.00</td><td>$600,000.00</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<!-- Complex nested table -->
|
||||
<table id="complex">
|
||||
<tr>
|
||||
<th rowspan="2">Product</th>
|
||||
<th colspan="2">Sales</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Units</th>
|
||||
<th>Revenue</th>
|
||||
</tr>
|
||||
<tr><td>Widget A</td><td>100</td><td>$5,000</td></tr>
|
||||
<tr><td>Widget B</td><td>200</td><td>$10,000</td></tr>
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server_url():
|
||||
"""Return the server URL"""
|
||||
return BASE_URL
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def wait_for_server():
|
||||
"""Wait for server to be ready"""
|
||||
max_retries = 5
|
||||
for i in range(max_retries):
|
||||
try:
|
||||
response = requests.get(f"{BASE_URL}/health", timeout=2)
|
||||
if response.status_code == 200:
|
||||
return True
|
||||
except requests.exceptions.RequestException:
|
||||
if i < max_retries - 1:
|
||||
time.sleep(1)
|
||||
pytest.skip("Server not running on localhost:11235. Start with: python deploy/docker/server.py")
|
||||
|
||||
|
||||
class TestIntegratedTableExtraction:
|
||||
"""Test table extraction integrated with /crawl endpoint"""
|
||||
|
||||
def test_crawl_with_default_table_extraction(self, server_url, wait_for_server):
|
||||
"""Test crawling with default table extraction strategy"""
|
||||
response = requests.post(f"{server_url}/crawl", json={
|
||||
"urls": ["https://example.com/tables"],
|
||||
"browser_config": {"headless": True},
|
||||
"crawler_config": {},
|
||||
"table_extraction": {
|
||||
"strategy": "default"
|
||||
}
|
||||
})
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["success"] is True
|
||||
assert "results" in data
|
||||
|
||||
# Check first result has tables
|
||||
if data["results"]:
|
||||
result = data["results"][0]
|
||||
assert "tables" in result or result.get("success") is False
|
||||
|
||||
def test_crawl_with_llm_table_extraction(self, server_url, wait_for_server):
|
||||
"""Test crawling with LLM table extraction strategy"""
|
||||
response = requests.post(f"{server_url}/crawl", json={
|
||||
"urls": ["https://example.com/financial"],
|
||||
"browser_config": {"headless": True},
|
||||
"crawler_config": {},
|
||||
"table_extraction": {
|
||||
"strategy": "llm",
|
||||
"llm_provider": "openai",
|
||||
"llm_model": "gpt-4",
|
||||
"llm_api_key": "test-key",
|
||||
"llm_prompt": "Extract financial data from tables"
|
||||
}
|
||||
})
|
||||
|
||||
# Should fail without valid API key, but structure should be correct
|
||||
# In real scenario with valid key, this would succeed
|
||||
assert response.status_code in [200, 500] # May fail on auth
|
||||
|
||||
def test_crawl_with_financial_table_extraction(self, server_url, wait_for_server):
|
||||
"""Test crawling with financial table extraction strategy"""
|
||||
response = requests.post(f"{server_url}/crawl", json={
|
||||
"urls": ["https://example.com/stocks"],
|
||||
"browser_config": {"headless": True},
|
||||
"crawler_config": {},
|
||||
"table_extraction": {
|
||||
"strategy": "financial",
|
||||
"preserve_formatting": True,
|
||||
"extract_metadata": True
|
||||
}
|
||||
})
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["success"] is True
|
||||
|
||||
def test_crawl_without_table_extraction(self, server_url, wait_for_server):
|
||||
"""Test crawling without table extraction (should work normally)"""
|
||||
response = requests.post(f"{server_url}/crawl", json={
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {"headless": True},
|
||||
"crawler_config": {}
|
||||
})
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["success"] is True
|
||||
|
||||
|
||||
class TestDedicatedTableEndpoints:
|
||||
"""Test dedicated /tables endpoints"""
|
||||
|
||||
def test_extract_tables_from_html(self, server_url, wait_for_server):
|
||||
"""Test extracting tables from provided HTML"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": SAMPLE_HTML_WITH_TABLES,
|
||||
"config": {
|
||||
"strategy": "default"
|
||||
}
|
||||
})
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["success"] is True
|
||||
assert data["table_count"] >= 3 # Should find at least 3 tables
|
||||
assert "tables" in data
|
||||
assert data["strategy"] == "default"
|
||||
|
||||
# Verify table structure
|
||||
if data["tables"]:
|
||||
table = data["tables"][0]
|
||||
assert "headers" in table or "rows" in table
|
||||
|
||||
def test_extract_tables_from_url(self, server_url, wait_for_server):
|
||||
"""Test extracting tables by fetching URL"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"url": "https://example.com/tables",
|
||||
"config": {
|
||||
"strategy": "default"
|
||||
}
|
||||
})
|
||||
|
||||
# May fail if URL doesn't exist, but structure should be correct
|
||||
assert response.status_code in [200, 500]
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
assert "success" in data
|
||||
assert "tables" in data
|
||||
|
||||
def test_extract_tables_invalid_input(self, server_url, wait_for_server):
|
||||
"""Test error handling for invalid input"""
|
||||
# No html or url provided
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"config": {"strategy": "default"}
|
||||
})
|
||||
|
||||
assert response.status_code == 400
|
||||
assert "html" in response.text.lower() or "url" in response.text.lower()
|
||||
|
||||
def test_extract_tables_both_html_and_url(self, server_url, wait_for_server):
|
||||
"""Test error when both html and url are provided"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": "<table></table>",
|
||||
"url": "https://example.com",
|
||||
"config": {"strategy": "default"}
|
||||
})
|
||||
|
||||
assert response.status_code == 400
|
||||
assert "both" in response.text.lower()
|
||||
|
||||
|
||||
class TestBatchTableExtraction:
|
||||
"""Test batch table extraction endpoints"""
|
||||
|
||||
def test_batch_extract_html_list(self, server_url, wait_for_server):
|
||||
"""Test batch extraction from multiple HTML contents"""
|
||||
response = requests.post(f"{server_url}/tables/extract/batch", json={
|
||||
"html_list": [
|
||||
SAMPLE_HTML_WITH_TABLES,
|
||||
"<table><tr><th>A</th></tr><tr><td>1</td></tr></table>",
|
||||
],
|
||||
"config": {"strategy": "default"}
|
||||
})
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["success"] is True
|
||||
assert "summary" in data
|
||||
assert data["summary"]["total_processed"] == 2
|
||||
assert data["summary"]["successful"] >= 0
|
||||
assert "results" in data
|
||||
assert len(data["results"]) == 2
|
||||
|
||||
def test_batch_extract_url_list(self, server_url, wait_for_server):
|
||||
"""Test batch extraction from multiple URLs"""
|
||||
response = requests.post(f"{server_url}/tables/extract/batch", json={
|
||||
"url_list": [
|
||||
"https://example.com/page1",
|
||||
"https://example.com/page2",
|
||||
],
|
||||
"config": {"strategy": "default"}
|
||||
})
|
||||
|
||||
# May have mixed success/failure depending on URLs
|
||||
assert response.status_code in [200, 500]
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
assert "summary" in data
|
||||
assert "results" in data
|
||||
|
||||
def test_batch_extract_mixed(self, server_url, wait_for_server):
|
||||
"""Test batch extraction from both HTML and URLs"""
|
||||
response = requests.post(f"{server_url}/tables/extract/batch", json={
|
||||
"html_list": [SAMPLE_HTML_WITH_TABLES],
|
||||
"url_list": ["https://example.com/tables"],
|
||||
"config": {"strategy": "default"}
|
||||
})
|
||||
|
||||
# May fail on URL crawling but should handle mixed input
|
||||
assert response.status_code in [200, 500]
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
assert data["success"] is True
|
||||
assert data["summary"]["total_processed"] == 2
|
||||
|
||||
def test_batch_extract_empty_list(self, server_url, wait_for_server):
|
||||
"""Test error when no items provided for batch"""
|
||||
response = requests.post(f"{server_url}/tables/extract/batch", json={
|
||||
"config": {"strategy": "default"}
|
||||
})
|
||||
|
||||
assert response.status_code == 400
|
||||
|
||||
def test_batch_extract_exceeds_limit(self, server_url, wait_for_server):
|
||||
"""Test error when batch size exceeds limit"""
|
||||
response = requests.post(f"{server_url}/tables/extract/batch", json={
|
||||
"html_list": ["<table></table>"] * 100, # 100 items (limit is 50)
|
||||
"config": {"strategy": "default"}
|
||||
})
|
||||
|
||||
assert response.status_code == 400
|
||||
assert "50" in response.text or "limit" in response.text.lower()
|
||||
|
||||
|
||||
class TestTableExtractionStrategies:
|
||||
"""Test different table extraction strategies"""
|
||||
|
||||
def test_default_strategy(self, server_url, wait_for_server):
|
||||
"""Test default (regex-based) extraction strategy"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": SAMPLE_HTML_WITH_TABLES,
|
||||
"config": {
|
||||
"strategy": "default"
|
||||
}
|
||||
})
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["strategy"] == "default"
|
||||
assert data["table_count"] >= 1
|
||||
|
||||
def test_llm_strategy_without_config(self, server_url, wait_for_server):
|
||||
"""Test LLM strategy without proper config (should use defaults or work)"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": SAMPLE_HTML_WITH_TABLES,
|
||||
"config": {
|
||||
"strategy": "llm"
|
||||
# Missing required LLM config
|
||||
}
|
||||
})
|
||||
|
||||
# May succeed with defaults or fail - both are acceptable
|
||||
assert response.status_code in [200, 400, 500]
|
||||
|
||||
def test_financial_strategy(self, server_url, wait_for_server):
|
||||
"""Test financial extraction strategy"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": SAMPLE_HTML_WITH_TABLES,
|
||||
"config": {
|
||||
"strategy": "financial",
|
||||
"preserve_formatting": True,
|
||||
"extract_metadata": True
|
||||
}
|
||||
})
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["strategy"] == "financial"
|
||||
|
||||
# Financial tables should be extracted
|
||||
if data["tables"]:
|
||||
# Should find the financial table in our sample HTML
|
||||
assert data["table_count"] >= 1
|
||||
|
||||
def test_none_strategy(self, server_url, wait_for_server):
|
||||
"""Test with 'none' strategy (no extraction)"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": SAMPLE_HTML_WITH_TABLES,
|
||||
"config": {
|
||||
"strategy": "none"
|
||||
}
|
||||
})
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
# Should return 0 tables
|
||||
assert data["table_count"] == 0
|
||||
|
||||
|
||||
class TestTableExtractionConfig:
|
||||
"""Test table extraction configuration options"""
|
||||
|
||||
def test_preserve_formatting_option(self, server_url, wait_for_server):
|
||||
"""Test preserve_formatting option"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": SAMPLE_HTML_WITH_TABLES,
|
||||
"config": {
|
||||
"strategy": "financial",
|
||||
"preserve_formatting": True
|
||||
}
|
||||
})
|
||||
|
||||
assert response.status_code == 200
|
||||
|
||||
def test_extract_metadata_option(self, server_url, wait_for_server):
|
||||
"""Test extract_metadata option"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": SAMPLE_HTML_WITH_TABLES,
|
||||
"config": {
|
||||
"strategy": "financial",
|
||||
"extract_metadata": True
|
||||
}
|
||||
})
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
# Check if tables have metadata when requested
|
||||
if data["tables"]:
|
||||
table = data["tables"][0]
|
||||
assert isinstance(table, dict)
|
||||
|
||||
|
||||
class TestErrorHandling:
|
||||
"""Test error handling for table extraction"""
|
||||
|
||||
def test_malformed_html(self, server_url, wait_for_server):
|
||||
"""Test handling of malformed HTML"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": "<table><tr><td>incomplete",
|
||||
"config": {"strategy": "default"}
|
||||
})
|
||||
|
||||
# Should handle gracefully (either return empty or partial results)
|
||||
assert response.status_code in [200, 400, 500]
|
||||
|
||||
def test_empty_html(self, server_url, wait_for_server):
|
||||
"""Test handling of empty HTML"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": "",
|
||||
"config": {"strategy": "default"}
|
||||
})
|
||||
|
||||
# May be rejected as invalid or processed as empty
|
||||
assert response.status_code in [200, 400]
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
assert data["table_count"] == 0
|
||||
|
||||
def test_html_without_tables(self, server_url, wait_for_server):
|
||||
"""Test HTML with no tables"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": "<html><body><p>No tables here</p></body></html>",
|
||||
"config": {"strategy": "default"}
|
||||
})
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["table_count"] == 0
|
||||
|
||||
def test_invalid_strategy(self, server_url, wait_for_server):
|
||||
"""Test invalid strategy name"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": SAMPLE_HTML_WITH_TABLES,
|
||||
"config": {"strategy": "invalid_strategy"}
|
||||
})
|
||||
|
||||
# Should return validation error (400 or 422 from Pydantic)
|
||||
assert response.status_code in [400, 422]
|
||||
|
||||
def test_missing_config(self, server_url, wait_for_server):
|
||||
"""Test missing configuration"""
|
||||
response = requests.post(f"{server_url}/tables/extract", json={
|
||||
"html": SAMPLE_HTML_WITH_TABLES
|
||||
# Missing config
|
||||
})
|
||||
|
||||
# Should use default config or return error
|
||||
assert response.status_code in [200, 400]
|
||||
|
||||
|
||||
# Run tests
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
Reference in New Issue
Block a user