""" Integration tests for Table Extraction functionality in Crawl4AI Docker Server Tests cover: 1. Integrated table extraction during crawls 2. Dedicated /tables endpoints 3. All extraction strategies (default, LLM, financial) 4. Batch processing 5. Error handling Note: These tests require the Docker server to be running on localhost:11235 Run: python deploy/docker/server.py """ import pytest import requests import time from typing import Dict, Any # Base URL for the Docker API server BASE_URL = "http://localhost:11234" # Sample HTML with tables for testing SAMPLE_HTML_WITH_TABLES = """ Test Page with Tables

Financial Data

Name	Age
Alice	25
Bob	30

Quarter	Revenue	Expenses	Profit
Q1 2024	$1,250,000.00	$850,000.00	$400,000.00
Q2 2024	$1,500,000.00	$900,000.00	$600,000.00

Product	Sales
Product	Units	Revenue
Widget A	100	$5,000
Widget B	200	$10,000

""" @pytest.fixture(scope="module") def server_url(): """Return the server URL""" return BASE_URL @pytest.fixture(scope="module") def wait_for_server(): """Wait for server to be ready""" max_retries = 5 for i in range(max_retries): try: response = requests.get(f"{BASE_URL}/health", timeout=2) if response.status_code == 200: return True except requests.exceptions.RequestException: if i < max_retries - 1: time.sleep(1) pytest.skip("Server not running on localhost:11235. Start with: python deploy/docker/server.py") class TestIntegratedTableExtraction: """Test table extraction integrated with /crawl endpoint""" def test_crawl_with_default_table_extraction(self, server_url, wait_for_server): """Test crawling with default table extraction strategy""" response = requests.post(f"{server_url}/crawl", json={ "urls": ["https://example.com/tables"], "browser_config": {"headless": True}, "crawler_config": {}, "table_extraction": { "strategy": "default" } }) assert response.status_code == 200 data = response.json() assert data["success"] is True assert "results" in data # Check first result has tables if data["results"]: result = data["results"][0] assert "tables" in result or result.get("success") is False def test_crawl_with_llm_table_extraction(self, server_url, wait_for_server): """Test crawling with LLM table extraction strategy""" response = requests.post(f"{server_url}/crawl", json={ "urls": ["https://example.com/financial"], "browser_config": {"headless": True}, "crawler_config": {}, "table_extraction": { "strategy": "llm", "llm_provider": "openai", "llm_model": "gpt-4", "llm_api_key": "test-key", "llm_prompt": "Extract financial data from tables" } }) # Should fail without valid API key, but structure should be correct # In real scenario with valid key, this would succeed assert response.status_code in [200, 500] # May fail on auth def test_crawl_with_financial_table_extraction(self, server_url, wait_for_server): """Test crawling with financial table extraction strategy""" response = requests.post(f"{server_url}/crawl", json={ "urls": ["https://example.com/stocks"], "browser_config": {"headless": True}, "crawler_config": {}, "table_extraction": { "strategy": "financial", "preserve_formatting": True, "extract_metadata": True } }) assert response.status_code == 200 data = response.json() assert data["success"] is True def test_crawl_without_table_extraction(self, server_url, wait_for_server): """Test crawling without table extraction (should work normally)""" response = requests.post(f"{server_url}/crawl", json={ "urls": ["https://example.com"], "browser_config": {"headless": True}, "crawler_config": {} }) assert response.status_code == 200 data = response.json() assert data["success"] is True class TestDedicatedTableEndpoints: """Test dedicated /tables endpoints""" def test_extract_tables_from_html(self, server_url, wait_for_server): """Test extracting tables from provided HTML""" response = requests.post(f"{server_url}/tables/extract", json={ "html": SAMPLE_HTML_WITH_TABLES, "config": { "strategy": "default" } }) assert response.status_code == 200 data = response.json() assert data["success"] is True assert data["table_count"] >= 3 # Should find at least 3 tables assert "tables" in data assert data["strategy"] == "default" # Verify table structure if data["tables"]: table = data["tables"][0] assert "headers" in table or "rows" in table def test_extract_tables_from_url(self, server_url, wait_for_server): """Test extracting tables by fetching URL""" response = requests.post(f"{server_url}/tables/extract", json={ "url": "https://example.com/tables", "config": { "strategy": "default" } }) # May fail if URL doesn't exist, but structure should be correct assert response.status_code in [200, 500] if response.status_code == 200: data = response.json() assert "success" in data assert "tables" in data def test_extract_tables_invalid_input(self, server_url, wait_for_server): """Test error handling for invalid input""" # No html or url provided response = requests.post(f"{server_url}/tables/extract", json={ "config": {"strategy": "default"} }) assert response.status_code == 400 assert "html" in response.text.lower() or "url" in response.text.lower() def test_extract_tables_both_html_and_url(self, server_url, wait_for_server): """Test error when both html and url are provided""" response = requests.post(f"{server_url}/tables/extract", json={ "html": "

", "url": "https://example.com", "config": {"strategy": "default"} }) assert response.status_code == 400 assert "both" in response.text.lower() class TestBatchTableExtraction: """Test batch table extraction endpoints""" def test_batch_extract_html_list(self, server_url, wait_for_server): """Test batch extraction from multiple HTML contents""" response = requests.post(f"{server_url}/tables/extract/batch", json={ "html_list": [ SAMPLE_HTML_WITH_TABLES, "

A
1

", ], "config": {"strategy": "default"} }) assert response.status_code == 200 data = response.json() assert data["success"] is True assert "summary" in data assert data["summary"]["total_processed"] == 2 assert data["summary"]["successful"] >= 0 assert "results" in data assert len(data["results"]) == 2 def test_batch_extract_url_list(self, server_url, wait_for_server): """Test batch extraction from multiple URLs""" response = requests.post(f"{server_url}/tables/extract/batch", json={ "url_list": [ "https://example.com/page1", "https://example.com/page2", ], "config": {"strategy": "default"} }) # May have mixed success/failure depending on URLs assert response.status_code in [200, 500] if response.status_code == 200: data = response.json() assert "summary" in data assert "results" in data def test_batch_extract_mixed(self, server_url, wait_for_server): """Test batch extraction from both HTML and URLs""" response = requests.post(f"{server_url}/tables/extract/batch", json={ "html_list": [SAMPLE_HTML_WITH_TABLES], "url_list": ["https://example.com/tables"], "config": {"strategy": "default"} }) # May fail on URL crawling but should handle mixed input assert response.status_code in [200, 500] if response.status_code == 200: data = response.json() assert data["success"] is True assert data["summary"]["total_processed"] == 2 def test_batch_extract_empty_list(self, server_url, wait_for_server): """Test error when no items provided for batch""" response = requests.post(f"{server_url}/tables/extract/batch", json={ "config": {"strategy": "default"} }) assert response.status_code == 400 def test_batch_extract_exceeds_limit(self, server_url, wait_for_server): """Test error when batch size exceeds limit""" response = requests.post(f"{server_url}/tables/extract/batch", json={ "html_list": ["

"] * 100, # 100 items (limit is 50) "config": {"strategy": "default"} }) assert response.status_code == 400 assert "50" in response.text or "limit" in response.text.lower() class TestTableExtractionStrategies: """Test different table extraction strategies""" def test_default_strategy(self, server_url, wait_for_server): """Test default (regex-based) extraction strategy""" response = requests.post(f"{server_url}/tables/extract", json={ "html": SAMPLE_HTML_WITH_TABLES, "config": { "strategy": "default" } }) assert response.status_code == 200 data = response.json() assert data["strategy"] == "default" assert data["table_count"] >= 1 def test_llm_strategy_without_config(self, server_url, wait_for_server): """Test LLM strategy without proper config (should use defaults or work)""" response = requests.post(f"{server_url}/tables/extract", json={ "html": SAMPLE_HTML_WITH_TABLES, "config": { "strategy": "llm" # Missing required LLM config } }) # May succeed with defaults or fail - both are acceptable assert response.status_code in [200, 400, 500] def test_financial_strategy(self, server_url, wait_for_server): """Test financial extraction strategy""" response = requests.post(f"{server_url}/tables/extract", json={ "html": SAMPLE_HTML_WITH_TABLES, "config": { "strategy": "financial", "preserve_formatting": True, "extract_metadata": True } }) assert response.status_code == 200 data = response.json() assert data["strategy"] == "financial" # Financial tables should be extracted if data["tables"]: # Should find the financial table in our sample HTML assert data["table_count"] >= 1 def test_none_strategy(self, server_url, wait_for_server): """Test with 'none' strategy (no extraction)""" response = requests.post(f"{server_url}/tables/extract", json={ "html": SAMPLE_HTML_WITH_TABLES, "config": { "strategy": "none" } }) assert response.status_code == 200 data = response.json() # Should return 0 tables assert data["table_count"] == 0 class TestTableExtractionConfig: """Test table extraction configuration options""" def test_preserve_formatting_option(self, server_url, wait_for_server): """Test preserve_formatting option""" response = requests.post(f"{server_url}/tables/extract", json={ "html": SAMPLE_HTML_WITH_TABLES, "config": { "strategy": "financial", "preserve_formatting": True } }) assert response.status_code == 200 def test_extract_metadata_option(self, server_url, wait_for_server): """Test extract_metadata option""" response = requests.post(f"{server_url}/tables/extract", json={ "html": SAMPLE_HTML_WITH_TABLES, "config": { "strategy": "financial", "extract_metadata": True } }) assert response.status_code == 200 data = response.json() # Check if tables have metadata when requested if data["tables"]: table = data["tables"][0] assert isinstance(table, dict) class TestErrorHandling: """Test error handling for table extraction""" def test_malformed_html(self, server_url, wait_for_server): """Test handling of malformed HTML""" response = requests.post(f"{server_url}/tables/extract", json={ "html": "

incomplete", "config": {"strategy": "default"} }) # Should handle gracefully (either return empty or partial results) assert response.status_code in [200, 400, 500] def test_empty_html(self, server_url, wait_for_server): """Test handling of empty HTML""" response = requests.post(f"{server_url}/tables/extract", json={ "html": "", "config": {"strategy": "default"} }) # May be rejected as invalid or processed as empty assert response.status_code in [200, 400] if response.status_code == 200: data = response.json() assert data["table_count"] == 0 def test_html_without_tables(self, server_url, wait_for_server): """Test HTML with no tables""" response = requests.post(f"{server_url}/tables/extract", json={ "html": "

No tables here

", "config": {"strategy": "default"} }) assert response.status_code == 200 data = response.json() assert data["table_count"] == 0 def test_invalid_strategy(self, server_url, wait_for_server): """Test invalid strategy name""" response = requests.post(f"{server_url}/tables/extract", json={ "html": SAMPLE_HTML_WITH_TABLES, "config": {"strategy": "invalid_strategy"} }) # Should return validation error (400 or 422 from Pydantic) assert response.status_code in [400, 422] def test_missing_config(self, server_url, wait_for_server): """Test missing configuration""" response = requests.post(f"{server_url}/tables/extract", json={ "html": SAMPLE_HTML_WITH_TABLES # Missing config }) # Should use default config or return error assert response.status_code in [200, 400] # Run tests if __name__ == "__main__": pytest.main([__file__, "-v"])