Files
crawl4ai/deploy/docker/routers/tables.py
AHMET YILMAZ 00e9904609 feat: Add table extraction strategies and API documentation
- Implemented table extraction strategies: default, LLM, financial, and none in utils.py.
- Created new API documentation for table extraction endpoints and strategies.
- Added integration tests for table extraction functionality covering various strategies and error handling.
- Developed quick test script for rapid validation of table extraction features.
2025-10-17 12:30:37 +08:00

302 lines
9.9 KiB
Python

"""
Table Extraction Router for Crawl4AI Docker Server
This module provides dedicated endpoints for table extraction from HTML or URLs,
separate from the main crawling functionality.
"""
import logging
from typing import List, Dict, Any
from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse
# Import crawler pool for browser reuse
from crawler_pool import get_crawler
# Import schemas
from schemas import (
TableExtractionRequest,
TableExtractionBatchRequest,
TableExtractionConfig,
)
# Import utilities
from utils import (
extract_tables_from_html,
format_table_response,
create_table_extraction_strategy,
)
# Module-level logger, named after this module so records can be filtered per router
logger = logging.getLogger(__name__)
# Router holding every endpoint defined below; all paths are served under the
# "/tables" prefix and tagged "Table Extraction"
router = APIRouter(prefix="/tables", tags=["Table Extraction"])
@router.post(
    "/extract",
    summary="Extract Tables from HTML or URL",
    description="""
Extract tables from HTML content or by fetching a URL.
Supports multiple extraction strategies: default, LLM-based, or financial.
**Input Options:**
- Provide `html` for direct HTML content extraction
- Provide `url` to fetch and extract from a live page
- Cannot provide both `html` and `url` simultaneously
**Strategies:**
- `default`: Fast regex and HTML structure-based extraction
- `llm`: AI-powered extraction with semantic understanding (requires LLM config)
- `financial`: Specialized extraction for financial tables with numerical formatting
**Returns:**
- List of extracted tables with headers, rows, and metadata
- Each table includes cell-level details and formatting information
""",
    response_description="Extracted tables with metadata",
)
async def extract_tables(request: TableExtractionRequest) -> JSONResponse:
    """
    Extract tables from inline HTML or from a page fetched by URL.

    Exactly one of ``request.html`` / ``request.url`` must be set. When a URL
    is given, the page is fetched through a pooled crawler and its HTML is
    passed to ``extract_tables_from_html``; otherwise the supplied HTML is
    used directly.

    Args:
        request: TableExtractionRequest with html/url and extraction config

    Returns:
        JSONResponse with ``success``, ``table_count``, ``tables`` and the
        name of the strategy that was requested.

    Raises:
        HTTPException: 400 when both or neither of html/url are provided;
            500 when the URL cannot be fetched or extraction fails.
    """
    try:
        # Validate input: exactly one source must be supplied.
        if request.html and request.url:
            raise HTTPException(
                status_code=400,
                detail="Cannot provide both 'html' and 'url'. Choose one input method."
            )
        if not request.html and not request.url:
            raise HTTPException(
                status_code=400,
                detail="Must provide either 'html' or 'url' for table extraction."
            )

        if request.url:
            # Imported lazily, as in the original implementation.
            from async_configs import BrowserConfig, CrawlerRunConfig

            # Only the crawl itself is guarded here. The success check lives
            # after this block so its HTTPException is not caught and
            # re-wrapped by the generic handler below.
            try:
                browser_config = BrowserConfig(
                    headless=True,
                    verbose=False,
                )
                table_strategy = create_table_extraction_strategy(request.config)
                crawler_config = CrawlerRunConfig(
                    table_extraction_strategy=table_strategy,
                )
                # Get crawler from pool (browser reuse for memory efficiency)
                crawler = await get_crawler(browser_config, adapter=None)
                result = await crawler.arun(
                    url=request.url,
                    config=crawler_config,
                )
            except Exception as e:
                logger.error("Error fetching URL %s: %s", request.url, e)
                raise HTTPException(
                    status_code=500,
                    detail=f"Failed to fetch and extract from URL: {str(e)}"
                ) from e

            if not result.success:
                raise HTTPException(
                    status_code=500,
                    detail=f"Failed to fetch URL: {result.error_message}"
                )
            # NOTE(review): the crawl is already configured with the table
            # strategy, yet the HTML is parsed again below — confirm whether
            # the crawl result already carries the extracted tables.
            html_content = result.html
        else:
            # Use provided HTML
            html_content = request.html

        # Extract tables from HTML and shape them for the response.
        tables = await extract_tables_from_html(html_content, request.config)
        formatted_tables = format_table_response(tables)
        return JSONResponse({
            "success": True,
            "table_count": len(formatted_tables),
            "tables": formatted_tables,
            "strategy": request.config.strategy.value,
        })
    except HTTPException:
        # Already carries the intended status code and detail.
        raise
    except Exception as e:
        logger.error("Error extracting tables: %s", e, exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Table extraction failed: {str(e)}"
        ) from e
# Upper bound on html_list + url_list items accepted by one batch request.
_MAX_BATCH_SIZE = 50


def _batch_success(source: str, formatted_tables: Any) -> Dict[str, Any]:
    """Build the per-item result entry for a successful extraction."""
    return {
        "success": True,
        "source": source,
        "table_count": len(formatted_tables),
        "tables": formatted_tables,
    }


def _batch_failure(source: str, error: Any) -> Dict[str, Any]:
    """Build the per-item result entry for a failed extraction."""
    return {
        "success": False,
        "source": source,
        "error": error,
    }


@router.post(
    "/extract/batch",
    summary="Extract Tables from Multiple Sources (Batch)",
    description="""
Extract tables from multiple HTML contents or URLs in a single request.
Processes each input independently and returns results for all.
**Batch Processing:**
- Provide list of HTML contents and/or URLs
- Each input is processed with the same extraction strategy
- Partial failures are allowed (returns results for successful extractions)
**Use Cases:**
- Extracting tables from multiple pages simultaneously
- Bulk financial data extraction
- Comparing table structures across multiple sources
""",
    response_description="Batch extraction results with per-item success status",
)
async def extract_tables_batch(request: TableExtractionBatchRequest) -> JSONResponse:
    """
    Extract tables from multiple HTML contents or URLs in batch.

    HTML items are processed first (reported as ``html_<index>``), then URLs
    (reported by their URL). Items are processed sequentially and a failure
    of one item is recorded in its result entry without aborting the batch.
    If the shared crawler for the URL list cannot be set up, every URL is
    reported as failed and results already computed for HTML items are kept.

    Args:
        request: TableExtractionBatchRequest with list of html/url and config

    Returns:
        JSONResponse with a ``summary`` (processed / successful / failed /
        total tables), the per-item ``results`` and the requested strategy.

    Raises:
        HTTPException: 400 when the batch is empty or larger than
            ``_MAX_BATCH_SIZE``; 500 on unexpected errors outside per-item
            processing.
    """
    try:
        # Validate batch request
        total_items = len(request.html_list or []) + len(request.url_list or [])
        if total_items == 0:
            raise HTTPException(
                status_code=400,
                detail="Must provide at least one HTML content or URL in batch request."
            )
        if total_items > _MAX_BATCH_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"Batch size ({total_items}) exceeds maximum allowed ({_MAX_BATCH_SIZE})."
            )

        results: List[Dict[str, Any]] = []

        # Process HTML list
        for idx, html_content in enumerate(request.html_list or []):
            source = f"html_{idx}"
            try:
                tables = await extract_tables_from_html(html_content, request.config)
                results.append(_batch_success(source, format_table_response(tables)))
            except Exception as e:
                logger.error("Error extracting tables from %s: %s", source, e)
                results.append(_batch_failure(source, str(e)))

        # Process URL list
        if request.url_list:
            from async_configs import BrowserConfig, CrawlerRunConfig

            crawler = None
            try:
                browser_config = BrowserConfig(
                    headless=True,
                    verbose=False,
                )
                table_strategy = create_table_extraction_strategy(request.config)
                crawler_config = CrawlerRunConfig(
                    table_extraction_strategy=table_strategy,
                )
                # Get crawler from pool (reuse browser for all URLs in batch)
                crawler = await get_crawler(browser_config, adapter=None)
            except Exception as e:
                # A setup failure must not discard the HTML results gathered
                # above: report it per URL, in line with the documented
                # partial-failure behaviour.
                logger.error(
                    "Error preparing crawler for batch URL extraction: %s",
                    e,
                    exc_info=True,
                )
                results.extend(
                    _batch_failure(url, str(e)) for url in request.url_list
                )

            if crawler is not None:
                for url in request.url_list:
                    try:
                        result = await crawler.arun(
                            url=url,
                            config=crawler_config,
                        )
                        if not result.success:
                            results.append(_batch_failure(url, result.error_message))
                            continue
                        tables = await extract_tables_from_html(result.html, request.config)
                        results.append(_batch_success(url, format_table_response(tables)))
                    except Exception as e:
                        logger.error("Error extracting tables from %s: %s", url, e)
                        results.append(_batch_failure(url, str(e)))

        # Calculate summary
        succeeded = [r for r in results if r["success"]]
        return JSONResponse({
            "success": True,
            "summary": {
                "total_processed": len(results),
                "successful": len(succeeded),
                "failed": len(results) - len(succeeded),
                "total_tables_extracted": sum(r["table_count"] for r in succeeded),
            },
            "results": results,
            "strategy": request.config.strategy.value,
        })
    except HTTPException:
        # Validation errors already carry the intended status and detail.
        raise
    except Exception as e:
        logger.error("Error in batch table extraction: %s", e, exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Batch table extraction failed: {str(e)}"
        ) from e