crawl4ai/deploy/docker/routers/tables.py

"""
Table Extraction Router for Crawl4AI Docker Server

This module provides dedicated endpoints for table extraction from HTML or URLs,
separate from the main crawling functionality.
"""

import logging
from typing import List, Dict, Any
from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse

# Import crawler pool for browser reuse
from crawler_pool import get_crawler

# Import schemas
from schemas import (
    TableExtractionRequest,
    TableExtractionBatchRequest,
    TableExtractionConfig,
)

# Import utilities
from utils import (
    extract_tables_from_html,
    format_table_response,
    create_table_extraction_strategy,
)

# Configure logger
logger = logging.getLogger(__name__)

# Create router
router = APIRouter(prefix="/tables", tags=["Table Extraction"])


@router.post(
    "/extract",
    summary="Extract Tables from HTML or URL",
    description="""
Extract tables from HTML content or by fetching a URL.
Supports multiple extraction strategies: default, LLM-based, or financial.

**Input Options:**
- Provide `html` for direct HTML content extraction
- Provide `url` to fetch and extract from a live page
- Cannot provide both `html` and `url` simultaneously

**Strategies:**
- `default`: Fast regex and HTML structure-based extraction
- `llm`: AI-powered extraction with semantic understanding (requires LLM config)
- `financial`: Specialized extraction for financial tables with numerical formatting

**Returns:**
- List of extracted tables with headers, rows, and metadata
- Each table includes cell-level details and formatting information
""",
    response_description="Extracted tables with metadata",
)
async def extract_tables(request: TableExtractionRequest) -> JSONResponse:
    """
    Extract tables from HTML content or URL.

    Args:
        request: TableExtractionRequest with html/url and extraction config

    Returns:
        JSONResponse with extracted tables and metadata

    Raises:
        HTTPException: If validation fails or extraction errors occur
    """
    try:
        # Validate input
        if request.html and request.url:
            raise HTTPException(
                status_code=400,
                detail="Cannot provide both 'html' and 'url'. Choose one input method."
            )

        if not request.html and not request.url:
            raise HTTPException(
                status_code=400,
                detail="Must provide either 'html' or 'url' for table extraction."
            )

        # Handle URL-based extraction
        if request.url:
            # Import crawler configs
            from async_configs import BrowserConfig, CrawlerRunConfig

            try:
                # Create minimal browser config
                browser_config = BrowserConfig(
                    headless=True,
                    verbose=False,
                )

                # Create crawler config with table extraction
                table_strategy = create_table_extraction_strategy(request.config)
                crawler_config = CrawlerRunConfig(
                    table_extraction_strategy=table_strategy,
                )

                # Get crawler from pool (browser reuse for memory efficiency)
                crawler = await get_crawler(browser_config, adapter=None)

                # Crawl the URL
                result = await crawler.arun(
                    url=request.url,
                    config=crawler_config,
                )

                if not result.success:
                    raise HTTPException(
                        status_code=500,
                        detail=f"Failed to fetch URL: {result.error_message}"
                    )

                # Extract HTML
                html_content = result.html

            except Exception as e:
                logger.error(f"Error fetching URL {request.url}: {e}")
                raise HTTPException(
                    status_code=500,
                    detail=f"Failed to fetch and extract from URL: {str(e)}"
                )

        else:
            # Use provided HTML
            html_content = request.html

        # Extract tables from HTML
        tables = await extract_tables_from_html(html_content, request.config)

        # Format response
        formatted_tables = format_table_response(tables)

        return JSONResponse({
            "success": True,
            "table_count": len(formatted_tables),
            "tables": formatted_tables,
            "strategy": request.config.strategy.value,
        })

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error extracting tables: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Table extraction failed: {str(e)}"
        )


@router.post(
    "/extract/batch",
    summary="Extract Tables from Multiple Sources (Batch)",
    description="""
Extract tables from multiple HTML contents or URLs in a single request.
Processes each input independently and returns results for all.

**Batch Processing:**
- Provide list of HTML contents and/or URLs
- Each input is processed with the same extraction strategy
- Partial failures are allowed (returns results for successful extractions)

**Use Cases:**
- Extracting tables from multiple pages simultaneously
- Bulk financial data extraction
- Comparing table structures across multiple sources
""",
    response_description="Batch extraction results with per-item success status",
)
async def extract_tables_batch(request: TableExtractionBatchRequest) -> JSONResponse:
    """
    Extract tables from multiple HTML contents or URLs in batch.

    Args:
        request: TableExtractionBatchRequest with list of html/url and config

    Returns:
        JSONResponse with batch results

    Raises:
        HTTPException: If validation fails
    """
    try:
        # Validate batch request
        total_items = len(request.html_list or []) + len(request.url_list or [])

        if total_items == 0:
            raise HTTPException(
                status_code=400,
                detail="Must provide at least one HTML content or URL in batch request."
            )

        if total_items > 50:  # Reasonable batch limit
            raise HTTPException(
                status_code=400,
                detail=f"Batch size ({total_items}) exceeds maximum allowed (50)."
            )

        results = []

        # Process HTML list
        if request.html_list:
            for idx, html_content in enumerate(request.html_list):
                try:
                    tables = await extract_tables_from_html(html_content, request.config)
                    formatted_tables = format_table_response(tables)

                    results.append({
                        "success": True,
                        "source": f"html_{idx}",
                        "table_count": len(formatted_tables),
                        "tables": formatted_tables,
                    })
                except Exception as e:
                    logger.error(f"Error extracting tables from html_{idx}: {e}")
                    results.append({
                        "success": False,
                        "source": f"html_{idx}",
                        "error": str(e),
                    })

        # Process URL list
        if request.url_list:
            from async_configs import BrowserConfig, CrawlerRunConfig

            browser_config = BrowserConfig(
                headless=True,
                verbose=False,
            )
            table_strategy = create_table_extraction_strategy(request.config)
            crawler_config = CrawlerRunConfig(
                table_extraction_strategy=table_strategy,
            )

            # Get crawler from pool (reuse browser for all URLs in batch)
            crawler = await get_crawler(browser_config, adapter=None)

            for url in request.url_list:
                try:
                    result = await crawler.arun(
                        url=url,
                        config=crawler_config,
                    )

                    if result.success:
                        html_content = result.html
                        tables = await extract_tables_from_html(html_content, request.config)
                        formatted_tables = format_table_response(tables)

                        results.append({
                            "success": True,
                            "source": url,
                            "table_count": len(formatted_tables),
                            "tables": formatted_tables,
                        })
                    else:
                        results.append({
                            "success": False,
                            "source": url,
                            "error": result.error_message,
                        })

                except Exception as e:
                    logger.error(f"Error extracting tables from {url}: {e}")
                    results.append({
                            "success": False,
                            "source": url,
                            "error": str(e),
                        })

        # Calculate summary
        successful = sum(1 for r in results if r["success"])
        failed = len(results) - successful
        total_tables = sum(r.get("table_count", 0) for r in results if r["success"])

        return JSONResponse({
            "success": True,
            "summary": {
                "total_processed": len(results),
                "successful": successful,
                "failed": failed,
                "total_tables_extracted": total_tables,
            },
            "results": results,
            "strategy": request.config.strategy.value,
        })

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in batch table extraction: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Batch table extraction failed: {str(e)}"
        )