feat: Add table extraction strategies and API documentation
- Implemented table extraction strategies in utils.py: default, LLM, financial, and none.
- Created new API documentation for table extraction endpoints and strategies.
- Added integration tests for table extraction covering the various strategies and error handling.
- Developed a quick test script for rapid validation of table extraction features.
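A minimal client-side sketch of the new single-source endpoint is shown below. It is not part of this commit: the base URL/port and the exact JSON shape of `config` are assumptions; only the field names `html`, `url`, and `config`, the strategy names, and the response keys come from the router code and commit message.

```python
# Hypothetical usage sketch for POST /tables/extract (not part of this commit).
# Assumptions: the server listens on localhost:11235 and TableExtractionConfig
# accepts a plain {"strategy": ...} object; adjust both to your deployment.
import requests

BASE_URL = "http://localhost:11235"

payload = {
    "url": "https://example.com/page-with-tables",
    "config": {"strategy": "default"},  # other strategies: "llm", "financial", "none"
}

resp = requests.post(f"{BASE_URL}/tables/extract", json=payload, timeout=120)
resp.raise_for_status()
data = resp.json()

# Response fields match the router: success, table_count, tables, strategy
print(f"Extracted {data['table_count']} tables via the {data['strategy']} strategy")
for table in data["tables"]:
    print(table)
```

Sending `html` instead of `url` extracts from raw markup; sending both, or neither, returns a 400.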
deploy/docker/routers/tables.py (new file, 301 lines)
@@ -0,0 +1,301 @@
"""
Table Extraction Router for Crawl4AI Docker Server

This module provides dedicated endpoints for table extraction from HTML or URLs,
separate from the main crawling functionality.
"""

import logging
from typing import List, Dict, Any
from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse

# Import crawler pool for browser reuse
from crawler_pool import get_crawler

# Import schemas
from schemas import (
    TableExtractionRequest,
    TableExtractionBatchRequest,
    TableExtractionConfig,
)

# Import utilities
from utils import (
    extract_tables_from_html,
    format_table_response,
    create_table_extraction_strategy,
)

# Configure logger
logger = logging.getLogger(__name__)

# Create router
router = APIRouter(prefix="/tables", tags=["Table Extraction"])


@router.post(
    "/extract",
    summary="Extract Tables from HTML or URL",
    description="""
Extract tables from HTML content or by fetching a URL.
Supports multiple extraction strategies: default, LLM-based, or financial.

**Input Options:**
- Provide `html` for direct HTML content extraction
- Provide `url` to fetch and extract from a live page
- Cannot provide both `html` and `url` simultaneously

**Strategies:**
- `default`: Fast regex and HTML structure-based extraction
- `llm`: AI-powered extraction with semantic understanding (requires LLM config)
- `financial`: Specialized extraction for financial tables with numerical formatting

**Returns:**
- List of extracted tables with headers, rows, and metadata
- Each table includes cell-level details and formatting information
""",
    response_description="Extracted tables with metadata",
)
async def extract_tables(request: TableExtractionRequest) -> JSONResponse:
    """
    Extract tables from HTML content or URL.

    Args:
        request: TableExtractionRequest with html/url and extraction config

    Returns:
        JSONResponse with extracted tables and metadata

    Raises:
        HTTPException: If validation fails or extraction errors occur
    """
    try:
        # Validate input
        if request.html and request.url:
            raise HTTPException(
                status_code=400,
                detail="Cannot provide both 'html' and 'url'. Choose one input method."
            )

        if not request.html and not request.url:
            raise HTTPException(
                status_code=400,
                detail="Must provide either 'html' or 'url' for table extraction."
            )

        # Handle URL-based extraction
        if request.url:
            # Import crawler configs
            from async_configs import BrowserConfig, CrawlerRunConfig

            try:
                # Create minimal browser config
                browser_config = BrowserConfig(
                    headless=True,
                    verbose=False,
                )

                # Create crawler config with table extraction
                table_strategy = create_table_extraction_strategy(request.config)
                crawler_config = CrawlerRunConfig(
                    table_extraction_strategy=table_strategy,
                )

                # Get crawler from pool (browser reuse for memory efficiency)
                crawler = await get_crawler(browser_config, adapter=None)

                # Crawl the URL
                result = await crawler.arun(
                    url=request.url,
                    config=crawler_config,
                )

                if not result.success:
                    raise HTTPException(
                        status_code=500,
                        detail=f"Failed to fetch URL: {result.error_message}"
                    )

                # Extract HTML
                html_content = result.html

            except HTTPException:
                # Re-raise as-is so the fetch-failure detail above is not rewrapped
                raise
            except Exception as e:
                logger.error(f"Error fetching URL {request.url}: {e}")
                raise HTTPException(
                    status_code=500,
                    detail=f"Failed to fetch and extract from URL: {str(e)}"
                )

        else:
            # Use provided HTML
            html_content = request.html

        # Extract tables from HTML
        tables = await extract_tables_from_html(html_content, request.config)

        # Format response
        formatted_tables = format_table_response(tables)

        return JSONResponse({
            "success": True,
            "table_count": len(formatted_tables),
            "tables": formatted_tables,
            "strategy": request.config.strategy.value,
        })

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error extracting tables: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Table extraction failed: {str(e)}"
        )

@router.post(
    "/extract/batch",
    summary="Extract Tables from Multiple Sources (Batch)",
    description="""
Extract tables from multiple HTML contents or URLs in a single request.
Processes each input independently and returns results for all.

**Batch Processing:**
- Provide a list of HTML contents and/or URLs
- Each input is processed with the same extraction strategy
- Partial failures are allowed (returns results for successful extractions)

**Use Cases:**
- Extracting tables from multiple pages simultaneously
- Bulk financial data extraction
- Comparing table structures across multiple sources
""",
    response_description="Batch extraction results with per-item success status",
)
async def extract_tables_batch(request: TableExtractionBatchRequest) -> JSONResponse:
    """
    Extract tables from multiple HTML contents or URLs in batch.

    Args:
        request: TableExtractionBatchRequest with list of html/url and config

    Returns:
        JSONResponse with batch results

    Raises:
        HTTPException: If validation fails
    """
    try:
        # Validate batch request
        total_items = len(request.html_list or []) + len(request.url_list or [])

        if total_items == 0:
            raise HTTPException(
                status_code=400,
                detail="Must provide at least one HTML content or URL in batch request."
            )

        if total_items > 50:  # Reasonable batch limit
            raise HTTPException(
                status_code=400,
                detail=f"Batch size ({total_items}) exceeds maximum allowed (50)."
            )

        results = []

        # Process HTML list
        if request.html_list:
            for idx, html_content in enumerate(request.html_list):
                try:
                    tables = await extract_tables_from_html(html_content, request.config)
                    formatted_tables = format_table_response(tables)

                    results.append({
                        "success": True,
                        "source": f"html_{idx}",
                        "table_count": len(formatted_tables),
                        "tables": formatted_tables,
                    })
                except Exception as e:
                    logger.error(f"Error extracting tables from html_{idx}: {e}")
                    results.append({
                        "success": False,
                        "source": f"html_{idx}",
                        "error": str(e),
                    })

        # Process URL list
        if request.url_list:
            from async_configs import BrowserConfig, CrawlerRunConfig

            browser_config = BrowserConfig(
                headless=True,
                verbose=False,
            )
            table_strategy = create_table_extraction_strategy(request.config)
            crawler_config = CrawlerRunConfig(
                table_extraction_strategy=table_strategy,
            )

            # Get crawler from pool (reuse browser for all URLs in batch)
            crawler = await get_crawler(browser_config, adapter=None)

            for url in request.url_list:
                try:
                    result = await crawler.arun(
                        url=url,
                        config=crawler_config,
                    )

                    if result.success:
                        html_content = result.html
                        tables = await extract_tables_from_html(html_content, request.config)
                        formatted_tables = format_table_response(tables)

                        results.append({
                            "success": True,
                            "source": url,
                            "table_count": len(formatted_tables),
                            "tables": formatted_tables,
                        })
                    else:
                        results.append({
                            "success": False,
                            "source": url,
                            "error": result.error_message,
                        })

                except Exception as e:
                    logger.error(f"Error extracting tables from {url}: {e}")
                    results.append({
                        "success": False,
                        "source": url,
                        "error": str(e),
                    })

        # Calculate summary
        successful = sum(1 for r in results if r["success"])
        failed = len(results) - successful
        total_tables = sum(r.get("table_count", 0) for r in results if r["success"])

        return JSONResponse({
            "success": True,
            "summary": {
                "total_processed": len(results),
                "successful": successful,
                "failed": failed,
                "total_tables_extracted": total_tables,
            },
            "results": results,
            "strategy": request.config.strategy.value,
        })

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in batch table extraction: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Batch table extraction failed: {str(e)}"
        )
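The batch endpoint follows the same pattern. Below is a hedged client sketch, again with an assumed base URL/port and an assumed `{"strategy": ...}` config shape; the `url_list`/`html_list` field names mirror the attributes the router reads from `TableExtractionBatchRequest`.

```python
# Hypothetical usage sketch for POST /tables/extract/batch (not part of this commit).
import requests

BASE_URL = "http://localhost:11235"  # assumed deployment address

payload = {
    "url_list": [
        "https://example.com/report-2023",
        "https://example.com/report-2024",
    ],
    "config": {"strategy": "financial"},  # assumed config shape
}

resp = requests.post(f"{BASE_URL}/tables/extract/batch", json=payload, timeout=300)
resp.raise_for_status()
data = resp.json()

# Summary and per-item results match the router's response shape
summary = data["summary"]
print(f"{summary['successful']}/{summary['total_processed']} sources succeeded; "
      f"{summary['total_tables_extracted']} tables extracted in total")
for item in data["results"]:
    status = "ok" if item["success"] else f"error: {item['error']}"
    print(f"{item['source']}: {status}")
```

Per the router, batches larger than 50 items are rejected with a 400, and per-URL failures are reported inside `results` rather than failing the whole batch.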