feat: Add table extraction strategies and API documentation

- Implemented table extraction strategies (default, LLM, financial, and none) in utils.py.
- Created new API documentation for table extraction endpoints and strategies.
- Added integration tests for table extraction covering the various strategies and error handling.
- Added a quick-test script for rapid validation of table extraction features.
AHMET YILMAZ
2025-10-17 12:30:37 +08:00
parent 3877335d89
commit 00e9904609
8 changed files with 1979 additions and 3 deletions
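
For orientation, here is what a call to the new dedicated endpoint looks like from a client. This is an illustrative sketch only: the base URL (http://localhost:11235) and the use of the requests library are assumptions, not part of this commit; the payload and response fields match the router and schemas added below.

# Illustrative client sketch for the new POST /tables/extract endpoint.
# The base URL is an assumption; adjust to wherever the Docker server runs.
import requests

payload = {
    "html": (
        "<table><tr><th>Year</th><th>Revenue</th></tr>"
        "<tr><td>2023</td><td>900</td></tr>"
        "<tr><td>2024</td><td>1,000</td></tr></table>"
    ),
    "config": {"strategy": "default", "min_rows": 1, "min_cols": 2},
}
resp = requests.post("http://localhost:11235/tables/extract", json=payload)
resp.raise_for_status()
body = resp.json()
print(body["table_count"], "tables via", body["strategy"])
for table in body["tables"]:
    print(table["headers"], "-", table["row_count"], "rows")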

View File

@@ -731,6 +731,7 @@ async def handle_crawl_request(
    proxies: Optional[List[Dict[str, Any]]] = None,
    proxy_failure_threshold: int = 3,
    proxy_recovery_time: int = 300,
    table_extraction: Optional[dict] = None,
    dispatcher = None,
) -> dict:
    """Handle non-streaming crawl requests with optional hooks."""
@@ -768,6 +769,19 @@ async def handle_crawl_request(
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    # Configure table extraction strategy if specified
    if table_extraction:
        try:
            from schemas import TableExtractionConfig
            from utils import create_table_extraction_strategy

            table_config = TableExtractionConfig(**table_extraction)
            table_strategy = create_table_extraction_strategy(table_config)
            crawler_config.table_extraction_strategy = table_strategy
        except Exception as e:
            logger.error(f"Error creating table extraction strategy: {e}")
            raise HTTPException(status_code=400, detail=f"Invalid table extraction config: {str(e)}")

    # Configure browser adapter based on anti_bot_strategy
    browser_adapter = _get_browser_adapter(anti_bot_strategy, browser_config)
@@ -974,6 +988,7 @@ async def handle_stream_crawl_request(
    proxies: Optional[List[Dict[str, Any]]] = None,
    proxy_failure_threshold: int = 3,
    proxy_recovery_time: int = 300,
    table_extraction: Optional[dict] = None,
    dispatcher = None,
) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
    """Handle streaming crawl requests with optional hooks."""
@@ -1003,6 +1018,19 @@ async def handle_stream_crawl_request(
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    # Configure table extraction strategy if specified
    if table_extraction:
        try:
            from schemas import TableExtractionConfig
            from utils import create_table_extraction_strategy

            table_config = TableExtractionConfig(**table_extraction)
            table_strategy = create_table_extraction_strategy(table_config)
            crawler_config.table_extraction_strategy = table_strategy
        except Exception as e:
            logger.error(f"Error creating table extraction strategy: {e}")
            raise HTTPException(status_code=400, detail=f"Invalid table extraction config: {str(e)}")

    # Configure browser adapter based on anti_bot_strategy
    browser_adapter = _get_browser_adapter(anti_bot_strategy, browser_config)
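
Both handlers share the same validation path: the raw dict is validated through TableExtractionConfig before a strategy is built, so malformed input surfaces as an HTTP 400 rather than a crawler failure. A minimal sketch of that flow, assuming this commit's schemas.py and utils.py are on the import path and crawl4ai is installed:

# Sketch of the validation flow used by both handlers.
from schemas import TableExtractionConfig
from utils import create_table_extraction_strategy

cfg = TableExtractionConfig(**{"strategy": "financial", "min_rows": 2})
strategy = create_table_extraction_strategy(cfg)
print(type(strategy).__name__)  # DefaultTableExtraction, tuned for financial data

try:
    TableExtractionConfig(**{"strategy": "sideways"})  # not a valid enum value
except Exception as exc:
    print("rejected:", exc)  # the handlers convert this into a 400 response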

View File

@@ -0,0 +1,301 @@
"""
Table Extraction Router for Crawl4AI Docker Server
This module provides dedicated endpoints for table extraction from HTML or URLs,
separate from the main crawling functionality.
"""
import logging
from typing import List, Dict, Any
from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse
# Import crawler pool for browser reuse
from crawler_pool import get_crawler
# Import schemas
from schemas import (
TableExtractionRequest,
TableExtractionBatchRequest,
TableExtractionConfig,
)
# Import utilities
from utils import (
extract_tables_from_html,
format_table_response,
create_table_extraction_strategy,
)
# Configure logger
logger = logging.getLogger(__name__)
# Create router
router = APIRouter(prefix="/tables", tags=["Table Extraction"])
@router.post(
    "/extract",
    summary="Extract Tables from HTML or URL",
    description="""
Extract tables from HTML content or by fetching a URL.
Supports multiple extraction strategies: default, LLM-based, or financial.

**Input Options:**
- Provide `html` for direct HTML content extraction
- Provide `url` to fetch and extract from a live page
- Cannot provide both `html` and `url` simultaneously

**Strategies:**
- `default`: fast extraction based on HTML structure and table scoring
- `llm`: AI-powered extraction with semantic understanding (requires LLM config)
- `financial`: specialized extraction for financial tables with numerical formatting

**Returns:**
- List of extracted tables with headers, rows, and metadata
- Each table includes cell-level details and formatting information
""",
    response_description="Extracted tables with metadata",
)
async def extract_tables(request: TableExtractionRequest) -> JSONResponse:
    """
    Extract tables from HTML content or URL.

    Args:
        request: TableExtractionRequest with html/url and extraction config

    Returns:
        JSONResponse with extracted tables and metadata

    Raises:
        HTTPException: If validation fails or extraction errors occur
    """
    try:
        # Validate input
        if request.html and request.url:
            raise HTTPException(
                status_code=400,
                detail="Cannot provide both 'html' and 'url'. Choose one input method."
            )
        if not request.html and not request.url:
            raise HTTPException(
                status_code=400,
                detail="Must provide either 'html' or 'url' for table extraction."
            )
        # Handle URL-based extraction
        if request.url:
            # Import crawler configs
            from async_configs import BrowserConfig, CrawlerRunConfig

            try:
                # Create minimal browser config
                browser_config = BrowserConfig(
                    headless=True,
                    verbose=False,
                )
                # Create crawler config with table extraction
                table_strategy = create_table_extraction_strategy(request.config)
                crawler_config = CrawlerRunConfig(
                    table_extraction_strategy=table_strategy,
                )
                # Get crawler from pool (browser reuse for memory efficiency)
                crawler = await get_crawler(browser_config, adapter=None)
                # Crawl the URL
                result = await crawler.arun(
                    url=request.url,
                    config=crawler_config,
                )
                if not result.success:
                    raise HTTPException(
                        status_code=500,
                        detail=f"Failed to fetch URL: {result.error_message}"
                    )
                # Extract HTML
                html_content = result.html
            except HTTPException:
                # Re-raise as-is so the fetch-failure detail is not re-wrapped below
                raise
            except Exception as e:
                logger.error(f"Error fetching URL {request.url}: {e}")
                raise HTTPException(
                    status_code=500,
                    detail=f"Failed to fetch and extract from URL: {str(e)}"
                )
        else:
            # Use provided HTML
            html_content = request.html

        # Extract tables from HTML
        tables = await extract_tables_from_html(html_content, request.config)
        # Format response (extract_tables_from_html already formats; this pass is idempotent)
        formatted_tables = format_table_response(tables)

        return JSONResponse({
            "success": True,
            "table_count": len(formatted_tables),
            "tables": formatted_tables,
            "strategy": request.config.strategy.value,
        })
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error extracting tables: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Table extraction failed: {str(e)}"
        )

@router.post(
    "/extract/batch",
    summary="Extract Tables from Multiple Sources (Batch)",
    description="""
Extract tables from multiple HTML contents or URLs in a single request.
Processes each input independently and returns results for all.

**Batch Processing:**
- Provide a list of HTML contents and/or URLs
- Each input is processed with the same extraction strategy
- Partial failures are allowed (results are returned for successful extractions)

**Use Cases:**
- Extracting tables from multiple pages simultaneously
- Bulk financial data extraction
- Comparing table structures across multiple sources
""",
    response_description="Batch extraction results with per-item success status",
)
async def extract_tables_batch(request: TableExtractionBatchRequest) -> JSONResponse:
    """
    Extract tables from multiple HTML contents or URLs in batch.

    Args:
        request: TableExtractionBatchRequest with list of html/url and config

    Returns:
        JSONResponse with batch results

    Raises:
        HTTPException: If validation fails
    """
    try:
        # Validate batch request
        total_items = len(request.html_list or []) + len(request.url_list or [])
        if total_items == 0:
            raise HTTPException(
                status_code=400,
                detail="Must provide at least one HTML content or URL in batch request."
            )
        if total_items > 50:  # Reasonable batch limit
            raise HTTPException(
                status_code=400,
                detail=f"Batch size ({total_items}) exceeds maximum allowed (50)."
            )

        results = []
        # Process HTML list
        if request.html_list:
            for idx, html_content in enumerate(request.html_list):
                try:
                    tables = await extract_tables_from_html(html_content, request.config)
                    formatted_tables = format_table_response(tables)
                    results.append({
                        "success": True,
                        "source": f"html_{idx}",
                        "table_count": len(formatted_tables),
                        "tables": formatted_tables,
                    })
                except Exception as e:
                    logger.error(f"Error extracting tables from html_{idx}: {e}")
                    results.append({
                        "success": False,
                        "source": f"html_{idx}",
                        "error": str(e),
                    })
        # Process URL list
        if request.url_list:
            from async_configs import BrowserConfig, CrawlerRunConfig

            browser_config = BrowserConfig(
                headless=True,
                verbose=False,
            )
            table_strategy = create_table_extraction_strategy(request.config)
            crawler_config = CrawlerRunConfig(
                table_extraction_strategy=table_strategy,
            )
            # Get crawler from pool (reuse browser for all URLs in batch)
            crawler = await get_crawler(browser_config, adapter=None)

            for url in request.url_list:
                try:
                    result = await crawler.arun(
                        url=url,
                        config=crawler_config,
                    )
                    if result.success:
                        html_content = result.html
                        tables = await extract_tables_from_html(html_content, request.config)
                        formatted_tables = format_table_response(tables)
                        results.append({
                            "success": True,
                            "source": url,
                            "table_count": len(formatted_tables),
                            "tables": formatted_tables,
                        })
                    else:
                        results.append({
                            "success": False,
                            "source": url,
                            "error": result.error_message,
                        })
                except Exception as e:
                    logger.error(f"Error extracting tables from {url}: {e}")
                    results.append({
                        "success": False,
                        "source": url,
                        "error": str(e),
                    })
        # Calculate summary
        successful = sum(1 for r in results if r["success"])
        failed = len(results) - successful
        total_tables = sum(r.get("table_count", 0) for r in results if r["success"])

        return JSONResponse({
            "success": True,
            "summary": {
                "total_processed": len(results),
                "successful": successful,
                "failed": failed,
                "total_tables_extracted": total_tables,
            },
            "results": results,
            "strategy": request.config.strategy.value,
        })
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in batch table extraction: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Batch table extraction failed: {str(e)}"
        )
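
Batch usage mirrors the single-shot endpoint; an illustrative sketch (the base URL is again an assumption, response keys match the handler above):

# Illustrative batch call; failures are reported per item, not for the whole request.
import requests

payload = {
    "url_list": ["https://example.com/a", "https://example.com/b"],
    "config": {"strategy": "default", "table_score_threshold": 7},
}
resp = requests.post("http://localhost:11235/tables/extract/batch", json=payload)
body = resp.json()
print(body["summary"])  # total_processed / successful / failed / total_tables_extracted
for item in body["results"]:
    print(item["source"], "->", "ok" if item["success"] else item["error"])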

View File

@@ -48,6 +48,153 @@ class DispatcherSelection(BaseModel):
# ============================================================================

# ============================================================================
# Table Extraction Schemas
# ============================================================================
class TableExtractionStrategy(str, Enum):
    """Available table extraction strategies."""
    NONE = "none"
    DEFAULT = "default"
    LLM = "llm"
    FINANCIAL = "financial"

class TableExtractionConfig(BaseModel):
    """Configuration for table extraction."""
    strategy: TableExtractionStrategy = Field(
        default=TableExtractionStrategy.DEFAULT,
        description="Table extraction strategy to use"
    )
    # Common configuration for all strategies
    table_score_threshold: int = Field(
        default=7,
        ge=0,
        le=100,
        description="Minimum score for a table to be considered a data table (default strategy)"
    )
    min_rows: int = Field(
        default=0,
        ge=0,
        description="Minimum number of rows for a valid table"
    )
    min_cols: int = Field(
        default=0,
        ge=0,
        description="Minimum number of columns for a valid table"
    )
    # LLM-specific configuration
    llm_provider: Optional[str] = Field(
        None,
        description="LLM provider for LLM strategy (e.g., 'openai/gpt-4')"
    )
    llm_model: Optional[str] = Field(
        None,
        description="Specific LLM model to use"
    )
    llm_api_key: Optional[str] = Field(
        None,
        description="API key for LLM provider (if not in environment)"
    )
    llm_base_url: Optional[str] = Field(
        None,
        description="Custom base URL for LLM API"
    )
    extraction_prompt: Optional[str] = Field(
        None,
        description="Custom prompt for LLM table extraction"
    )
    # Financial-specific configuration
    decimal_separator: str = Field(
        default=".",
        description="Decimal separator for financial tables (e.g., '.' or ',')"
    )
    thousand_separator: str = Field(
        default=",",
        description="Thousand separator for financial tables (e.g., ',' or '.')"
    )
    # General options
    verbose: bool = Field(
        default=False,
        description="Enable verbose logging for table extraction"
    )

    class Config:
        json_schema_extra = {
            "example": {
                "strategy": "default",
                "table_score_threshold": 7,
                "min_rows": 2,
                "min_cols": 2
            }
        }

class TableExtractionRequest(BaseModel):
    """Request for dedicated table extraction endpoint."""
    url: Optional[str] = Field(
        None,
        description="URL to crawl and extract tables from"
    )
    html: Optional[str] = Field(
        None,
        description="Raw HTML content to extract tables from"
    )
    config: TableExtractionConfig = Field(
        default_factory=lambda: TableExtractionConfig(),
        description="Table extraction configuration"
    )
    # Browser config (only used if URL is provided)
    browser_config: Optional[Dict] = Field(
        default_factory=dict,
        description="Browser configuration for URL crawling"
    )

    class Config:
        json_schema_extra = {
            "example": {
                "url": "https://example.com/data-table",
                "config": {
                    "strategy": "default",
                    "min_rows": 2
                }
            }
        }

class TableExtractionBatchRequest(BaseModel):
    """Request for batch table extraction."""
    html_list: Optional[List[str]] = Field(
        None,
        description="List of HTML contents to extract tables from"
    )
    url_list: Optional[List[str]] = Field(
        None,
        description="List of URLs to extract tables from"
    )
    config: TableExtractionConfig = Field(
        default_factory=lambda: TableExtractionConfig(),
        description="Table extraction configuration"
    )
    browser_config: Optional[Dict] = Field(
        default_factory=dict,
        description="Browser configuration"
    )

# ============================================================================
# End Table Extraction Schemas
# ============================================================================
class CrawlRequest(BaseModel):
    urls: List[str] = Field(min_length=1, max_length=100)
    browser_config: Optional[Dict] = Field(default_factory=dict)

@@ -77,6 +224,11 @@ class CrawlRequest(BaseModel):
    proxy_recovery_time: Optional[int] = Field(
        300, ge=60, le=3600, description="Recovery time in seconds for failure_aware strategy"
    )
    # Table extraction configuration
    table_extraction: Optional[TableExtractionConfig] = Field(
        None, description="Optional table extraction configuration to extract tables during crawl"
    )

class HookConfig(BaseModel):

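To show how these schemas compose, a short sketch constructing a CrawlRequest that carries the new table_extraction block (field names are exactly those defined above; it assumes the remaining CrawlRequest fields keep the defaults shown in this diff):

# Sketch: nesting TableExtractionConfig inside CrawlRequest.
from schemas import CrawlRequest, TableExtractionConfig, TableExtractionStrategy

req = CrawlRequest(
    urls=["https://example.com/financials"],
    table_extraction=TableExtractionConfig(
        strategy=TableExtractionStrategy.FINANCIAL,
        min_rows=2,
        min_cols=2,
        decimal_separator=".",
        thousand_separator=",",
    ),
)
# mode="json" serializes the enum to its string value
print(req.table_extraction.model_dump(mode="json")["strategy"])  # "financial"
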
View File

@@ -87,7 +87,7 @@ from prometheus_fastapi_instrumentator import Instrumentator
from pydantic import BaseModel, Field
from rank_bm25 import BM25Okapi
from redis import asyncio as aioredis
-from routers import adaptive, dispatchers, scripts, monitoring
+from routers import adaptive, dispatchers, scripts, monitoring, tables
from schemas import (
    CrawlRequest,
    CrawlRequestWithHooks,
@@ -298,6 +298,7 @@ app.include_router(adaptive.router)
app.include_router(dispatchers.router)
app.include_router(scripts.router)
app.include_router(monitoring.router)
app.include_router(tables.router)
# ──────────────────────── Endpoints ──────────────────────────
@@ -1578,6 +1579,7 @@ async def crawl(
        proxies=crawl_request.proxies,
        proxy_failure_threshold=crawl_request.proxy_failure_threshold,
        proxy_recovery_time=crawl_request.proxy_recovery_time,
        table_extraction=crawl_request.table_extraction.model_dump() if crawl_request.table_extraction else None,
        dispatcher=dispatcher,
    )
    # check if all of the results are not successful
@@ -1729,6 +1731,7 @@ async def stream_process(crawl_request: CrawlRequestWithHooks):
        proxies=crawl_request.proxies,
        proxy_failure_threshold=crawl_request.proxy_failure_threshold,
        proxy_recovery_time=crawl_request.proxy_recovery_time,
        table_extraction=crawl_request.table_extraction.model_dump() if crawl_request.table_extraction else None,
        dispatcher=dispatcher,
    )
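
With this wiring in place, a regular /crawl request can opt into table extraction by adding one field. An illustrative sketch (base URL assumed as before; the /crawl response shape is not shown in this diff, so only the status code is checked here):

# Illustrative /crawl request carrying the new optional field.
import requests

payload = {
    "urls": ["https://example.com/pricing"],
    "table_extraction": {"strategy": "default", "min_rows": 2, "min_cols": 2},
}
resp = requests.post("http://localhost:11235/crawl", json=payload)
print(resp.status_code)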

View File

@@ -6,7 +6,7 @@ from datetime import datetime
from enum import Enum
from pathlib import Path
from fastapi import Request
-from typing import Dict, Optional, Any
+from typing import Dict, Optional, Any, List

# Import dispatchers from crawl4ai
from crawl4ai.async_dispatcher import (
@@ -373,4 +373,187 @@ def create_chunking_strategy(config: Optional[Dict[str, Any]] = None) -> Optiona
    try:
        return strategies[strategy_type](**params)
    except Exception as e:
        raise ValueError(f"Failed to create {strategy_type} with params {params}: {str(e)}")

# ============================================================================
# Table Extraction Utilities
# ============================================================================
def create_table_extraction_strategy(config):
    """
    Create a table extraction strategy from configuration.

    Args:
        config: TableExtractionConfig instance or dict

    Returns:
        TableExtractionStrategy instance

    Raises:
        ValueError: If strategy type is unknown or configuration is invalid
    """
    from crawl4ai.table_extraction import (
        NoTableExtraction,
        DefaultTableExtraction,
        LLMTableExtraction
    )
    from schemas import TableExtractionStrategy

    # Handle both Pydantic model and dict
    if hasattr(config, 'strategy'):
        strategy_type = config.strategy
    elif isinstance(config, dict):
        strategy_type = config.get('strategy', 'default')
    else:
        strategy_type = 'default'

    # Normalize plain strings to lowercase (str-based enum members also match below)
    if isinstance(strategy_type, str):
        strategy_type = strategy_type.lower()
    # Extract configuration values
    def get_config_value(key, default=None):
        if hasattr(config, key):
            return getattr(config, key)
        elif isinstance(config, dict):
            return config.get(key, default)
        return default

    # Create strategy based on type
    if strategy_type in ['none', TableExtractionStrategy.NONE]:
        return NoTableExtraction()
    elif strategy_type in ['default', TableExtractionStrategy.DEFAULT]:
        return DefaultTableExtraction(
            table_score_threshold=get_config_value('table_score_threshold', 7),
            min_rows=get_config_value('min_rows', 0),
            min_cols=get_config_value('min_cols', 0),
            verbose=get_config_value('verbose', False)
        )
    elif strategy_type in ['llm', TableExtractionStrategy.LLM]:
        from crawl4ai.types import LLMConfig

        # Build LLM config
        llm_config = None
        llm_provider = get_config_value('llm_provider')
        llm_api_key = get_config_value('llm_api_key')
        llm_model = get_config_value('llm_model')
        llm_base_url = get_config_value('llm_base_url')
        if llm_provider or llm_api_key:
            llm_config = LLMConfig(
                provider=llm_provider or "openai/gpt-4",
                api_token=llm_api_key,
                model=llm_model,
                base_url=llm_base_url
            )
        return LLMTableExtraction(
            llm_config=llm_config,
            extraction_prompt=get_config_value('extraction_prompt'),
            table_score_threshold=get_config_value('table_score_threshold', 7),
            min_rows=get_config_value('min_rows', 0),
            min_cols=get_config_value('min_cols', 0),
            verbose=get_config_value('verbose', False)
        )
    elif strategy_type in ['financial', TableExtractionStrategy.FINANCIAL]:
        # Financial strategy uses DefaultTableExtraction with specialized settings
        # optimized for financial data (tables with currency, numbers, etc.)
        # Note: the fallbacks below (10, 2, 2) apply only when `config` is a dict
        # missing those keys; a TableExtractionConfig model carries its own
        # defaults (7, 0, 0), which take precedence via get_config_value.
        return DefaultTableExtraction(
            table_score_threshold=get_config_value('table_score_threshold', 10),  # higher threshold for financial tables
            min_rows=get_config_value('min_rows', 2),   # financial tables usually have at least 2 rows
            min_cols=get_config_value('min_cols', 2),   # financial tables usually have at least 2 columns
            verbose=get_config_value('verbose', False)
        )
    else:
        raise ValueError(f"Unknown table extraction strategy: {strategy_type}")
def format_table_response(tables: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Format extracted tables for API response.

    Args:
        tables: List of table dictionaries from a table extraction strategy

    Returns:
        List of formatted table dictionaries with a consistent structure
    """
    if not tables:
        return []

    formatted_tables = []
    for idx, table in enumerate(tables):
        formatted = {
            "table_index": idx,
            "headers": table.get("headers", []),
            "rows": table.get("rows", []),
            "caption": table.get("caption"),
            "summary": table.get("summary"),
            "metadata": table.get("metadata", {}),
            "row_count": len(table.get("rows", [])),
            "col_count": len(table.get("headers", [])),
        }
        # Add score if available (from scoring strategies)
        if "score" in table:
            formatted["score"] = table["score"]
        # Add position information if available
        if "position" in table:
            formatted["position"] = table["position"]
        formatted_tables.append(formatted)
    return formatted_tables
async def extract_tables_from_html(html: str, config=None):
    """
    Extract tables from HTML content (async wrapper around a CPU-bound operation).

    Args:
        html: HTML content as a string
        config: TableExtractionConfig instance or dict

    Returns:
        List of formatted table dictionaries

    Raises:
        ValueError: If HTML parsing fails
    """
    import asyncio
    from lxml import html as lxml_html
    from schemas import TableExtractionConfig

    # Define sync extraction function
    def _sync_extract():
        try:
            # Parse HTML
            element = lxml_html.fromstring(html)
        except Exception as e:
            raise ValueError(f"Failed to parse HTML: {str(e)}")

        # Create strategy
        cfg = config if config is not None else TableExtractionConfig()
        strategy = create_table_extraction_strategy(cfg)
        # Extract tables
        tables = strategy.extract_tables(element)
        # Format response
        return format_table_response(tables)

    # Run in the default executor to avoid blocking the event loop
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _sync_extract)

# ============================================================================
# End Table Extraction Utilities
# ============================================================================
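
The helper can also be exercised standalone, outside FastAPI; a minimal sketch assuming crawl4ai and lxml are installed and this utils.py is importable:

# Minimal standalone use of the utilities above. Note that the default
# strategy's table_score_threshold (7) may filter out trivially small tables.
import asyncio
from utils import extract_tables_from_html

HTML = """
<table>
  <caption>Prices</caption>
  <tr><th>Item</th><th>Price</th></tr>
  <tr><td>Widget</td><td>9.99</td></tr>
  <tr><td>Gadget</td><td>19.99</td></tr>
</table>
"""

async def main():
    tables = await extract_tables_from_html(HTML)  # config=None -> default strategy
    for t in tables:
        print(t["table_index"], t["headers"], "-", t["row_count"], "rows")

asyncio.run(main())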