feat: Add table extraction strategies and API documentation

- Implemented table extraction strategies (default, LLM, financial, and none) in utils.py.
- Created new API documentation for the table extraction endpoints and strategies.
- Added integration tests for table extraction covering the various strategies and error handling.
- Developed a quick test script for rapid validation of table extraction features.
2025-10-17 12:30:37 +08:00
parent 3877335d89
commit 00e9904609
8 changed files with 1979 additions and 3 deletions
--- a/deploy/docker/schemas.py
+++ b/deploy/docker/schemas.py
@@ -48,6 +48,153 @@ class DispatcherSelection(BaseModel):
 # ============================================================================


+# ============================================================================
+# Table Extraction Schemas
+# ============================================================================
+
+class TableExtractionStrategy(str, Enum):
+    """String-valued names of the table extraction strategies a request may select."""
+    NONE = "none"  # NOTE(review): presumably disables table extraction - confirm against utils.py
+    DEFAULT = "default"  # used by TableExtractionConfig.strategy when no strategy is given
+    LLM = "llm"
+    FINANCIAL = "financial"
+
+
+class TableExtractionConfig(BaseModel):
+    """Table extraction settings: the strategy to run plus the options each strategy reads."""
+
+    strategy: TableExtractionStrategy = Field(
+        default=TableExtractionStrategy.DEFAULT,
+        description="Table extraction strategy to use"
+    )
+
+    # Common table filters; per its description the score threshold targets the default strategy
+    table_score_threshold: int = Field(
+        default=7,
+        ge=0,
+        le=100,
+        description="Minimum score for a table to be considered a data table (default strategy)"
+    )
+    min_rows: int = Field(
+        default=0,
+        ge=0,
+        description="Minimum number of rows for a valid table"
+    )
+    min_cols: int = Field(
+        default=0,
+        ge=0,
+        description="Minimum number of columns for a valid table"
+    )
+
+    # LLM-specific configuration (every field is optional and defaults to None)
+    llm_provider: Optional[str] = Field(
+        None,
+        description="LLM provider for LLM strategy (e.g., 'openai/gpt-4')"
+    )
+    llm_model: Optional[str] = Field(
+        None,
+        description="Specific LLM model to use"
+    )
+    llm_api_key: Optional[str] = Field(
+        None,
+        description="API key for LLM provider (if not in environment)"
+    )
+    llm_base_url: Optional[str] = Field(
+        None,
+        description="Custom base URL for LLM API"
+    )
+    extraction_prompt: Optional[str] = Field(
+        None,
+        description="Custom prompt for LLM table extraction"
+    )
+
+    # Financial-specific configuration: number-format separators
+    decimal_separator: str = Field(
+        default=".",
+        description="Decimal separator for financial tables (e.g., '.' or ',')"
+    )
+    thousand_separator: str = Field(
+        default=",",
+        description="Thousand separator for financial tables (e.g., ',' or '.')"
+    )
+
+    # General options
+    verbose: bool = Field(
+        default=False,
+        description="Enable verbose logging for table extraction"
+    )
+
+    class Config:
+        json_schema_extra = {  # Pydantic v2 key (v1 called it ``schema_extra``); feeds the JSON schema example
+            "example": {
+                "strategy": "default",
+                "table_score_threshold": 7,
+                "min_rows": 2,
+                "min_cols": 2
+            }
+        }
+
+
+class TableExtractionRequest(BaseModel):
+    """Request body for the dedicated table extraction endpoint: a ``url`` or raw ``html`` plus config."""
+    # Both sources are optional at the schema level; this model does not check that one is set.
+    url: Optional[str] = Field(
+        None,
+        description="URL to crawl and extract tables from"
+    )
+    html: Optional[str] = Field(
+        None,
+        description="Raw HTML content to extract tables from"
+    )
+    config: TableExtractionConfig = Field(
+        default_factory=TableExtractionConfig,
+        description="Table extraction configuration"
+    )
+
+    # Browser config (only used if URL is provided)
+    browser_config: Optional[Dict] = Field(
+        default_factory=dict,
+        description="Browser configuration for URL crawling"
+    )
+
+    class Config:
+        json_schema_extra = {  # Pydantic v2 key (v1 called it ``schema_extra``); feeds the JSON schema example
+            "example": {
+                "url": "https://example.com/data-table",
+                "config": {
+                    "strategy": "default",
+                    "min_rows": 2
+                }
+            }
+        }
+
+
+class TableExtractionBatchRequest(BaseModel):
+    """Request body for batch table extraction over lists of HTML documents or URLs."""
+
+    html_list: Optional[List[str]] = Field(
+        description="List of HTML contents to extract tables from",
+        default=None,
+    )
+    url_list: Optional[List[str]] = Field(
+        description="List of URLs to extract tables from",
+        default=None,
+    )
+    config: TableExtractionConfig = Field(
+        description="Table extraction configuration",
+        default_factory=TableExtractionConfig,
+    )
+    browser_config: Optional[Dict] = Field(
+        description="Browser configuration",
+        default_factory=dict,
+    )
+
+
+# ============================================================================
+# End Table Extraction Schemas
+# ============================================================================
+
+
 class CrawlRequest(BaseModel):
    urls: List[str] = Field(min_length=1, max_length=100)
    browser_config: Optional[Dict] = Field(default_factory=dict)
@@ -77,6 +224,11 @@ class CrawlRequest(BaseModel):
    proxy_recovery_time: Optional[int] = Field(
        300, ge=60, le=3600, description="Recovery time in seconds for failure_aware strategy"
    )
+    
+    # Table extraction configuration
+    table_extraction: Optional[TableExtractionConfig] = Field(
+        None, description="Optional table extraction configuration to extract tables during crawl"
+    )


 class HookConfig(BaseModel):