feat: Add table extraction strategies and API documentation
- Implemented table extraction strategies: default, LLM, financial, and none in utils.py. - Created new API documentation for table extraction endpoints and strategies. - Added integration tests for table extraction functionality covering various strategies and error handling. - Developed quick test script for rapid validation of table extraction features.
This commit is contained in:
@@ -731,6 +731,7 @@ async def handle_crawl_request(
|
|||||||
proxies: Optional[List[Dict[str, Any]]] = None,
|
proxies: Optional[List[Dict[str, Any]]] = None,
|
||||||
proxy_failure_threshold: int = 3,
|
proxy_failure_threshold: int = 3,
|
||||||
proxy_recovery_time: int = 300,
|
proxy_recovery_time: int = 300,
|
||||||
|
table_extraction: Optional[dict] = None,
|
||||||
dispatcher = None,
|
dispatcher = None,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Handle non-streaming crawl requests with optional hooks."""
|
"""Handle non-streaming crawl requests with optional hooks."""
|
||||||
@@ -768,6 +769,19 @@ async def handle_crawl_request(
|
|||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
|
# Configure table extraction strategy if specified
|
||||||
|
if table_extraction:
|
||||||
|
try:
|
||||||
|
from schemas import TableExtractionConfig
|
||||||
|
from utils import create_table_extraction_strategy
|
||||||
|
|
||||||
|
table_config = TableExtractionConfig(**table_extraction)
|
||||||
|
table_strategy = create_table_extraction_strategy(table_config)
|
||||||
|
crawler_config.table_extraction_strategy = table_strategy
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error creating table extraction strategy: {e}")
|
||||||
|
raise HTTPException(status_code=400, detail=f"Invalid table extraction config: {str(e)}")
|
||||||
|
|
||||||
# Configure browser adapter based on anti_bot_strategy
|
# Configure browser adapter based on anti_bot_strategy
|
||||||
browser_adapter = _get_browser_adapter(anti_bot_strategy, browser_config)
|
browser_adapter = _get_browser_adapter(anti_bot_strategy, browser_config)
|
||||||
|
|
||||||
@@ -974,6 +988,7 @@ async def handle_stream_crawl_request(
|
|||||||
proxies: Optional[List[Dict[str, Any]]] = None,
|
proxies: Optional[List[Dict[str, Any]]] = None,
|
||||||
proxy_failure_threshold: int = 3,
|
proxy_failure_threshold: int = 3,
|
||||||
proxy_recovery_time: int = 300,
|
proxy_recovery_time: int = 300,
|
||||||
|
table_extraction: Optional[dict] = None,
|
||||||
dispatcher = None,
|
dispatcher = None,
|
||||||
) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
|
) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
|
||||||
"""Handle streaming crawl requests with optional hooks."""
|
"""Handle streaming crawl requests with optional hooks."""
|
||||||
@@ -1003,6 +1018,19 @@ async def handle_stream_crawl_request(
|
|||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
raise HTTPException(status_code=400, detail=str(e))
|
||||||
|
|
||||||
|
# Configure table extraction strategy if specified
|
||||||
|
if table_extraction:
|
||||||
|
try:
|
||||||
|
from schemas import TableExtractionConfig
|
||||||
|
from utils import create_table_extraction_strategy
|
||||||
|
|
||||||
|
table_config = TableExtractionConfig(**table_extraction)
|
||||||
|
table_strategy = create_table_extraction_strategy(table_config)
|
||||||
|
crawler_config.table_extraction_strategy = table_strategy
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error creating table extraction strategy: {e}")
|
||||||
|
raise HTTPException(status_code=400, detail=f"Invalid table extraction config: {str(e)}")
|
||||||
|
|
||||||
# Configure browser adapter based on anti_bot_strategy
|
# Configure browser adapter based on anti_bot_strategy
|
||||||
browser_adapter = _get_browser_adapter(anti_bot_strategy, browser_config)
|
browser_adapter = _get_browser_adapter(anti_bot_strategy, browser_config)
|
||||||
|
|
||||||
|
|||||||
301
deploy/docker/routers/tables.py
Normal file
301
deploy/docker/routers/tables.py
Normal file
@@ -0,0 +1,301 @@
|
|||||||
|
"""
|
||||||
|
Table Extraction Router for Crawl4AI Docker Server
|
||||||
|
|
||||||
|
This module provides dedicated endpoints for table extraction from HTML or URLs,
|
||||||
|
separate from the main crawling functionality.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
|
||||||
|
# Import crawler pool for browser reuse
|
||||||
|
from crawler_pool import get_crawler
|
||||||
|
|
||||||
|
# Import schemas
|
||||||
|
from schemas import (
|
||||||
|
TableExtractionRequest,
|
||||||
|
TableExtractionBatchRequest,
|
||||||
|
TableExtractionConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Import utilities
|
||||||
|
from utils import (
|
||||||
|
extract_tables_from_html,
|
||||||
|
format_table_response,
|
||||||
|
create_table_extraction_strategy,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Configure logger
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Create router
|
||||||
|
router = APIRouter(prefix="/tables", tags=["Table Extraction"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"/extract",
|
||||||
|
summary="Extract Tables from HTML or URL",
|
||||||
|
description="""
|
||||||
|
Extract tables from HTML content or by fetching a URL.
|
||||||
|
Supports multiple extraction strategies: default, LLM-based, or financial.
|
||||||
|
|
||||||
|
**Input Options:**
|
||||||
|
- Provide `html` for direct HTML content extraction
|
||||||
|
- Provide `url` to fetch and extract from a live page
|
||||||
|
- Cannot provide both `html` and `url` simultaneously
|
||||||
|
|
||||||
|
**Strategies:**
|
||||||
|
- `default`: Fast regex and HTML structure-based extraction
|
||||||
|
- `llm`: AI-powered extraction with semantic understanding (requires LLM config)
|
||||||
|
- `financial`: Specialized extraction for financial tables with numerical formatting
|
||||||
|
|
||||||
|
**Returns:**
|
||||||
|
- List of extracted tables with headers, rows, and metadata
|
||||||
|
- Each table includes cell-level details and formatting information
|
||||||
|
""",
|
||||||
|
response_description="Extracted tables with metadata",
|
||||||
|
)
|
||||||
|
async def extract_tables(request: TableExtractionRequest) -> JSONResponse:
|
||||||
|
"""
|
||||||
|
Extract tables from HTML content or URL.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
request: TableExtractionRequest with html/url and extraction config
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
JSONResponse with extracted tables and metadata
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
HTTPException: If validation fails or extraction errors occur
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Validate input
|
||||||
|
if request.html and request.url:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail="Cannot provide both 'html' and 'url'. Choose one input method."
|
||||||
|
)
|
||||||
|
|
||||||
|
if not request.html and not request.url:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail="Must provide either 'html' or 'url' for table extraction."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Handle URL-based extraction
|
||||||
|
if request.url:
|
||||||
|
# Import crawler configs
|
||||||
|
from async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Create minimal browser config
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
headless=True,
|
||||||
|
verbose=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create crawler config with table extraction
|
||||||
|
table_strategy = create_table_extraction_strategy(request.config)
|
||||||
|
crawler_config = CrawlerRunConfig(
|
||||||
|
table_extraction_strategy=table_strategy,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get crawler from pool (browser reuse for memory efficiency)
|
||||||
|
crawler = await get_crawler(browser_config, adapter=None)
|
||||||
|
|
||||||
|
# Crawl the URL
|
||||||
|
result = await crawler.arun(
|
||||||
|
url=request.url,
|
||||||
|
config=crawler_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not result.success:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=f"Failed to fetch URL: {result.error_message}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Extract HTML
|
||||||
|
html_content = result.html
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error fetching URL {request.url}: {e}")
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=f"Failed to fetch and extract from URL: {str(e)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Use provided HTML
|
||||||
|
html_content = request.html
|
||||||
|
|
||||||
|
# Extract tables from HTML
|
||||||
|
tables = await extract_tables_from_html(html_content, request.config)
|
||||||
|
|
||||||
|
# Format response
|
||||||
|
formatted_tables = format_table_response(tables)
|
||||||
|
|
||||||
|
return JSONResponse({
|
||||||
|
"success": True,
|
||||||
|
"table_count": len(formatted_tables),
|
||||||
|
"tables": formatted_tables,
|
||||||
|
"strategy": request.config.strategy.value,
|
||||||
|
})
|
||||||
|
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error extracting tables: {e}", exc_info=True)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=f"Table extraction failed: {str(e)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"/extract/batch",
|
||||||
|
summary="Extract Tables from Multiple Sources (Batch)",
|
||||||
|
description="""
|
||||||
|
Extract tables from multiple HTML contents or URLs in a single request.
|
||||||
|
Processes each input independently and returns results for all.
|
||||||
|
|
||||||
|
**Batch Processing:**
|
||||||
|
- Provide list of HTML contents and/or URLs
|
||||||
|
- Each input is processed with the same extraction strategy
|
||||||
|
- Partial failures are allowed (returns results for successful extractions)
|
||||||
|
|
||||||
|
**Use Cases:**
|
||||||
|
- Extracting tables from multiple pages simultaneously
|
||||||
|
- Bulk financial data extraction
|
||||||
|
- Comparing table structures across multiple sources
|
||||||
|
""",
|
||||||
|
response_description="Batch extraction results with per-item success status",
|
||||||
|
)
|
||||||
|
async def extract_tables_batch(request: TableExtractionBatchRequest) -> JSONResponse:
|
||||||
|
"""
|
||||||
|
Extract tables from multiple HTML contents or URLs in batch.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
request: TableExtractionBatchRequest with list of html/url and config
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
JSONResponse with batch results
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
HTTPException: If validation fails
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Validate batch request
|
||||||
|
total_items = len(request.html_list or []) + len(request.url_list or [])
|
||||||
|
|
||||||
|
if total_items == 0:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail="Must provide at least one HTML content or URL in batch request."
|
||||||
|
)
|
||||||
|
|
||||||
|
if total_items > 50: # Reasonable batch limit
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Batch size ({total_items}) exceeds maximum allowed (50)."
|
||||||
|
)
|
||||||
|
|
||||||
|
results = []
|
||||||
|
|
||||||
|
# Process HTML list
|
||||||
|
if request.html_list:
|
||||||
|
for idx, html_content in enumerate(request.html_list):
|
||||||
|
try:
|
||||||
|
tables = await extract_tables_from_html(html_content, request.config)
|
||||||
|
formatted_tables = format_table_response(tables)
|
||||||
|
|
||||||
|
results.append({
|
||||||
|
"success": True,
|
||||||
|
"source": f"html_{idx}",
|
||||||
|
"table_count": len(formatted_tables),
|
||||||
|
"tables": formatted_tables,
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error extracting tables from html_{idx}: {e}")
|
||||||
|
results.append({
|
||||||
|
"success": False,
|
||||||
|
"source": f"html_{idx}",
|
||||||
|
"error": str(e),
|
||||||
|
})
|
||||||
|
|
||||||
|
# Process URL list
|
||||||
|
if request.url_list:
|
||||||
|
from async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
headless=True,
|
||||||
|
verbose=False,
|
||||||
|
)
|
||||||
|
table_strategy = create_table_extraction_strategy(request.config)
|
||||||
|
crawler_config = CrawlerRunConfig(
|
||||||
|
table_extraction_strategy=table_strategy,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get crawler from pool (reuse browser for all URLs in batch)
|
||||||
|
crawler = await get_crawler(browser_config, adapter=None)
|
||||||
|
|
||||||
|
for url in request.url_list:
|
||||||
|
try:
|
||||||
|
result = await crawler.arun(
|
||||||
|
url=url,
|
||||||
|
config=crawler_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
html_content = result.html
|
||||||
|
tables = await extract_tables_from_html(html_content, request.config)
|
||||||
|
formatted_tables = format_table_response(tables)
|
||||||
|
|
||||||
|
results.append({
|
||||||
|
"success": True,
|
||||||
|
"source": url,
|
||||||
|
"table_count": len(formatted_tables),
|
||||||
|
"tables": formatted_tables,
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
results.append({
|
||||||
|
"success": False,
|
||||||
|
"source": url,
|
||||||
|
"error": result.error_message,
|
||||||
|
})
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error extracting tables from {url}: {e}")
|
||||||
|
results.append({
|
||||||
|
"success": False,
|
||||||
|
"source": url,
|
||||||
|
"error": str(e),
|
||||||
|
})
|
||||||
|
|
||||||
|
# Calculate summary
|
||||||
|
successful = sum(1 for r in results if r["success"])
|
||||||
|
failed = len(results) - successful
|
||||||
|
total_tables = sum(r.get("table_count", 0) for r in results if r["success"])
|
||||||
|
|
||||||
|
return JSONResponse({
|
||||||
|
"success": True,
|
||||||
|
"summary": {
|
||||||
|
"total_processed": len(results),
|
||||||
|
"successful": successful,
|
||||||
|
"failed": failed,
|
||||||
|
"total_tables_extracted": total_tables,
|
||||||
|
},
|
||||||
|
"results": results,
|
||||||
|
"strategy": request.config.strategy.value,
|
||||||
|
})
|
||||||
|
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in batch table extraction: {e}", exc_info=True)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=f"Batch table extraction failed: {str(e)}"
|
||||||
|
)
|
||||||
@@ -48,6 +48,153 @@ class DispatcherSelection(BaseModel):
|
|||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Table Extraction Schemas
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class TableExtractionStrategy(str, Enum):
|
||||||
|
"""Available table extraction strategies."""
|
||||||
|
NONE = "none"
|
||||||
|
DEFAULT = "default"
|
||||||
|
LLM = "llm"
|
||||||
|
FINANCIAL = "financial"
|
||||||
|
|
||||||
|
|
||||||
|
class TableExtractionConfig(BaseModel):
|
||||||
|
"""Configuration for table extraction."""
|
||||||
|
|
||||||
|
strategy: TableExtractionStrategy = Field(
|
||||||
|
default=TableExtractionStrategy.DEFAULT,
|
||||||
|
description="Table extraction strategy to use"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Common configuration for all strategies
|
||||||
|
table_score_threshold: int = Field(
|
||||||
|
default=7,
|
||||||
|
ge=0,
|
||||||
|
le=100,
|
||||||
|
description="Minimum score for a table to be considered a data table (default strategy)"
|
||||||
|
)
|
||||||
|
min_rows: int = Field(
|
||||||
|
default=0,
|
||||||
|
ge=0,
|
||||||
|
description="Minimum number of rows for a valid table"
|
||||||
|
)
|
||||||
|
min_cols: int = Field(
|
||||||
|
default=0,
|
||||||
|
ge=0,
|
||||||
|
description="Minimum number of columns for a valid table"
|
||||||
|
)
|
||||||
|
|
||||||
|
# LLM-specific configuration
|
||||||
|
llm_provider: Optional[str] = Field(
|
||||||
|
None,
|
||||||
|
description="LLM provider for LLM strategy (e.g., 'openai/gpt-4')"
|
||||||
|
)
|
||||||
|
llm_model: Optional[str] = Field(
|
||||||
|
None,
|
||||||
|
description="Specific LLM model to use"
|
||||||
|
)
|
||||||
|
llm_api_key: Optional[str] = Field(
|
||||||
|
None,
|
||||||
|
description="API key for LLM provider (if not in environment)"
|
||||||
|
)
|
||||||
|
llm_base_url: Optional[str] = Field(
|
||||||
|
None,
|
||||||
|
description="Custom base URL for LLM API"
|
||||||
|
)
|
||||||
|
extraction_prompt: Optional[str] = Field(
|
||||||
|
None,
|
||||||
|
description="Custom prompt for LLM table extraction"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Financial-specific configuration
|
||||||
|
decimal_separator: str = Field(
|
||||||
|
default=".",
|
||||||
|
description="Decimal separator for financial tables (e.g., '.' or ',')"
|
||||||
|
)
|
||||||
|
thousand_separator: str = Field(
|
||||||
|
default=",",
|
||||||
|
description="Thousand separator for financial tables (e.g., ',' or '.')"
|
||||||
|
)
|
||||||
|
|
||||||
|
# General options
|
||||||
|
verbose: bool = Field(
|
||||||
|
default=False,
|
||||||
|
description="Enable verbose logging for table extraction"
|
||||||
|
)
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
schema_extra = {
|
||||||
|
"example": {
|
||||||
|
"strategy": "default",
|
||||||
|
"table_score_threshold": 7,
|
||||||
|
"min_rows": 2,
|
||||||
|
"min_cols": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TableExtractionRequest(BaseModel):
|
||||||
|
"""Request for dedicated table extraction endpoint."""
|
||||||
|
|
||||||
|
url: Optional[str] = Field(
|
||||||
|
None,
|
||||||
|
description="URL to crawl and extract tables from"
|
||||||
|
)
|
||||||
|
html: Optional[str] = Field(
|
||||||
|
None,
|
||||||
|
description="Raw HTML content to extract tables from"
|
||||||
|
)
|
||||||
|
config: TableExtractionConfig = Field(
|
||||||
|
default_factory=lambda: TableExtractionConfig(),
|
||||||
|
description="Table extraction configuration"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Browser config (only used if URL is provided)
|
||||||
|
browser_config: Optional[Dict] = Field(
|
||||||
|
default_factory=dict,
|
||||||
|
description="Browser configuration for URL crawling"
|
||||||
|
)
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
schema_extra = {
|
||||||
|
"example": {
|
||||||
|
"url": "https://example.com/data-table",
|
||||||
|
"config": {
|
||||||
|
"strategy": "default",
|
||||||
|
"min_rows": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TableExtractionBatchRequest(BaseModel):
|
||||||
|
"""Request for batch table extraction."""
|
||||||
|
|
||||||
|
html_list: Optional[List[str]] = Field(
|
||||||
|
None,
|
||||||
|
description="List of HTML contents to extract tables from"
|
||||||
|
)
|
||||||
|
url_list: Optional[List[str]] = Field(
|
||||||
|
None,
|
||||||
|
description="List of URLs to extract tables from"
|
||||||
|
)
|
||||||
|
config: TableExtractionConfig = Field(
|
||||||
|
default_factory=lambda: TableExtractionConfig(),
|
||||||
|
description="Table extraction configuration"
|
||||||
|
)
|
||||||
|
browser_config: Optional[Dict] = Field(
|
||||||
|
default_factory=dict,
|
||||||
|
description="Browser configuration"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# End Table Extraction Schemas
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
class CrawlRequest(BaseModel):
|
class CrawlRequest(BaseModel):
|
||||||
urls: List[str] = Field(min_length=1, max_length=100)
|
urls: List[str] = Field(min_length=1, max_length=100)
|
||||||
browser_config: Optional[Dict] = Field(default_factory=dict)
|
browser_config: Optional[Dict] = Field(default_factory=dict)
|
||||||
@@ -78,6 +225,11 @@ class CrawlRequest(BaseModel):
|
|||||||
300, ge=60, le=3600, description="Recovery time in seconds for failure_aware strategy"
|
300, ge=60, le=3600, description="Recovery time in seconds for failure_aware strategy"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Table extraction configuration
|
||||||
|
table_extraction: Optional[TableExtractionConfig] = Field(
|
||||||
|
None, description="Optional table extraction configuration to extract tables during crawl"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class HookConfig(BaseModel):
|
class HookConfig(BaseModel):
|
||||||
"""Configuration for user-provided hooks"""
|
"""Configuration for user-provided hooks"""
|
||||||
|
|||||||
@@ -87,7 +87,7 @@ from prometheus_fastapi_instrumentator import Instrumentator
|
|||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from rank_bm25 import BM25Okapi
|
from rank_bm25 import BM25Okapi
|
||||||
from redis import asyncio as aioredis
|
from redis import asyncio as aioredis
|
||||||
from routers import adaptive, dispatchers, scripts, monitoring
|
from routers import adaptive, dispatchers, scripts, monitoring, tables
|
||||||
from schemas import (
|
from schemas import (
|
||||||
CrawlRequest,
|
CrawlRequest,
|
||||||
CrawlRequestWithHooks,
|
CrawlRequestWithHooks,
|
||||||
@@ -298,6 +298,7 @@ app.include_router(adaptive.router)
|
|||||||
app.include_router(dispatchers.router)
|
app.include_router(dispatchers.router)
|
||||||
app.include_router(scripts.router)
|
app.include_router(scripts.router)
|
||||||
app.include_router(monitoring.router)
|
app.include_router(monitoring.router)
|
||||||
|
app.include_router(tables.router)
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────── Endpoints ──────────────────────────
|
# ──────────────────────── Endpoints ──────────────────────────
|
||||||
@@ -1578,6 +1579,7 @@ async def crawl(
|
|||||||
proxies=crawl_request.proxies,
|
proxies=crawl_request.proxies,
|
||||||
proxy_failure_threshold=crawl_request.proxy_failure_threshold,
|
proxy_failure_threshold=crawl_request.proxy_failure_threshold,
|
||||||
proxy_recovery_time=crawl_request.proxy_recovery_time,
|
proxy_recovery_time=crawl_request.proxy_recovery_time,
|
||||||
|
table_extraction=crawl_request.table_extraction.model_dump() if crawl_request.table_extraction else None,
|
||||||
dispatcher=dispatcher,
|
dispatcher=dispatcher,
|
||||||
)
|
)
|
||||||
# check if all of the results are not successful
|
# check if all of the results are not successful
|
||||||
@@ -1729,6 +1731,7 @@ async def stream_process(crawl_request: CrawlRequestWithHooks):
|
|||||||
proxies=crawl_request.proxies,
|
proxies=crawl_request.proxies,
|
||||||
proxy_failure_threshold=crawl_request.proxy_failure_threshold,
|
proxy_failure_threshold=crawl_request.proxy_failure_threshold,
|
||||||
proxy_recovery_time=crawl_request.proxy_recovery_time,
|
proxy_recovery_time=crawl_request.proxy_recovery_time,
|
||||||
|
table_extraction=crawl_request.table_extraction.model_dump() if crawl_request.table_extraction else None,
|
||||||
dispatcher=dispatcher,
|
dispatcher=dispatcher,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ from datetime import datetime
|
|||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from fastapi import Request
|
from fastapi import Request
|
||||||
from typing import Dict, Optional, Any
|
from typing import Dict, Optional, Any, List
|
||||||
|
|
||||||
# Import dispatchers from crawl4ai
|
# Import dispatchers from crawl4ai
|
||||||
from crawl4ai.async_dispatcher import (
|
from crawl4ai.async_dispatcher import (
|
||||||
@@ -374,3 +374,186 @@ def create_chunking_strategy(config: Optional[Dict[str, Any]] = None) -> Optiona
|
|||||||
return strategies[strategy_type](**params)
|
return strategies[strategy_type](**params)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise ValueError(f"Failed to create {strategy_type} with params {params}: {str(e)}")
|
raise ValueError(f"Failed to create {strategy_type} with params {params}: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Table Extraction Utilities
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def create_table_extraction_strategy(config):
|
||||||
|
"""
|
||||||
|
Create a table extraction strategy from configuration.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: TableExtractionConfig instance or dict
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
TableExtractionStrategy instance
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If strategy type is unknown or configuration is invalid
|
||||||
|
"""
|
||||||
|
from crawl4ai.table_extraction import (
|
||||||
|
NoTableExtraction,
|
||||||
|
DefaultTableExtraction,
|
||||||
|
LLMTableExtraction
|
||||||
|
)
|
||||||
|
from schemas import TableExtractionStrategy
|
||||||
|
|
||||||
|
# Handle both Pydantic model and dict
|
||||||
|
if hasattr(config, 'strategy'):
|
||||||
|
strategy_type = config.strategy
|
||||||
|
elif isinstance(config, dict):
|
||||||
|
strategy_type = config.get('strategy', 'default')
|
||||||
|
else:
|
||||||
|
strategy_type = 'default'
|
||||||
|
|
||||||
|
# Convert string to enum if needed
|
||||||
|
if isinstance(strategy_type, str):
|
||||||
|
strategy_type = strategy_type.lower()
|
||||||
|
|
||||||
|
# Extract configuration values
|
||||||
|
def get_config_value(key, default=None):
|
||||||
|
if hasattr(config, key):
|
||||||
|
return getattr(config, key)
|
||||||
|
elif isinstance(config, dict):
|
||||||
|
return config.get(key, default)
|
||||||
|
return default
|
||||||
|
|
||||||
|
# Create strategy based on type
|
||||||
|
if strategy_type in ['none', TableExtractionStrategy.NONE]:
|
||||||
|
return NoTableExtraction()
|
||||||
|
|
||||||
|
elif strategy_type in ['default', TableExtractionStrategy.DEFAULT]:
|
||||||
|
return DefaultTableExtraction(
|
||||||
|
table_score_threshold=get_config_value('table_score_threshold', 7),
|
||||||
|
min_rows=get_config_value('min_rows', 0),
|
||||||
|
min_cols=get_config_value('min_cols', 0),
|
||||||
|
verbose=get_config_value('verbose', False)
|
||||||
|
)
|
||||||
|
|
||||||
|
elif strategy_type in ['llm', TableExtractionStrategy.LLM]:
|
||||||
|
from crawl4ai.types import LLMConfig
|
||||||
|
|
||||||
|
# Build LLM config
|
||||||
|
llm_config = None
|
||||||
|
llm_provider = get_config_value('llm_provider')
|
||||||
|
llm_api_key = get_config_value('llm_api_key')
|
||||||
|
llm_model = get_config_value('llm_model')
|
||||||
|
llm_base_url = get_config_value('llm_base_url')
|
||||||
|
|
||||||
|
if llm_provider or llm_api_key:
|
||||||
|
llm_config = LLMConfig(
|
||||||
|
provider=llm_provider or "openai/gpt-4",
|
||||||
|
api_token=llm_api_key,
|
||||||
|
model=llm_model,
|
||||||
|
base_url=llm_base_url
|
||||||
|
)
|
||||||
|
|
||||||
|
return LLMTableExtraction(
|
||||||
|
llm_config=llm_config,
|
||||||
|
extraction_prompt=get_config_value('extraction_prompt'),
|
||||||
|
table_score_threshold=get_config_value('table_score_threshold', 7),
|
||||||
|
min_rows=get_config_value('min_rows', 0),
|
||||||
|
min_cols=get_config_value('min_cols', 0),
|
||||||
|
verbose=get_config_value('verbose', False)
|
||||||
|
)
|
||||||
|
|
||||||
|
elif strategy_type in ['financial', TableExtractionStrategy.FINANCIAL]:
|
||||||
|
# Financial strategy uses DefaultTableExtraction with specialized settings
|
||||||
|
# optimized for financial data (tables with currency, numbers, etc.)
|
||||||
|
return DefaultTableExtraction(
|
||||||
|
table_score_threshold=get_config_value('table_score_threshold', 10), # Higher threshold for financial
|
||||||
|
min_rows=get_config_value('min_rows', 2), # Financial tables usually have at least 2 rows
|
||||||
|
min_cols=get_config_value('min_cols', 2), # Financial tables usually have at least 2 columns
|
||||||
|
verbose=get_config_value('verbose', False)
|
||||||
|
)
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown table extraction strategy: {strategy_type}")
|
||||||
|
|
||||||
|
|
||||||
|
def format_table_response(tables: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||||
|
"""
|
||||||
|
Format extracted tables for API response.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
tables: List of table dictionaries from table extraction strategy
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of formatted table dictionaries with consistent structure
|
||||||
|
"""
|
||||||
|
if not tables:
|
||||||
|
return []
|
||||||
|
|
||||||
|
formatted_tables = []
|
||||||
|
for idx, table in enumerate(tables):
|
||||||
|
formatted = {
|
||||||
|
"table_index": idx,
|
||||||
|
"headers": table.get("headers", []),
|
||||||
|
"rows": table.get("rows", []),
|
||||||
|
"caption": table.get("caption"),
|
||||||
|
"summary": table.get("summary"),
|
||||||
|
"metadata": table.get("metadata", {}),
|
||||||
|
"row_count": len(table.get("rows", [])),
|
||||||
|
"col_count": len(table.get("headers", [])),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add score if available (from scoring strategies)
|
||||||
|
if "score" in table:
|
||||||
|
formatted["score"] = table["score"]
|
||||||
|
|
||||||
|
# Add position information if available
|
||||||
|
if "position" in table:
|
||||||
|
formatted["position"] = table["position"]
|
||||||
|
|
||||||
|
formatted_tables.append(formatted)
|
||||||
|
|
||||||
|
return formatted_tables
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_tables_from_html(html: str, config = None):
|
||||||
|
"""
|
||||||
|
Extract tables from HTML content (async wrapper for CPU-bound operation).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
html: HTML content as string
|
||||||
|
config: TableExtractionConfig instance or dict
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of formatted table dictionaries
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If HTML parsing fails
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
from functools import partial
|
||||||
|
from lxml import html as lxml_html
|
||||||
|
from schemas import TableExtractionConfig
|
||||||
|
|
||||||
|
# Define sync extraction function
|
||||||
|
def _sync_extract():
|
||||||
|
try:
|
||||||
|
# Parse HTML
|
||||||
|
element = lxml_html.fromstring(html)
|
||||||
|
except Exception as e:
|
||||||
|
raise ValueError(f"Failed to parse HTML: {str(e)}")
|
||||||
|
|
||||||
|
# Create strategy
|
||||||
|
cfg = config if config is not None else TableExtractionConfig()
|
||||||
|
strategy = create_table_extraction_strategy(cfg)
|
||||||
|
|
||||||
|
# Extract tables
|
||||||
|
tables = strategy.extract_tables(element)
|
||||||
|
|
||||||
|
# Format response
|
||||||
|
return format_table_response(tables)
|
||||||
|
|
||||||
|
# Run in executor to avoid blocking the event loop
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
return await loop.run_in_executor(None, _sync_extract)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# End Table Extraction Utilities
|
||||||
|
# ============================================================================
|
||||||
626
docs/examples/table-extraction-api.md
Normal file
626
docs/examples/table-extraction-api.md
Normal file
@@ -0,0 +1,626 @@
|
|||||||
|
# Table Extraction API Documentation
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The Crawl4AI Docker Server provides powerful table extraction capabilities through both **integrated** and **dedicated** endpoints. Extract structured data from HTML tables using multiple strategies: default (fast regex-based), LLM-powered (semantic understanding), or financial (specialized for financial data).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
1. [Quick Start](#quick-start)
|
||||||
|
2. [Extraction Strategies](#extraction-strategies)
|
||||||
|
3. [Integrated Extraction (with /crawl)](#integrated-extraction)
|
||||||
|
4. [Dedicated Endpoints (/tables)](#dedicated-endpoints)
|
||||||
|
5. [Batch Processing](#batch-processing)
|
||||||
|
6. [Configuration Options](#configuration-options)
|
||||||
|
7. [Response Format](#response-format)
|
||||||
|
8. [Error Handling](#error-handling)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### Extract Tables During Crawl
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com/financial-data"],
|
||||||
|
"table_extraction": {
|
||||||
|
"strategy": "default"
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Extract Tables from HTML
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/tables/extract \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"html": "<table><tr><th>Name</th><th>Value</th></tr><tr><td>A</td><td>100</td></tr></table>",
|
||||||
|
"config": {
|
||||||
|
"strategy": "default"
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Extraction Strategies
|
||||||
|
|
||||||
|
### 1. **Default Strategy** (Fast, Regex-Based)
|
||||||
|
|
||||||
|
Best for general-purpose table extraction with high performance.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"strategy": "default"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Use Cases:**
|
||||||
|
- General web scraping
|
||||||
|
- Simple data tables
|
||||||
|
- High-volume extraction
|
||||||
|
|
||||||
|
### 2. **LLM Strategy** (AI-Powered)
|
||||||
|
|
||||||
|
Uses Large Language Models for semantic understanding and complex table structures.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"strategy": "llm",
|
||||||
|
"llm_provider": "openai",
|
||||||
|
"llm_model": "gpt-4",
|
||||||
|
"llm_api_key": "your-api-key",
|
||||||
|
"llm_prompt": "Extract and structure the financial data"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Use Cases:**
|
||||||
|
- Complex nested tables
|
||||||
|
- Tables with irregular structure
|
||||||
|
- Semantic data extraction
|
||||||
|
|
||||||
|
**Supported Providers:**
|
||||||
|
- `openai` (GPT-3.5, GPT-4)
|
||||||
|
- `anthropic` (Claude)
|
||||||
|
- `huggingface` (Open models)
|
||||||
|
|
||||||
|
### 3. **Financial Strategy** (Specialized)
|
||||||
|
|
||||||
|
Optimized for financial tables with proper numerical formatting.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"strategy": "financial",
|
||||||
|
"preserve_formatting": true,
|
||||||
|
"extract_metadata": true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Use Cases:**
|
||||||
|
- Stock data
|
||||||
|
- Financial statements
|
||||||
|
- Accounting tables
|
||||||
|
- Price lists
|
||||||
|
|
||||||
|
### 4. **None Strategy** (No Extraction)
|
||||||
|
|
||||||
|
Disables table extraction.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"strategy": "none"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Integrated Extraction
|
||||||
|
|
||||||
|
Add table extraction to any crawl request by including the `table_extraction` configuration.
|
||||||
|
|
||||||
|
### Example: Basic Integration
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
response = requests.post("http://localhost:11235/crawl", json={
|
||||||
|
"urls": ["https://finance.yahoo.com/quote/AAPL"],
|
||||||
|
"browser_config": {
|
||||||
|
"headless": True
|
||||||
|
},
|
||||||
|
"crawler_config": {
|
||||||
|
"wait_until": "networkidle"
|
||||||
|
},
|
||||||
|
"table_extraction": {
|
||||||
|
"strategy": "financial",
|
||||||
|
"preserve_formatting": True
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
data = response.json()
|
||||||
|
for result in data["results"]:
|
||||||
|
if result["success"]:
|
||||||
|
print(f"Found {len(result.get('tables', []))} tables")
|
||||||
|
for table in result.get("tables", []):
|
||||||
|
print(f"Table: {table['headers']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example: Multiple URLs with Table Extraction
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// Node.js example
|
||||||
|
const axios = require('axios');
|
||||||
|
|
||||||
|
const response = await axios.post('http://localhost:11235/crawl', {
|
||||||
|
urls: [
|
||||||
|
'https://example.com/page1',
|
||||||
|
'https://example.com/page2',
|
||||||
|
'https://example.com/page3'
|
||||||
|
],
|
||||||
|
table_extraction: {
|
||||||
|
strategy: 'default'
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
response.data.results.forEach((result, index) => {
|
||||||
|
console.log(`Page ${index + 1}:`);
|
||||||
|
console.log(` Tables found: ${result.tables?.length || 0}`);
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example: LLM-Based Extraction with Custom Prompt
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com/complex-data"],
|
||||||
|
"table_extraction": {
|
||||||
|
"strategy": "llm",
|
||||||
|
"llm_provider": "openai",
|
||||||
|
"llm_model": "gpt-4",
|
||||||
|
"llm_api_key": "sk-...",
|
||||||
|
"llm_prompt": "Extract product pricing information, including discounts and availability"
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dedicated Endpoints
|
||||||
|
|
||||||
|
### `/tables/extract` - Single Extraction
|
||||||
|
|
||||||
|
Extract tables from HTML content or by fetching a URL.
|
||||||
|
|
||||||
|
#### Extract from HTML
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
html_content = """
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr><th>Product</th><th>Price</th><th>Stock</th></tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>Widget A</td><td>$19.99</td><td>In Stock</td></tr>
|
||||||
|
<tr><td>Widget B</td><td>$29.99</td><td>Out of Stock</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
"""
|
||||||
|
|
||||||
|
response = requests.post("http://localhost:11235/tables/extract", json={
|
||||||
|
"html": html_content,
|
||||||
|
"config": {
|
||||||
|
"strategy": "default"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
data = response.json()
|
||||||
|
print(f"Success: {data['success']}")
|
||||||
|
print(f"Tables found: {data['table_count']}")
|
||||||
|
print(f"Strategy used: {data['strategy']}")
|
||||||
|
|
||||||
|
for table in data['tables']:
|
||||||
|
print("\nTable:")
|
||||||
|
print(f" Headers: {table['headers']}")
|
||||||
|
print(f" Rows: {len(table['rows'])}")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Extract from URL
|
||||||
|
|
||||||
|
```python
|
||||||
|
response = requests.post("http://localhost:11235/tables/extract", json={
|
||||||
|
"url": "https://example.com/data-page",
|
||||||
|
"config": {
|
||||||
|
"strategy": "financial",
|
||||||
|
"preserve_formatting": True
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
data = response.json()
|
||||||
|
for table in data['tables']:
|
||||||
|
print(f"Table with {len(table['rows'])} rows")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Batch Processing
|
||||||
|
|
||||||
|
### `/tables/extract/batch` - Batch Extraction
|
||||||
|
|
||||||
|
Extract tables from multiple HTML contents or URLs in a single request.
|
||||||
|
|
||||||
|
#### Batch from HTML List
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
html_contents = [
|
||||||
|
"<table><tr><th>A</th></tr><tr><td>1</td></tr></table>",
|
||||||
|
"<table><tr><th>B</th></tr><tr><td>2</td></tr></table>",
|
||||||
|
"<table><tr><th>C</th></tr><tr><td>3</td></tr></table>",
|
||||||
|
]
|
||||||
|
|
||||||
|
response = requests.post("http://localhost:11235/tables/extract/batch", json={
|
||||||
|
"html_list": html_contents,
|
||||||
|
"config": {
|
||||||
|
"strategy": "default"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
data = response.json()
|
||||||
|
print(f"Total processed: {data['summary']['total_processed']}")
|
||||||
|
print(f"Successful: {data['summary']['successful']}")
|
||||||
|
print(f"Failed: {data['summary']['failed']}")
|
||||||
|
print(f"Total tables: {data['summary']['total_tables_extracted']}")
|
||||||
|
|
||||||
|
for result in data['results']:
|
||||||
|
if result['success']:
|
||||||
|
print(f" {result['source']}: {result['table_count']} tables")
|
||||||
|
else:
|
||||||
|
print(f" {result['source']}: Error - {result['error']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Batch from URL List
|
||||||
|
|
||||||
|
```python
|
||||||
|
response = requests.post("http://localhost:11235/tables/extract/batch", json={
|
||||||
|
"url_list": [
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://example.com/page2",
|
||||||
|
"https://example.com/page3",
|
||||||
|
],
|
||||||
|
"config": {
|
||||||
|
"strategy": "financial"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
data = response.json()
|
||||||
|
for result in data['results']:
|
||||||
|
print(f"URL: {result['source']}")
|
||||||
|
if result['success']:
|
||||||
|
print(f" ✓ Found {result['table_count']} tables")
|
||||||
|
else:
|
||||||
|
print(f" ✗ Failed: {result['error']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Mixed Batch (HTML + URLs)
|
||||||
|
|
||||||
|
```python
|
||||||
|
response = requests.post("http://localhost:11235/tables/extract/batch", json={
|
||||||
|
"html_list": [
|
||||||
|
"<table><tr><th>Local</th></tr></table>"
|
||||||
|
],
|
||||||
|
"url_list": [
|
||||||
|
"https://example.com/remote"
|
||||||
|
],
|
||||||
|
"config": {
|
||||||
|
"strategy": "default"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|
||||||
|
**Batch Limits:**
|
||||||
|
- Maximum 50 items per batch request
|
||||||
|
- Items are processed independently (partial failures allowed)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Configuration Options
|
||||||
|
|
||||||
|
### TableExtractionConfig
|
||||||
|
|
||||||
|
| Field | Type | Default | Description |
|
||||||
|
|-------|------|---------|-------------|
|
||||||
|
| `strategy` | `"none"` \| `"default"` \| `"llm"` \| `"financial"` | `"default"` | Extraction strategy to use |
|
||||||
|
| `llm_provider` | `string` | `null` | LLM provider (required for `llm` strategy) |
|
||||||
|
| `llm_model` | `string` | `null` | Model name (required for `llm` strategy) |
|
||||||
|
| `llm_api_key` | `string` | `null` | API key (required for `llm` strategy) |
|
||||||
|
| `llm_prompt` | `string` | `null` | Custom extraction prompt |
|
||||||
|
| `preserve_formatting` | `boolean` | `false` | Keep original number/date formatting |
|
||||||
|
| `extract_metadata` | `boolean` | `false` | Include table metadata (id, class, etc.) |
|
||||||
|
|
||||||
|
### Example: Full Configuration
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"strategy": "llm",
|
||||||
|
"llm_provider": "openai",
|
||||||
|
"llm_model": "gpt-4",
|
||||||
|
"llm_api_key": "sk-...",
|
||||||
|
"llm_prompt": "Extract structured product data",
|
||||||
|
"preserve_formatting": true,
|
||||||
|
"extract_metadata": true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Response Format
|
||||||
|
|
||||||
|
### Single Extraction Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"success": true,
|
||||||
|
"table_count": 2,
|
||||||
|
"strategy": "default",
|
||||||
|
"tables": [
|
||||||
|
{
|
||||||
|
"headers": ["Product", "Price", "Stock"],
|
||||||
|
"rows": [
|
||||||
|
["Widget A", "$19.99", "In Stock"],
|
||||||
|
["Widget B", "$29.99", "Out of Stock"]
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "product-table",
|
||||||
|
"class": "data-table",
|
||||||
|
"row_count": 2,
|
||||||
|
"column_count": 3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Batch Extraction Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"success": true,
|
||||||
|
"summary": {
|
||||||
|
"total_processed": 3,
|
||||||
|
"successful": 2,
|
||||||
|
"failed": 1,
|
||||||
|
"total_tables_extracted": 5
|
||||||
|
},
|
||||||
|
"strategy": "default",
|
||||||
|
"results": [
|
||||||
|
{
|
||||||
|
"success": true,
|
||||||
|
"source": "html_0",
|
||||||
|
"table_count": 2,
|
||||||
|
"tables": [...]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"success": true,
|
||||||
|
"source": "https://example.com",
|
||||||
|
"table_count": 3,
|
||||||
|
"tables": [...]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"success": false,
|
||||||
|
"source": "html_2",
|
||||||
|
"error": "Invalid HTML structure"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Integrated Crawl Response
|
||||||
|
|
||||||
|
Tables are included in the standard crawl result:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"success": true,
|
||||||
|
"results": [
|
||||||
|
{
|
||||||
|
"url": "https://example.com",
|
||||||
|
"success": true,
|
||||||
|
"html": "...",
|
||||||
|
"markdown": "...",
|
||||||
|
"tables": [
|
||||||
|
{
|
||||||
|
"headers": [...],
|
||||||
|
"rows": [...]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
### Common Errors
|
||||||
|
|
||||||
|
#### 400 Bad Request
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"detail": "Must provide either 'html' or 'url' for table extraction."
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Cause:** Invalid request parameters
|
||||||
|
|
||||||
|
**Solution:** Ensure you provide exactly one of `html` or `url`
|
||||||
|
|
||||||
|
#### 400 Bad Request (LLM)
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"detail": "Invalid table extraction config: LLM strategy requires llm_provider, llm_model, and llm_api_key"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Cause:** Missing required LLM configuration
|
||||||
|
|
||||||
|
**Solution:** Provide all required LLM fields
|
||||||
|
|
||||||
|
#### 500 Internal Server Error
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"detail": "Failed to fetch and extract from URL: Connection timeout"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Cause:** URL fetch failure or extraction error
|
||||||
|
|
||||||
|
**Solution:** Check URL accessibility and HTML validity
|
||||||
|
|
||||||
|
### Handling Partial Failures in Batch
|
||||||
|
|
||||||
|
```python
|
||||||
|
response = requests.post("http://localhost:11235/tables/extract/batch", json={
|
||||||
|
"url_list": urls,
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
successful_results = [r for r in data['results'] if r['success']]
|
||||||
|
failed_results = [r for r in data['results'] if not r['success']]
|
||||||
|
|
||||||
|
print(f"Successful: {len(successful_results)}")
|
||||||
|
for result in failed_results:
|
||||||
|
print(f"Failed: {result['source']} - {result['error']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
### 1. **Choose the Right Strategy**
|
||||||
|
|
||||||
|
- **Default**: Fast, reliable for most tables
|
||||||
|
- **LLM**: Complex structures, semantic extraction
|
||||||
|
- **Financial**: Numerical data with formatting
|
||||||
|
|
||||||
|
### 2. **Batch Processing**
|
||||||
|
|
||||||
|
- Use batch endpoints for multiple pages
|
||||||
|
- Keep batch size under 50 items
|
||||||
|
- Handle partial failures gracefully
|
||||||
|
|
||||||
|
### 3. **Performance Optimization**
|
||||||
|
|
||||||
|
- Use `default` strategy for high-volume extraction
|
||||||
|
- Enable `preserve_formatting` only when needed
|
||||||
|
- Limit `extract_metadata` to reduce payload size
|
||||||
|
|
||||||
|
### 4. **LLM Strategy Tips**
|
||||||
|
|
||||||
|
- Use specific prompts for better results
|
||||||
|
- GPT-4 for complex tables, GPT-3.5 for simple ones
|
||||||
|
- Cache results to reduce API costs
|
||||||
|
|
||||||
|
### 5. **Error Handling**
|
||||||
|
|
||||||
|
- Always check `success` field
|
||||||
|
- Log errors for debugging
|
||||||
|
- Implement retry logic for transient failures
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Examples by Use Case
|
||||||
|
|
||||||
|
### Financial Data Extraction
|
||||||
|
|
||||||
|
```python
|
||||||
|
response = requests.post("http://localhost:11235/crawl", json={
|
||||||
|
"urls": ["https://finance.site.com/stocks"],
|
||||||
|
"table_extraction": {
|
||||||
|
"strategy": "financial",
|
||||||
|
"preserve_formatting": True,
|
||||||
|
"extract_metadata": True
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
for result in response.json()["results"]:
|
||||||
|
for table in result.get("tables", []):
|
||||||
|
# Financial tables with preserved formatting
|
||||||
|
print(table["rows"])
|
||||||
|
```
|
||||||
|
|
||||||
|
### Product Catalog Scraping
|
||||||
|
|
||||||
|
```python
|
||||||
|
response = requests.post("http://localhost:11235/tables/extract/batch", json={
|
||||||
|
"url_list": [
|
||||||
|
"https://shop.com/category/electronics",
|
||||||
|
"https://shop.com/category/clothing",
|
||||||
|
"https://shop.com/category/books",
|
||||||
|
],
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
all_products = []
|
||||||
|
for result in response.json()["results"]:
|
||||||
|
if result["success"]:
|
||||||
|
for table in result["tables"]:
|
||||||
|
all_products.extend(table["rows"])
|
||||||
|
|
||||||
|
print(f"Total products: {len(all_products)}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Complex Table with LLM
|
||||||
|
|
||||||
|
```python
|
||||||
|
response = requests.post("http://localhost:11235/tables/extract", json={
|
||||||
|
"url": "https://complex-data.com/report",
|
||||||
|
"config": {
|
||||||
|
"strategy": "llm",
|
||||||
|
"llm_provider": "openai",
|
||||||
|
"llm_model": "gpt-4",
|
||||||
|
"llm_api_key": "sk-...",
|
||||||
|
"llm_prompt": "Extract quarterly revenue breakdown by region and product category"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
structured_data = response.json()["tables"]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## API Reference Summary
|
||||||
|
|
||||||
|
| Endpoint | Method | Purpose |
|
||||||
|
|----------|--------|---------|
|
||||||
|
| `/crawl` | POST | Crawl with integrated table extraction |
|
||||||
|
| `/crawl/stream` | POST | Stream crawl with table extraction |
|
||||||
|
| `/tables/extract` | POST | Extract tables from HTML or URL |
|
||||||
|
| `/tables/extract/batch` | POST | Batch extract from multiple sources |
|
||||||
|
|
||||||
|
For complete API documentation, visit: `/docs` (Swagger UI)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
For issues, feature requests, or questions:
|
||||||
|
- GitHub: https://github.com/unclecode/crawl4ai
|
||||||
|
- Documentation: https://crawl4ai.com/docs
|
||||||
|
- Discord: https://discord.gg/crawl4ai
|
||||||
458
tests/docker/test_table_extraction.py
Normal file
458
tests/docker/test_table_extraction.py
Normal file
@@ -0,0 +1,458 @@
|
|||||||
|
"""
|
||||||
|
Integration tests for Table Extraction functionality in Crawl4AI Docker Server
|
||||||
|
|
||||||
|
Tests cover:
|
||||||
|
1. Integrated table extraction during crawls
|
||||||
|
2. Dedicated /tables endpoints
|
||||||
|
3. All extraction strategies (default, LLM, financial)
|
||||||
|
4. Batch processing
|
||||||
|
5. Error handling
|
||||||
|
|
||||||
|
Note: These tests require the Docker server to be running on localhost:11235
|
||||||
|
Run: python deploy/docker/server.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
|
||||||
|
# Base URL for the Docker API server
|
||||||
|
BASE_URL = "http://localhost:11234"
|
||||||
|
|
||||||
|
# Sample HTML with tables for testing
|
||||||
|
SAMPLE_HTML_WITH_TABLES = """
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head><title>Test Page with Tables</title></head>
|
||||||
|
<body>
|
||||||
|
<h1>Financial Data</h1>
|
||||||
|
|
||||||
|
<!-- Simple table -->
|
||||||
|
<table id="simple">
|
||||||
|
<tr><th>Name</th><th>Age</th></tr>
|
||||||
|
<tr><td>Alice</td><td>25</td></tr>
|
||||||
|
<tr><td>Bob</td><td>30</td></tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<!-- Financial table -->
|
||||||
|
<table id="financial">
|
||||||
|
<thead>
|
||||||
|
<tr><th>Quarter</th><th>Revenue</th><th>Expenses</th><th>Profit</th></tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>Q1 2024</td><td>$1,250,000.00</td><td>$850,000.00</td><td>$400,000.00</td></tr>
|
||||||
|
<tr><td>Q2 2024</td><td>$1,500,000.00</td><td>$900,000.00</td><td>$600,000.00</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<!-- Complex nested table -->
|
||||||
|
<table id="complex">
|
||||||
|
<tr>
|
||||||
|
<th rowspan="2">Product</th>
|
||||||
|
<th colspan="2">Sales</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Units</th>
|
||||||
|
<th>Revenue</th>
|
||||||
|
</tr>
|
||||||
|
<tr><td>Widget A</td><td>100</td><td>$5,000</td></tr>
|
||||||
|
<tr><td>Widget B</td><td>200</td><td>$10,000</td></tr>
|
||||||
|
</table>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def server_url():
|
||||||
|
"""Return the server URL"""
|
||||||
|
return BASE_URL
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def wait_for_server():
|
||||||
|
"""Wait for server to be ready"""
|
||||||
|
max_retries = 5
|
||||||
|
for i in range(max_retries):
|
||||||
|
try:
|
||||||
|
response = requests.get(f"{BASE_URL}/health", timeout=2)
|
||||||
|
if response.status_code == 200:
|
||||||
|
return True
|
||||||
|
except requests.exceptions.RequestException:
|
||||||
|
if i < max_retries - 1:
|
||||||
|
time.sleep(1)
|
||||||
|
pytest.skip("Server not running on localhost:11235. Start with: python deploy/docker/server.py")
|
||||||
|
|
||||||
|
|
||||||
|
class TestIntegratedTableExtraction:
|
||||||
|
"""Test table extraction integrated with /crawl endpoint"""
|
||||||
|
|
||||||
|
def test_crawl_with_default_table_extraction(self, server_url, wait_for_server):
|
||||||
|
"""Test crawling with default table extraction strategy"""
|
||||||
|
response = requests.post(f"{server_url}/crawl", json={
|
||||||
|
"urls": ["https://example.com/tables"],
|
||||||
|
"browser_config": {"headless": True},
|
||||||
|
"crawler_config": {},
|
||||||
|
"table_extraction": {
|
||||||
|
"strategy": "default"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
assert data["success"] is True
|
||||||
|
assert "results" in data
|
||||||
|
|
||||||
|
# Check first result has tables
|
||||||
|
if data["results"]:
|
||||||
|
result = data["results"][0]
|
||||||
|
assert "tables" in result or result.get("success") is False
|
||||||
|
|
||||||
|
def test_crawl_with_llm_table_extraction(self, server_url, wait_for_server):
|
||||||
|
"""Test crawling with LLM table extraction strategy"""
|
||||||
|
response = requests.post(f"{server_url}/crawl", json={
|
||||||
|
"urls": ["https://example.com/financial"],
|
||||||
|
"browser_config": {"headless": True},
|
||||||
|
"crawler_config": {},
|
||||||
|
"table_extraction": {
|
||||||
|
"strategy": "llm",
|
||||||
|
"llm_provider": "openai",
|
||||||
|
"llm_model": "gpt-4",
|
||||||
|
"llm_api_key": "test-key",
|
||||||
|
"llm_prompt": "Extract financial data from tables"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
# Should fail without valid API key, but structure should be correct
|
||||||
|
# In real scenario with valid key, this would succeed
|
||||||
|
assert response.status_code in [200, 500] # May fail on auth
|
||||||
|
|
||||||
|
def test_crawl_with_financial_table_extraction(self, server_url, wait_for_server):
|
||||||
|
"""Test crawling with financial table extraction strategy"""
|
||||||
|
response = requests.post(f"{server_url}/crawl", json={
|
||||||
|
"urls": ["https://example.com/stocks"],
|
||||||
|
"browser_config": {"headless": True},
|
||||||
|
"crawler_config": {},
|
||||||
|
"table_extraction": {
|
||||||
|
"strategy": "financial",
|
||||||
|
"preserve_formatting": True,
|
||||||
|
"extract_metadata": True
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
assert data["success"] is True
|
||||||
|
|
||||||
|
def test_crawl_without_table_extraction(self, server_url, wait_for_server):
|
||||||
|
"""Test crawling without table extraction (should work normally)"""
|
||||||
|
response = requests.post(f"{server_url}/crawl", json={
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"browser_config": {"headless": True},
|
||||||
|
"crawler_config": {}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
assert data["success"] is True
|
||||||
|
|
||||||
|
|
||||||
|
class TestDedicatedTableEndpoints:
|
||||||
|
"""Test dedicated /tables endpoints"""
|
||||||
|
|
||||||
|
def test_extract_tables_from_html(self, server_url, wait_for_server):
|
||||||
|
"""Test extracting tables from provided HTML"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": SAMPLE_HTML_WITH_TABLES,
|
||||||
|
"config": {
|
||||||
|
"strategy": "default"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
assert data["success"] is True
|
||||||
|
assert data["table_count"] >= 3 # Should find at least 3 tables
|
||||||
|
assert "tables" in data
|
||||||
|
assert data["strategy"] == "default"
|
||||||
|
|
||||||
|
# Verify table structure
|
||||||
|
if data["tables"]:
|
||||||
|
table = data["tables"][0]
|
||||||
|
assert "headers" in table or "rows" in table
|
||||||
|
|
||||||
|
def test_extract_tables_from_url(self, server_url, wait_for_server):
|
||||||
|
"""Test extracting tables by fetching URL"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"url": "https://example.com/tables",
|
||||||
|
"config": {
|
||||||
|
"strategy": "default"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
# May fail if URL doesn't exist, but structure should be correct
|
||||||
|
assert response.status_code in [200, 500]
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
data = response.json()
|
||||||
|
assert "success" in data
|
||||||
|
assert "tables" in data
|
||||||
|
|
||||||
|
def test_extract_tables_invalid_input(self, server_url, wait_for_server):
|
||||||
|
"""Test error handling for invalid input"""
|
||||||
|
# No html or url provided
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 400
|
||||||
|
assert "html" in response.text.lower() or "url" in response.text.lower()
|
||||||
|
|
||||||
|
def test_extract_tables_both_html_and_url(self, server_url, wait_for_server):
|
||||||
|
"""Test error when both html and url are provided"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": "<table></table>",
|
||||||
|
"url": "https://example.com",
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 400
|
||||||
|
assert "both" in response.text.lower()
|
||||||
|
|
||||||
|
|
||||||
|
class TestBatchTableExtraction:
|
||||||
|
"""Test batch table extraction endpoints"""
|
||||||
|
|
||||||
|
def test_batch_extract_html_list(self, server_url, wait_for_server):
|
||||||
|
"""Test batch extraction from multiple HTML contents"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract/batch", json={
|
||||||
|
"html_list": [
|
||||||
|
SAMPLE_HTML_WITH_TABLES,
|
||||||
|
"<table><tr><th>A</th></tr><tr><td>1</td></tr></table>",
|
||||||
|
],
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
assert data["success"] is True
|
||||||
|
assert "summary" in data
|
||||||
|
assert data["summary"]["total_processed"] == 2
|
||||||
|
assert data["summary"]["successful"] >= 0
|
||||||
|
assert "results" in data
|
||||||
|
assert len(data["results"]) == 2
|
||||||
|
|
||||||
|
def test_batch_extract_url_list(self, server_url, wait_for_server):
|
||||||
|
"""Test batch extraction from multiple URLs"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract/batch", json={
|
||||||
|
"url_list": [
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://example.com/page2",
|
||||||
|
],
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
# May have mixed success/failure depending on URLs
|
||||||
|
assert response.status_code in [200, 500]
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
data = response.json()
|
||||||
|
assert "summary" in data
|
||||||
|
assert "results" in data
|
||||||
|
|
||||||
|
def test_batch_extract_mixed(self, server_url, wait_for_server):
|
||||||
|
"""Test batch extraction from both HTML and URLs"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract/batch", json={
|
||||||
|
"html_list": [SAMPLE_HTML_WITH_TABLES],
|
||||||
|
"url_list": ["https://example.com/tables"],
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
# May fail on URL crawling but should handle mixed input
|
||||||
|
assert response.status_code in [200, 500]
|
||||||
|
if response.status_code == 200:
|
||||||
|
data = response.json()
|
||||||
|
assert data["success"] is True
|
||||||
|
assert data["summary"]["total_processed"] == 2
|
||||||
|
|
||||||
|
def test_batch_extract_empty_list(self, server_url, wait_for_server):
|
||||||
|
"""Test error when no items provided for batch"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract/batch", json={
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 400
|
||||||
|
|
||||||
|
def test_batch_extract_exceeds_limit(self, server_url, wait_for_server):
|
||||||
|
"""Test error when batch size exceeds limit"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract/batch", json={
|
||||||
|
"html_list": ["<table></table>"] * 100, # 100 items (limit is 50)
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 400
|
||||||
|
assert "50" in response.text or "limit" in response.text.lower()
|
||||||
|
|
||||||
|
|
||||||
|
class TestTableExtractionStrategies:
|
||||||
|
"""Test different table extraction strategies"""
|
||||||
|
|
||||||
|
def test_default_strategy(self, server_url, wait_for_server):
|
||||||
|
"""Test default (regex-based) extraction strategy"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": SAMPLE_HTML_WITH_TABLES,
|
||||||
|
"config": {
|
||||||
|
"strategy": "default"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
assert data["strategy"] == "default"
|
||||||
|
assert data["table_count"] >= 1
|
||||||
|
|
||||||
|
def test_llm_strategy_without_config(self, server_url, wait_for_server):
|
||||||
|
"""Test LLM strategy without proper config (should use defaults or work)"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": SAMPLE_HTML_WITH_TABLES,
|
||||||
|
"config": {
|
||||||
|
"strategy": "llm"
|
||||||
|
# Missing required LLM config
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
# May succeed with defaults or fail - both are acceptable
|
||||||
|
assert response.status_code in [200, 400, 500]
|
||||||
|
|
||||||
|
def test_financial_strategy(self, server_url, wait_for_server):
|
||||||
|
"""Test financial extraction strategy"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": SAMPLE_HTML_WITH_TABLES,
|
||||||
|
"config": {
|
||||||
|
"strategy": "financial",
|
||||||
|
"preserve_formatting": True,
|
||||||
|
"extract_metadata": True
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
assert data["strategy"] == "financial"
|
||||||
|
|
||||||
|
# Financial tables should be extracted
|
||||||
|
if data["tables"]:
|
||||||
|
# Should find the financial table in our sample HTML
|
||||||
|
assert data["table_count"] >= 1
|
||||||
|
|
||||||
|
def test_none_strategy(self, server_url, wait_for_server):
|
||||||
|
"""Test with 'none' strategy (no extraction)"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": SAMPLE_HTML_WITH_TABLES,
|
||||||
|
"config": {
|
||||||
|
"strategy": "none"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
# Should return 0 tables
|
||||||
|
assert data["table_count"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestTableExtractionConfig:
|
||||||
|
"""Test table extraction configuration options"""
|
||||||
|
|
||||||
|
def test_preserve_formatting_option(self, server_url, wait_for_server):
|
||||||
|
"""Test preserve_formatting option"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": SAMPLE_HTML_WITH_TABLES,
|
||||||
|
"config": {
|
||||||
|
"strategy": "financial",
|
||||||
|
"preserve_formatting": True
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
|
||||||
|
def test_extract_metadata_option(self, server_url, wait_for_server):
|
||||||
|
"""Test extract_metadata option"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": SAMPLE_HTML_WITH_TABLES,
|
||||||
|
"config": {
|
||||||
|
"strategy": "financial",
|
||||||
|
"extract_metadata": True
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
# Check if tables have metadata when requested
|
||||||
|
if data["tables"]:
|
||||||
|
table = data["tables"][0]
|
||||||
|
assert isinstance(table, dict)
|
||||||
|
|
||||||
|
|
||||||
|
class TestErrorHandling:
|
||||||
|
"""Test error handling for table extraction"""
|
||||||
|
|
||||||
|
def test_malformed_html(self, server_url, wait_for_server):
|
||||||
|
"""Test handling of malformed HTML"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": "<table><tr><td>incomplete",
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
# Should handle gracefully (either return empty or partial results)
|
||||||
|
assert response.status_code in [200, 400, 500]
|
||||||
|
|
||||||
|
def test_empty_html(self, server_url, wait_for_server):
|
||||||
|
"""Test handling of empty HTML"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": "",
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
# May be rejected as invalid or processed as empty
|
||||||
|
assert response.status_code in [200, 400]
|
||||||
|
if response.status_code == 200:
|
||||||
|
data = response.json()
|
||||||
|
assert data["table_count"] == 0
|
||||||
|
|
||||||
|
def test_html_without_tables(self, server_url, wait_for_server):
|
||||||
|
"""Test HTML with no tables"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": "<html><body><p>No tables here</p></body></html>",
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
assert data["table_count"] == 0
|
||||||
|
|
||||||
|
def test_invalid_strategy(self, server_url, wait_for_server):
|
||||||
|
"""Test invalid strategy name"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": SAMPLE_HTML_WITH_TABLES,
|
||||||
|
"config": {"strategy": "invalid_strategy"}
|
||||||
|
})
|
||||||
|
|
||||||
|
# Should return validation error (400 or 422 from Pydantic)
|
||||||
|
assert response.status_code in [400, 422]
|
||||||
|
|
||||||
|
def test_missing_config(self, server_url, wait_for_server):
|
||||||
|
"""Test missing configuration"""
|
||||||
|
response = requests.post(f"{server_url}/tables/extract", json={
|
||||||
|
"html": SAMPLE_HTML_WITH_TABLES
|
||||||
|
# Missing config
|
||||||
|
})
|
||||||
|
|
||||||
|
# Should use default config or return error
|
||||||
|
assert response.status_code in [200, 400]
|
||||||
|
|
||||||
|
|
||||||
|
# Run tests
|
||||||
|
if __name__ == "__main__":
|
||||||
|
pytest.main([__file__, "-v"])
|
||||||
225
tests/docker/test_table_extraction_quick.py
Normal file
225
tests/docker/test_table_extraction_quick.py
Normal file
@@ -0,0 +1,225 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Quick test script for Table Extraction feature
|
||||||
|
Tests the /tables/extract endpoint with sample HTML
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
1. Start the server: python deploy/docker/server.py
|
||||||
|
2. Run this script: python tests/docker/test_table_extraction_quick.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Sample HTML with tables
|
||||||
|
SAMPLE_HTML = """
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<h1>Test Tables</h1>
|
||||||
|
|
||||||
|
<table id="simple">
|
||||||
|
<tr><th>Name</th><th>Age</th><th>City</th></tr>
|
||||||
|
<tr><td>Alice</td><td>25</td><td>New York</td></tr>
|
||||||
|
<tr><td>Bob</td><td>30</td><td>San Francisco</td></tr>
|
||||||
|
<tr><td>Charlie</td><td>35</td><td>Los Angeles</td></tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<table id="financial">
|
||||||
|
<thead>
|
||||||
|
<tr><th>Quarter</th><th>Revenue</th><th>Profit</th></tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>Q1 2024</td><td>$1,250,000.00</td><td>$400,000.00</td></tr>
|
||||||
|
<tr><td>Q2 2024</td><td>$1,500,000.00</td><td>$600,000.00</td></tr>
|
||||||
|
<tr><td>Q3 2024</td><td>$1,750,000.00</td><td>$700,000.00</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
BASE_URL = "http://localhost:11234"
|
||||||
|
|
||||||
|
|
||||||
|
def test_server_health():
|
||||||
|
"""Check if server is running"""
|
||||||
|
try:
|
||||||
|
response = requests.get(f"{BASE_URL}/health", timeout=2)
|
||||||
|
if response.status_code == 200:
|
||||||
|
print("✅ Server is running")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"❌ Server health check failed: {response.status_code}")
|
||||||
|
return False
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
print(f"❌ Server not reachable: {e}")
|
||||||
|
print("\n💡 Start the server with: python deploy/docker/server.py")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_strategy():
|
||||||
|
"""Test default table extraction strategy"""
|
||||||
|
print("\n📊 Testing DEFAULT strategy...")
|
||||||
|
|
||||||
|
response = requests.post(f"{BASE_URL}/tables/extract", json={
|
||||||
|
"html": SAMPLE_HTML,
|
||||||
|
"config": {
|
||||||
|
"strategy": "default"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
data = response.json()
|
||||||
|
print(f"✅ Default strategy works!")
|
||||||
|
print(f" - Table count: {data['table_count']}")
|
||||||
|
print(f" - Strategy: {data['strategy']}")
|
||||||
|
|
||||||
|
if data['tables']:
|
||||||
|
for idx, table in enumerate(data['tables']):
|
||||||
|
print(f" - Table {idx + 1}: {len(table.get('rows', []))} rows")
|
||||||
|
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"❌ Failed: {response.status_code}")
|
||||||
|
print(f" Error: {response.text}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def test_financial_strategy():
|
||||||
|
"""Test financial table extraction strategy"""
|
||||||
|
print("\n💰 Testing FINANCIAL strategy...")
|
||||||
|
|
||||||
|
response = requests.post(f"{BASE_URL}/tables/extract", json={
|
||||||
|
"html": SAMPLE_HTML,
|
||||||
|
"config": {
|
||||||
|
"strategy": "financial",
|
||||||
|
"preserve_formatting": True,
|
||||||
|
"extract_metadata": True
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
data = response.json()
|
||||||
|
print(f"✅ Financial strategy works!")
|
||||||
|
print(f" - Table count: {data['table_count']}")
|
||||||
|
print(f" - Strategy: {data['strategy']}")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"❌ Failed: {response.status_code}")
|
||||||
|
print(f" Error: {response.text}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def test_none_strategy():
|
||||||
|
"""Test none strategy (no extraction)"""
|
||||||
|
print("\n🚫 Testing NONE strategy...")
|
||||||
|
|
||||||
|
response = requests.post(f"{BASE_URL}/tables/extract", json={
|
||||||
|
"html": SAMPLE_HTML,
|
||||||
|
"config": {
|
||||||
|
"strategy": "none"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
data = response.json()
|
||||||
|
if data['table_count'] == 0:
|
||||||
|
print(f"✅ None strategy works (correctly extracted 0 tables)")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"❌ None strategy returned {data['table_count']} tables (expected 0)")
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
print(f"❌ Failed: {response.status_code}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def test_batch_extraction():
|
||||||
|
"""Test batch extraction"""
|
||||||
|
print("\n📦 Testing BATCH extraction...")
|
||||||
|
|
||||||
|
response = requests.post(f"{BASE_URL}/tables/extract/batch", json={
|
||||||
|
"html_list": [
|
||||||
|
SAMPLE_HTML,
|
||||||
|
"<table><tr><th>Col1</th></tr><tr><td>Val1</td></tr></table>"
|
||||||
|
],
|
||||||
|
"config": {
|
||||||
|
"strategy": "default"
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
data = response.json()
|
||||||
|
print(f"✅ Batch extraction works!")
|
||||||
|
print(f" - Total processed: {data['summary']['total_processed']}")
|
||||||
|
print(f" - Successful: {data['summary']['successful']}")
|
||||||
|
print(f" - Total tables: {data['summary']['total_tables_extracted']}")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"❌ Failed: {response.status_code}")
|
||||||
|
print(f" Error: {response.text}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def test_error_handling():
|
||||||
|
"""Test error handling"""
|
||||||
|
print("\n⚠️ Testing ERROR handling...")
|
||||||
|
|
||||||
|
# Test with both html and url (should fail)
|
||||||
|
response = requests.post(f"{BASE_URL}/tables/extract", json={
|
||||||
|
"html": "<table></table>",
|
||||||
|
"url": "https://example.com",
|
||||||
|
"config": {"strategy": "default"}
|
||||||
|
})
|
||||||
|
|
||||||
|
if response.status_code == 400:
|
||||||
|
print(f"✅ Error handling works (correctly rejected invalid input)")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"❌ Expected 400 error, got: {response.status_code}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
print("=" * 60)
|
||||||
|
print("Table Extraction Feature - Quick Test")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Check server
|
||||||
|
if not test_server_health():
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Run tests
|
||||||
|
results = []
|
||||||
|
results.append(("Default Strategy", test_default_strategy()))
|
||||||
|
results.append(("Financial Strategy", test_financial_strategy()))
|
||||||
|
results.append(("None Strategy", test_none_strategy()))
|
||||||
|
results.append(("Batch Extraction", test_batch_extraction()))
|
||||||
|
results.append(("Error Handling", test_error_handling()))
|
||||||
|
|
||||||
|
# Summary
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("Test Summary")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
passed = sum(1 for _, result in results if result)
|
||||||
|
total = len(results)
|
||||||
|
|
||||||
|
for name, result in results:
|
||||||
|
status = "✅ PASS" if result else "❌ FAIL"
|
||||||
|
print(f"{status}: {name}")
|
||||||
|
|
||||||
|
print(f"\nTotal: {passed}/{total} tests passed")
|
||||||
|
|
||||||
|
if passed == total:
|
||||||
|
print("\n🎉 All tests passed! Table extraction is working correctly!")
|
||||||
|
sys.exit(0)
|
||||||
|
else:
|
||||||
|
print(f"\n⚠️ {total - passed} test(s) failed")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user