feat: Add table extraction strategies and API documentation

- Implemented table extraction strategies in utils.py: default, LLM, financial, and none.
- Created new API documentation for table extraction endpoints and strategies.
- Added integration tests for table extraction functionality covering various strategies and error handling.
- Developed a quick-test script for rapid validation of table extraction features.
This commit is contained in:
AHMET YILMAZ
2025-10-17 12:30:37 +08:00
parent 3877335d89
commit 00e9904609
8 changed files with 1979 additions and 3 deletions

View File

@@ -731,6 +731,7 @@ async def handle_crawl_request(
proxies: Optional[List[Dict[str, Any]]] = None,
proxy_failure_threshold: int = 3,
proxy_recovery_time: int = 300,
table_extraction: Optional[dict] = None,
dispatcher = None,
) -> dict:
"""Handle non-streaming crawl requests with optional hooks."""
@@ -768,6 +769,19 @@ async def handle_crawl_request(
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
# Configure table extraction strategy if specified
if table_extraction:
try:
from schemas import TableExtractionConfig
from utils import create_table_extraction_strategy
table_config = TableExtractionConfig(**table_extraction)
table_strategy = create_table_extraction_strategy(table_config)
crawler_config.table_extraction_strategy = table_strategy
except Exception as e:
logger.error(f"Error creating table extraction strategy: {e}")
raise HTTPException(status_code=400, detail=f"Invalid table extraction config: {str(e)}")
# Configure browser adapter based on anti_bot_strategy
browser_adapter = _get_browser_adapter(anti_bot_strategy, browser_config)
@@ -974,6 +988,7 @@ async def handle_stream_crawl_request(
proxies: Optional[List[Dict[str, Any]]] = None,
proxy_failure_threshold: int = 3,
proxy_recovery_time: int = 300,
table_extraction: Optional[dict] = None,
dispatcher = None,
) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
"""Handle streaming crawl requests with optional hooks."""
@@ -1003,6 +1018,19 @@ async def handle_stream_crawl_request(
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
# Configure table extraction strategy if specified
if table_extraction:
try:
from schemas import TableExtractionConfig
from utils import create_table_extraction_strategy
table_config = TableExtractionConfig(**table_extraction)
table_strategy = create_table_extraction_strategy(table_config)
crawler_config.table_extraction_strategy = table_strategy
except Exception as e:
logger.error(f"Error creating table extraction strategy: {e}")
raise HTTPException(status_code=400, detail=f"Invalid table extraction config: {str(e)}")
# Configure browser adapter based on anti_bot_strategy
browser_adapter = _get_browser_adapter(anti_bot_strategy, browser_config)