feat: Add table extraction strategies and API documentation

- Implemented table extraction strategies (default, LLM, financial, and none) in utils.py.
- Created new API documentation for table extraction endpoints and strategies.
- Added integration tests for table extraction functionality covering various strategies and error handling.
- Developed a quick test script for rapid validation of table extraction features.
This commit is contained in:
AHMET YILMAZ
2025-10-17 12:30:37 +08:00
parent 3877335d89
commit 00e9904609
8 changed files with 1979 additions and 3 deletions

View File

@@ -87,7 +87,7 @@ from prometheus_fastapi_instrumentator import Instrumentator
from pydantic import BaseModel, Field
from rank_bm25 import BM25Okapi
from redis import asyncio as aioredis
from routers import adaptive, dispatchers, scripts, monitoring
from routers import adaptive, dispatchers, scripts, monitoring, tables
from schemas import (
CrawlRequest,
CrawlRequestWithHooks,
@@ -298,6 +298,7 @@ app.include_router(adaptive.router)
app.include_router(dispatchers.router)
app.include_router(scripts.router)
app.include_router(monitoring.router)
app.include_router(tables.router)
# ──────────────────────── Endpoints ──────────────────────────
@@ -1578,6 +1579,7 @@ async def crawl(
proxies=crawl_request.proxies,
proxy_failure_threshold=crawl_request.proxy_failure_threshold,
proxy_recovery_time=crawl_request.proxy_recovery_time,
table_extraction=crawl_request.table_extraction.model_dump() if crawl_request.table_extraction else None,
dispatcher=dispatcher,
)
# check if all of the results are not successful
@@ -1729,6 +1731,7 @@ async def stream_process(crawl_request: CrawlRequestWithHooks):
proxies=crawl_request.proxies,
proxy_failure_threshold=crawl_request.proxy_failure_threshold,
proxy_recovery_time=crawl_request.proxy_recovery_time,
table_extraction=crawl_request.table_extraction.model_dump() if crawl_request.table_extraction else None,
dispatcher=dispatcher,
)