chore: Update Dockerfile to install chromium-chromedriver and spacy library

This commit is contained in:
Unclecode
2024-05-18 09:16:52 +00:00
parent 3846648c12
commit bf00c26a83
5 changed files with 67 additions and 13 deletions

View File

@@ -2,6 +2,8 @@ import os
import importlib
import asyncio
from functools import lru_cache
import logging
logging.basicConfig(level=logging.DEBUG)
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import HTMLResponse, JSONResponse
@@ -77,7 +79,7 @@ async def get_total_url_count():
# Add endpoit to clear db
@app.get("/clear-db")
async def clear_database():
clear_db()
# clear_db()
return JSONResponse(content={"message": "Database cleared."})
def import_strategy(module_name: str, class_name: str, *args, **kwargs):
@@ -86,12 +88,15 @@ def import_strategy(module_name: str, class_name: str, *args, **kwargs):
strategy_class = getattr(module, class_name)
return strategy_class(*args, **kwargs)
except ImportError:
print("ImportError: Module not found.")
raise HTTPException(status_code=400, detail=f"Module {module_name} not found.")
except AttributeError:
print("AttributeError: Class not found.")
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
@app.post("/crawl")
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
global current_requests
async with lock:
if current_requests >= MAX_CONCURRENT_REQUESTS:
@@ -99,10 +104,12 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
current_requests += 1
try:
logging.debug("[LOG] Loading extraction and chunking strategies...")
extraction_strategy = import_strategy("crawl4ai.extraction_strategy", crawl_request.extraction_strategy, **crawl_request.extraction_strategy_args)
chunking_strategy = import_strategy("crawl4ai.chunking_strategy", crawl_request.chunking_strategy, **crawl_request.chunking_strategy_args)
# Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
logging.debug("[LOG] Running the WebCrawler...")
with ThreadPoolExecutor() as executor:
loop = asyncio.get_event_loop()
futures = [