- Debug
- Refactor code for new version
Author: unclecode
Date: 2024-05-16 17:31:44 +08:00
parent f6e59157bf
commit 5b80be956d
23 changed files with 3116 additions and 1019 deletions

main.py

@@ -7,6 +7,7 @@ from fastapi import FastAPI, HTTPException, Request
 from fastapi.responses import HTMLResponse, JSONResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.templating import Jinja2Templates
 from pydantic import BaseModel, HttpUrl
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -35,7 +36,7 @@ app.add_middleware(
 # Mount the pages directory as a static directory
 app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages")
+templates = Jinja2Templates(directory=__location__ + "/pages")
 # chromedriver_autoinstaller.install() # Ensure chromedriver is installed

 @lru_cache()
 def get_crawler():
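A note on the surrounding context: get_crawler() is wrapped in @lru_cache(), which on a zero-argument function behaves as a lazy singleton, so the crawler is constructed once and reused across requests. A minimal, self-contained sketch of that pattern (the resource name below is illustrative, not from this commit):

    from functools import lru_cache

    @lru_cache()
    def get_resource():
        # The body runs only on the first call; every later call
        # returns the same cached object (a lazy singleton).
        print("initializing...")
        return object()

    a = get_resource()  # prints "initializing..."
    b = get_resource()  # cache hit: no print, same object
    assert a is b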
@@ -51,16 +52,24 @@ class CrawlRequest(BaseModel):
     extract_blocks: bool = True
     word_count_threshold: Optional[int] = 5
     extraction_strategy: Optional[str] = "CosineStrategy"
+    extraction_strategy_args: Optional[dict] = {}
     chunking_strategy: Optional[str] = "RegexChunking"
+    chunking_strategy_args: Optional[dict] = {}
     css_selector: Optional[str] = None
     verbose: Optional[bool] = True

 @app.get("/", response_class=HTMLResponse)
-async def read_index():
-    with open(f"{__location__}/pages/index.html", "r") as file:
-        html_content = file.read()
-    return HTMLResponse(content=html_content, status_code=200)
+async def read_index(request: Request):
+    partials_dir = os.path.join(__location__, "pages", "partial")
+    partials = {}
+    for filename in os.listdir(partials_dir):
+        if filename.endswith(".html"):
+            with open(os.path.join(partials_dir, filename), "r") as file:
+                partials[filename[:-5]] = file.read()
+    return templates.TemplateResponse("index.html", {"request": request, **partials})

 @app.get("/total-count")
 async def get_total_url_count():
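The rewritten read_index replaces the static index.html response with a Jinja2 TemplateResponse: every HTML file under pages/partial is read into a dict keyed by its filename minus the ".html" suffix (filename[:-5] drops those five characters), and the dict is merged into the template context, so each partial becomes addressable as a template variable. A standalone sketch of that mechanism, with hypothetical partial names (header, content) that are not taken from this repository:

    from jinja2 import DictLoader, Environment

    # A bare jinja2 Environment does not autoescape by default,
    # so the partials' raw HTML renders as-is.
    env = Environment(loader=DictLoader({
        "index.html": "<html>{{ header }}<body>{{ content }}</body></html>",
    }))

    # "header.html"[:-5] -> "header", mirroring the loop in read_index
    partials = {
        "header.html"[:-5]: "<h1>Crawl4AI</h1>",
        "content.html"[:-5]: "<p>Hello</p>",
    }

    print(env.get_template("index.html").render(**partials))
    # -> <html><h1>Crawl4AI</h1><body><p>Hello</p></body></html>

One caveat: Starlette's Jinja2Templates configures its own Environment, which enables autoescaping by default, so the real index.html may need the safe filter to emit raw partial HTML; the bare-Environment sketch above sidesteps that detail.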
@@ -73,11 +82,11 @@ async def clear_database():
     clear_db()
     return JSONResponse(content={"message": "Database cleared."})

-def import_strategy(module_name: str, class_name: str):
+def import_strategy(module_name: str, class_name: str, *args, **kwargs):
     try:
         module = importlib.import_module(module_name)
         strategy_class = getattr(module, class_name)
-        return strategy_class()
+        return strategy_class(*args, **kwargs)
     except ImportError:
         raise HTTPException(status_code=400, detail=f"Module {module_name} not found.")
     except AttributeError:
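This change forwards arbitrary constructor arguments to the dynamically imported class, which is what lets the new *_args request fields parameterize a strategy at instantiation time. The same pattern, sketched against the standard library so it runs in isolation (crawl4ai's strategy classes are not reproduced here):

    import importlib

    def import_class(module_name: str, class_name: str, *args, **kwargs):
        # Resolve the class by name, then instantiate it with the
        # caller-supplied positional and keyword arguments.
        module = importlib.import_module(module_name)
        cls = getattr(module, class_name)
        return cls(*args, **kwargs)

    counter = import_class("collections", "Counter", spam=3)
    print(counter)  # Counter({'spam': 3})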
@@ -95,8 +104,8 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
         current_requests += 1
     try:
-        extraction_strategy = import_strategy("crawl4ai.extraction_strategy", crawl_request.extraction_strategy)
-        chunking_strategy = import_strategy("crawl4ai.chunking_strategy", crawl_request.chunking_strategy)
+        extraction_strategy = import_strategy("crawl4ai.extraction_strategy", crawl_request.extraction_strategy, **crawl_request.extraction_strategy_args)
+        chunking_strategy = import_strategy("crawl4ai.chunking_strategy", crawl_request.chunking_strategy, **crawl_request.chunking_strategy_args)
         # Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
         with ThreadPoolExecutor() as executor:
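Taken together, the new model fields and the **kwargs forwarding let an API client pass constructor arguments straight through to the chosen strategy. A sketch of such a request, with assumptions flagged: the /crawl path, the urls field, and the semantic_filter strategy argument are illustrative and not verified against this commit:

    import requests  # hypothetical client-side call

    payload = {
        "urls": ["https://example.com"],          # assumed field name
        "extraction_strategy": "CosineStrategy",
        # Forwarded verbatim as **kwargs into CosineStrategy(...); which
        # keys are accepted depends on that class's __init__ signature.
        "extraction_strategy_args": {"semantic_filter": "technology"},
        "chunking_strategy": "RegexChunking",
        "chunking_strategy_args": {},
    }
    resp = requests.post("http://localhost:8000/crawl", json=payload)  # assumed endpoint
    print(resp.json())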