remove crawl endpoints

Unclecode committed 2024-12-12 12:24:13 +00:00
parent d7200138a0, commit 3fd777dd6f

main.py — 158 changed lines

@@ -380,97 +380,97 @@ def read_root():
     return {"message": "Crawl4AI API service is running"}

-@app.post("/crawl", dependencies=[Depends(verify_token)])
-async def crawl(request: CrawlRequest) -> Dict[str, str]:
-    task_id = await crawler_service.submit_task(request)
-    return {"task_id": task_id}
+# @app.post("/crawl", dependencies=[Depends(verify_token)])
+# async def crawl(request: CrawlRequest) -> Dict[str, str]:
+#     task_id = await crawler_service.submit_task(request)
+#     return {"task_id": task_id}

-@app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
-async def get_task_status(task_id: str):
-    task_info = crawler_service.task_manager.get_task(task_id)
-    if not task_info:
-        raise HTTPException(status_code=404, detail="Task not found")
-    response = {
-        "status": task_info.status,
-        "created_at": task_info.created_at,
-    }
-    if task_info.status == TaskStatus.COMPLETED:
-        # Convert CrawlResult to dict for JSON response
-        if isinstance(task_info.result, list):
-            response["results"] = [result.dict() for result in task_info.result]
-        else:
-            response["result"] = task_info.result.dict()
-    elif task_info.status == TaskStatus.FAILED:
-        response["error"] = task_info.error
-    return response
+# @app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
+# async def get_task_status(task_id: str):
+#     task_info = crawler_service.task_manager.get_task(task_id)
+#     if not task_info:
+#         raise HTTPException(status_code=404, detail="Task not found")
+#     response = {
+#         "status": task_info.status,
+#         "created_at": task_info.created_at,
+#     }
+#     if task_info.status == TaskStatus.COMPLETED:
+#         # Convert CrawlResult to dict for JSON response
+#         if isinstance(task_info.result, list):
+#             response["results"] = [result.dict() for result in task_info.result]
+#         else:
+#             response["result"] = task_info.result.dict()
+#     elif task_info.status == TaskStatus.FAILED:
+#         response["error"] = task_info.error
+#     return response

-@app.post("/crawl_sync", dependencies=[Depends(verify_token)])
-async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
-    task_id = await crawler_service.submit_task(request)
-    # Wait up to 60 seconds for task completion
-    for _ in range(60):
-        task_info = crawler_service.task_manager.get_task(task_id)
-        if not task_info:
-            raise HTTPException(status_code=404, detail="Task not found")
-        if task_info.status == TaskStatus.COMPLETED:
-            # Return same format as /task/{task_id} endpoint
-            if isinstance(task_info.result, list):
-                return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
-            return {"status": task_info.status, "result": task_info.result.dict()}
-        if task_info.status == TaskStatus.FAILED:
-            raise HTTPException(status_code=500, detail=task_info.error)
-        await asyncio.sleep(1)
-    # If we get here, task didn't complete within timeout
-    raise HTTPException(status_code=408, detail="Task timed out")
+# @app.post("/crawl_sync", dependencies=[Depends(verify_token)])
+# async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
+#     task_id = await crawler_service.submit_task(request)
+#     # Wait up to 60 seconds for task completion
+#     for _ in range(60):
+#         task_info = crawler_service.task_manager.get_task(task_id)
+#         if not task_info:
+#             raise HTTPException(status_code=404, detail="Task not found")
+#         if task_info.status == TaskStatus.COMPLETED:
+#             # Return same format as /task/{task_id} endpoint
+#             if isinstance(task_info.result, list):
+#                 return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
+#             return {"status": task_info.status, "result": task_info.result.dict()}
+#         if task_info.status == TaskStatus.FAILED:
+#             raise HTTPException(status_code=500, detail=task_info.error)
+#         await asyncio.sleep(1)
+#     # If we get here, task didn't complete within timeout
+#     raise HTTPException(status_code=408, detail="Task timed out")

-@app.post("/crawl_direct", dependencies=[Depends(verify_token)])
-async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
-    try:
-        crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
-        extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
-        try:
-            if isinstance(request.urls, list):
-                results = await crawler.arun_many(
-                    urls=[str(url) for url in request.urls],
-                    extraction_strategy=extraction_strategy,
-                    js_code=request.js_code,
-                    wait_for=request.wait_for,
-                    css_selector=request.css_selector,
-                    screenshot=request.screenshot,
-                    magic=request.magic,
-                    cache_mode=request.cache_mode,
-                    session_id=request.session_id,
-                    **request.extra,
-                )
-                return {"results": [result.dict() for result in results]}
-            else:
-                result = await crawler.arun(
-                    url=str(request.urls),
-                    extraction_strategy=extraction_strategy,
-                    js_code=request.js_code,
-                    wait_for=request.wait_for,
-                    css_selector=request.css_selector,
-                    screenshot=request.screenshot,
-                    magic=request.magic,
-                    cache_mode=request.cache_mode,
-                    session_id=request.session_id,
-                    **request.extra,
-                )
-                return {"result": result.dict()}
-        finally:
-            await crawler_service.crawler_pool.release(crawler)
-    except Exception as e:
-        logger.error(f"Error in direct crawl: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
+# @app.post("/crawl_direct", dependencies=[Depends(verify_token)])
+# async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
+#     try:
+#         crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
+#         extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
+#         try:
+#             if isinstance(request.urls, list):
+#                 results = await crawler.arun_many(
+#                     urls=[str(url) for url in request.urls],
+#                     extraction_strategy=extraction_strategy,
+#                     js_code=request.js_code,
+#                     wait_for=request.wait_for,
+#                     css_selector=request.css_selector,
+#                     screenshot=request.screenshot,
+#                     magic=request.magic,
+#                     cache_mode=request.cache_mode,
+#                     session_id=request.session_id,
+#                     **request.extra,
+#                 )
+#                 return {"results": [result.dict() for result in results]}
+#             else:
+#                 result = await crawler.arun(
+#                     url=str(request.urls),
+#                     extraction_strategy=extraction_strategy,
+#                     js_code=request.js_code,
+#                     wait_for=request.wait_for,
+#                     css_selector=request.css_selector,
+#                     screenshot=request.screenshot,
+#                     magic=request.magic,
+#                     cache_mode=request.cache_mode,
+#                     session_id=request.session_id,
+#                     **request.extra,
+#                 )
+#                 return {"result": result.dict()}
+#         finally:
+#             await crawler_service.crawler_pool.release(crawler)
+#     except Exception as e:
+#         logger.error(f"Error in direct crawl: {str(e)}")
+#         raise HTTPException(status_code=500, detail=str(e))

 @app.get("/health")
 async def health_check():
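
For reference, the endpoints commented out above implemented a submit-and-poll workflow. The sketch below shows roughly how a client would have driven it before this commit; the base URL, the bearer-token header expected by verify_token, and the serialized status strings ("completed", "failed") are assumptions, not confirmed by this diff.

import time
import requests  # third-party HTTP client, assumed available

BASE = "http://localhost:8000"                      # assumed service address
HEADERS = {"Authorization": "Bearer <api_token>"}   # assumed scheme for verify_token

# Submit a crawl task; POST /crawl returned {"task_id": "..."}.
resp = requests.post(f"{BASE}/crawl",
                     json={"urls": "https://example.com"},
                     headers=HEADERS)
resp.raise_for_status()
task_id = resp.json()["task_id"]

# Poll GET /task/{task_id} once per second, mirroring the 60-iteration
# loop that /crawl_sync ran server-side before timing out with HTTP 408.
for _ in range(60):
    info = requests.get(f"{BASE}/task/{task_id}", headers=HEADERS).json()
    if info["status"] in ("completed", "failed"):   # assumed TaskStatus values
        break
    time.sleep(1)

print(info)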
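
The one-shot /crawl_direct endpoint accepted the same CrawlRequest body but bypassed the task queue entirely. A hypothetical request, reusing BASE and HEADERS from the sketch above; the field names come straight from the handler in the diff, but the accepted values (e.g. for cache_mode) and defaults are assumptions.

payload = {
    "urls": ["https://example.com", "https://example.org"],  # a list routed to arun_many()
    "screenshot": False,
    "magic": True,
    "cache_mode": "bypass",                # assumed serialization of a CacheMode value
    "crawler_params": {"headless": True},  # forwarded to crawler_pool.acquire(**...)
    "extra": {},                           # passed through as **request.extra
}
resp = requests.post(f"{BASE}/crawl_direct", json=payload, headers=HEADERS)
print(resp.json()["results"])              # "results" (plural) because "urls" was a list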