feat(crawl_sync): add synchronous crawl endpoint and corresponding test
This commit is contained in:
@@ -33,6 +33,13 @@ class Crawl4AiTester:
|
|||||||
return status
|
return status
|
||||||
|
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
|
|
||||||
|
def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
    """Submit a crawl request to the blocking /crawl_sync endpoint.

    Posts *request_data* as JSON and waits (client-side timeout of 60s)
    for the server to finish the crawl before returning.

    Raises:
        TimeoutError: when the server answers 408, i.e. the task did not
            finish within the server's own timeout window.
        requests.HTTPError: for any other non-2xx response.
    """
    endpoint = f"{self.base_url}/crawl_sync"
    response = requests.post(endpoint, json=request_data, timeout=60)

    # The server signals its own internal timeout with 408; surface that
    # as a distinct exception type rather than a generic HTTP error.
    if response.status_code == 408:
        raise TimeoutError("Task did not complete within server timeout")

    response.raise_for_status()
    return response.json()
def test_docker_deployment(version="basic"):
|
def test_docker_deployment(version="basic"):
|
||||||
tester = Crawl4AiTester()
|
tester = Crawl4AiTester()
|
||||||
@@ -54,6 +61,7 @@ def test_docker_deployment(version="basic"):
|
|||||||
|
|
||||||
# Test cases based on version
|
# Test cases based on version
|
||||||
test_basic_crawl(tester)
|
test_basic_crawl(tester)
|
||||||
|
test_basic_crawl_sync(tester)
|
||||||
|
|
||||||
# if version in ["full", "transformer"]:
|
# if version in ["full", "transformer"]:
|
||||||
# test_cosine_extraction(tester)
|
# test_cosine_extraction(tester)
|
||||||
@@ -78,6 +86,19 @@ def test_basic_crawl(tester: Crawl4AiTester):
|
|||||||
assert result["result"]["success"]
|
assert result["result"]["success"]
|
||||||
assert len(result["result"]["markdown"]) > 0
|
assert len(result["result"]["markdown"]) > 0
|
||||||
|
|
||||||
|
def test_basic_crawl_sync(tester: Crawl4AiTester):
    """Exercise the synchronous /crawl_sync path end to end.

    Submits a single-URL crawl through the blocking endpoint and checks
    that the task reports completion, success, and non-empty markdown.
    """
    print("\n=== Testing Basic Crawl (Sync) ===")

    payload = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10,
    }
    result = tester.submit_sync(payload)

    markdown = result['result']['markdown']
    print(f"Basic crawl result length: {len(markdown)}")

    # The sync endpoint only returns once the task finished, so the
    # status must already be terminal here.
    assert result['status'] == 'completed'
    assert result['result']['success']
    assert len(markdown) > 0
def test_js_execution(tester: Crawl4AiTester):
|
def test_js_execution(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing JS Execution ===")
|
print("\n=== Testing JS Execution ===")
|
||||||
request = {
|
request = {
|
||||||
|
|||||||
24
main.py
24
main.py
@@ -375,6 +375,30 @@ async def get_task_status(task_id: str):
|
|||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
@app.post("/crawl_sync")
async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
    """Blocking variant of the crawl API.

    Submits the crawl task, then polls its status once per second for up
    to 60 seconds. Returns the finished result in the same shape as the
    /task/{task_id} endpoint, or raises:
        404 - the submitted task vanished from the task manager,
        500 - the task failed (detail carries the task's error),
        408 - the task did not finish within the polling window.
    """
    task_id = await crawler_service.submit_task(request)

    # Poll once per second; give the task at most 60 polls to finish.
    for _ in range(60):
        task_info = crawler_service.task_manager.get_task(task_id)
        if not task_info:
            raise HTTPException(status_code=404, detail="Task not found")

        if task_info.status == TaskStatus.FAILED:
            raise HTTPException(status_code=500, detail=task_info.error)

        if task_info.status == TaskStatus.COMPLETED:
            # Mirror the response shape of the /task/{task_id} endpoint:
            # multi-URL crawls produce a list, single crawls one object.
            outcome = task_info.result
            if isinstance(outcome, list):
                return {"status": task_info.status, "results": [result.dict() for result in outcome]}
            return {"status": task_info.status, "result": outcome.dict()}

        await asyncio.sleep(1)

    # Still pending after the polling window: report a timeout.
    raise HTTPException(status_code=408, detail="Task timed out")
@app.get("/health")
|
@app.get("/health")
|
||||||
async def health_check():
|
async def health_check():
|
||||||
available_slots = await crawler_service.resource_monitor.get_available_slots()
|
available_slots = await crawler_service.resource_monitor.get_available_slots()
|
||||||
|
|||||||
Reference in New Issue
Block a user