From 04bc643cec3a47bff4305642494c6c115c06833d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 2 Feb 2025 20:53:31 +0800 Subject: [PATCH] feat(api): improve cache handling and add API tests Changes cache mode from BYPASS to WRITE_ONLY when cache is disabled to ensure results are still cached for future use. Also adds error handling for non-JSON LLM responses and comprehensive API test suite. - Changes default cache fallback from BYPASS to WRITE_ONLY - Adds error handling for LLM JSON parsing - Introduces new test suite for API endpoints --- deploy/docker/api.py | 9 ++- tests/docker/test_server.py | 146 ++++++++++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+), 3 deletions(-) create mode 100644 tests/docker/test_server.py diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 8838f19f..1f3e4070 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -51,7 +51,7 @@ async def process_llm_extraction( schema=json.loads(schema) if schema else None, ) - cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.BYPASS + cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY async with AsyncWebCrawler() as crawler: result = await crawler.arun( @@ -70,7 +70,10 @@ async def process_llm_extraction( }) return - content = json.loads(result.extracted_content) + try: + content = json.loads(result.extracted_content) + except json.JSONDecodeError: + content = result.extracted_content await redis.hset(f"task:{task_id}", mapping={ "status": TaskStatus.COMPLETED, "result": json.dumps(content) @@ -110,7 +113,7 @@ async def handle_markdown_request( }[filter_type] md_generator = DefaultMarkdownGenerator(content_filter=content_filter) - cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.BYPASS + cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY async with AsyncWebCrawler() as crawler: result = await crawler.arun( diff --git a/tests/docker/test_server.py b/tests/docker/test_server.py new file mode 100644 index 00000000..7bb0195b --- /dev/null +++ b/tests/docker/test_server.py @@ -0,0 +1,146 @@ +import asyncio +import json +from typing import Optional +from urllib.parse import quote + +async def test_endpoint( + endpoint: str, + url: str, + params: Optional[dict] = None, + expected_status: int = 200 +) -> None: + """Test an endpoint and print results""" + import aiohttp + + params = params or {} + param_str = "&".join(f"{k}={v}" for k, v in params.items()) + full_url = f"http://localhost:8000/{endpoint}/{quote(url)}" + if param_str: + full_url += f"?{param_str}" + + print(f"\nTesting: {full_url}") + + try: + async with aiohttp.ClientSession() as session: + async with session.get(full_url) as response: + status = response.status + try: + data = await response.json() + except: + data = await response.text() + + print(f"Status: {status} (Expected: {expected_status})") + if isinstance(data, dict): + print(f"Response: {json.dumps(data, indent=2)}") + else: + print(f"Response: {data[:500]}...") # First 500 chars + assert status == expected_status + return data + except Exception as e: + print(f"Error: {str(e)}") + return None + +async def test_llm_task_completion(task_id: str) -> None: + """Poll task until completion""" + for _ in range(10): # Try 10 times + result = await test_endpoint("llm", task_id) + if result and result.get("status") in ["completed", "failed"]: + return result + print("Task still processing, waiting 5 seconds...") + await asyncio.sleep(5) + print("Task timed out") + +async def run_tests(): + print("Starting API Tests...") + + # Test URLs + urls = [ + "example.com", + "https://www.python.org", + "https://news.ycombinator.com/news", + "https://github.com/trending" + ] + + print("\n=== Testing Markdown Endpoint ===") + for url in[] : #urls: + # Test different filter types + for filter_type in ["raw", "fit", "bm25", "llm"]: + params = {"f": filter_type} + if filter_type in ["bm25", "llm"]: + params["q"] = "extract main content" + + # Test with and without cache + for cache in ["0", "1"]: + params["c"] = cache + await test_endpoint("md", url, params) + await asyncio.sleep(1) # Be nice to the server + + print("\n=== Testing LLM Endpoint ===") + for url in []: # urls: + # Test basic extraction + result = await test_endpoint( + "llm", + url, + {"q": "Extract title and main content"} + ) + if result and "task_id" in result: + print("\nChecking task completion...") + await test_llm_task_completion(result["task_id"]) + + # Test with schema + schema = { + "type": "object", + "properties": { + "title": {"type": "string"}, + "content": {"type": "string"}, + "links": {"type": "array", "items": {"type": "string"}} + } + } + result = await test_endpoint( + "llm", + url, + { + "q": "Extract content with links", + "s": json.dumps(schema), + "c": "1" # Test with cache + } + ) + if result and "task_id" in result: + print("\nChecking schema task completion...") + await test_llm_task_completion(result["task_id"]) + + await asyncio.sleep(2) # Be nice to the server + + print("\n=== Testing Error Cases ===") + # Test invalid URL + await test_endpoint( + "md", + "not_a_real_url", + expected_status=500 + ) + + # Test invalid filter type + await test_endpoint( + "md", + "example.com", + {"f": "invalid"}, + expected_status=422 + ) + + # Test LLM without query + await test_endpoint( + "llm", + "example.com" + ) + + # Test invalid task ID + await test_endpoint( + "llm", + "llm_invalid_task", + expected_status=404 + ) + + print("\nAll tests completed!") + +if __name__ == "__main__": + asyncio.run(run_tests()) \ No newline at end of file