import pytest import pytest_asyncio import httpx import json import asyncio import os from typing import List, Dict, Any, AsyncGenerator from dotenv import load_dotenv load_dotenv() # Optional: Import crawl4ai classes directly for reference/easier payload creation aid # You don't strictly NEED these imports for the tests to run against the server, # but they help in understanding the structure you are mimicking in JSON. from crawl4ai import ( BrowserConfig, CrawlerRunConfig, CacheMode, DefaultMarkdownGenerator, PruningContentFilter, BM25ContentFilter, BFSDeepCrawlStrategy, FilterChain, ContentTypeFilter, DomainFilter, CompositeScorer, KeywordRelevanceScorer, PathDepthScorer, JsonCssExtractionStrategy, LLMExtractionStrategy, LLMConfig ) # --- Test Configuration --- # BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Make base URL configurable # Use a known simple HTML page for basic tests SIMPLE_HTML_URL = "https://httpbin.org/html" # Use a site suitable for scraping tests SCRAPE_TARGET_URL = "http://books.toscrape.com/" # Use a site with internal links for deep crawl tests DEEP_CRAWL_URL = "https://python.org" # --- Pytest Fixtures --- # Use the built-in event_loop fixture from pytest_asyncio # The custom implementation was causing issues with closing the loop @pytest_asyncio.fixture(scope="function") # Changed to function scope to avoid event loop issues async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]: """Provides an async HTTP client""" client = httpx.AsyncClient(base_url=BASE_URL, timeout=120.0) yield client await client.aclose() # --- Helper Functions --- async def check_server_health(client: httpx.AsyncClient): """Check if the server is healthy before running tests.""" try: response = await client.get("/health") response.raise_for_status() print(f"\nServer healthy: {response.json()}") return True except (httpx.RequestError, httpx.HTTPStatusError) as e: pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False) async def assert_crawl_result_structure(result: Dict[str, Any]): """Asserts the basic structure of a single crawl result.""" assert isinstance(result, dict) assert "url" in result assert "success" in result assert "html" in result # Add more common checks if needed async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]: """Processes an NDJSON streaming response.""" results = [] completed = False async for line in response.aiter_lines(): if line: try: data = json.loads(line) if data.get("status") == "completed": completed = True break # Stop processing after completion marker else: results.append(data) except json.JSONDecodeError: pytest.fail(f"Failed to decode JSON line: {line}") assert completed, "Streaming response did not end with a completion marker." return results # --- Test Class --- @pytest.mark.asyncio class TestCrawlEndpoints: @pytest_asyncio.fixture(autouse=True) async def check_health_before_tests(self, async_client: httpx.AsyncClient): """Fixture to ensure server is healthy before each test in the class.""" await check_server_health(async_client) # 1. Simple Requests (Primitives) async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient): """Test /crawl with a single URL and simple config values.""" payload = { "urls": [SIMPLE_HTML_URL], "browser_config": { "type": "BrowserConfig", "params": { "headless": True, } }, "crawler_config": { "type": "CrawlerRunConfig", "params": { "stream": False, # Explicitly false for /crawl "screenshot": False, "cache_mode": CacheMode.BYPASS.value # Use enum value } } } try: response = await async_client.post("/crawl", json=payload) print(f"Response status: {response.status_code}") response.raise_for_status() data = response.json() except httpx.HTTPStatusError as e: print(f"Server error: {e}") print(f"Response content: {e.response.text}") raise assert data["success"] is True assert isinstance(data["results"], list) assert len(data["results"]) == 1 result = data["results"][0] await assert_crawl_result_structure(result) assert result["success"] is True assert result["url"] == SIMPLE_HTML_URL assert "

Herman Melville - Moby-Dick

" in result["html"] # We don't specify a markdown generator in this test, so don't make assumptions about markdown field # It might be null, missing, or populated depending on the server's default behavior async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient): """Test that /crawl endpoint handles stream=True directly without redirect.""" payload = { "urls": [SIMPLE_HTML_URL], "browser_config": { "type": "BrowserConfig", "params": { "headless": True, } }, "crawler_config": { "type": "CrawlerRunConfig", "params": { "stream": True, # Set stream to True for direct streaming "screenshot": False, "cache_mode": CacheMode.BYPASS.value } } } # Send a request to the /crawl endpoint - should handle streaming directly async with async_client.stream("POST", "/crawl", json=payload) as response: assert response.status_code == 200 assert response.headers["content-type"] == "application/x-ndjson" assert response.headers.get("x-stream-status") == "active" results = await process_streaming_response(response) assert len(results) == 1 result = results[0] await assert_crawl_result_structure(result) assert result["success"] is True assert result["url"] == SIMPLE_HTML_URL assert "

Herman Melville - Moby-Dick

" in result["html"] async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient): """Test /crawl/stream with a single URL and simple config values.""" payload = { "urls": [SIMPLE_HTML_URL], "browser_config": { "type": "BrowserConfig", "params": { "headless": True, } }, "crawler_config": { "type": "CrawlerRunConfig", "params": { "stream": True, # Must be true for /crawl/stream "screenshot": False, "cache_mode": CacheMode.BYPASS.value } } } async with async_client.stream("POST", "/crawl/stream", json=payload) as response: response.raise_for_status() results = await process_streaming_response(response) assert len(results) == 1 result = results[0] await assert_crawl_result_structure(result) assert result["success"] is True assert result["url"] == SIMPLE_HTML_URL assert "

Herman Melville - Moby-Dick

" in result["html"] # 2. Multi-URL and Dispatcher async def test_multi_url_crawl(self, async_client: httpx.AsyncClient): """Test /crawl with multiple URLs, implicitly testing dispatcher.""" urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"] payload = { "urls": urls, "browser_config": { "type": "BrowserConfig", "params": {"headless": True} }, "crawler_config": { "type": "CrawlerRunConfig", "params": {"stream": False, "cache_mode": CacheMode.BYPASS.value} } } try: print(f"Sending deep crawl request to server...") response = await async_client.post("/crawl", json=payload) print(f"Response status: {response.status_code}") if response.status_code >= 400: error_detail = response.json().get('detail', 'No detail provided') print(f"Error detail: {error_detail}") print(f"Full response: {response.text}") response.raise_for_status() data = response.json() except httpx.HTTPStatusError as e: print(f"Server error status: {e.response.status_code}") print(f"Server error response: {e.response.text}") try: error_json = e.response.json() print(f"Parsed error: {error_json}") except: print("Could not parse error response as JSON") raise assert data["success"] is True assert isinstance(data["results"], list) assert len(data["results"]) == len(urls) for result in data["results"]: await assert_crawl_result_structure(result) assert result["success"] is True assert result["url"] in urls async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient): """Test /crawl/stream with multiple URLs.""" urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"] payload = { "urls": urls, "browser_config": { "type": "BrowserConfig", "params": {"headless": True} }, "crawler_config": { "type": "CrawlerRunConfig", "params": {"stream": True, "cache_mode": CacheMode.BYPASS.value} } } async with async_client.stream("POST", "/crawl/stream", json=payload) as response: response.raise_for_status() results = await process_streaming_response(response) assert len(results) == len(urls) processed_urls = set() for result in results: await assert_crawl_result_structure(result) assert result["success"] is True assert result["url"] in urls processed_urls.add(result["url"]) assert processed_urls == set(urls) # Ensure all URLs were processed # 3. Class Values and Nested Classes (Markdown Generator) async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient): """Test /crawl with MarkdownGenerator using PruningContentFilter.""" payload = { "urls": [SIMPLE_HTML_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "cache_mode": CacheMode.ENABLED.value, # Test different cache mode "markdown_generator": { "type": "DefaultMarkdownGenerator", "params": { "content_filter": { "type": "PruningContentFilter", "params": { "threshold": 0.5, # Example param "threshold_type": "relative" } } } } } } } try: print(f"Sending deep crawl request to server...") response = await async_client.post("/crawl", json=payload) print(f"Response status: {response.status_code}") if response.status_code >= 400: error_detail = response.json().get('detail', 'No detail provided') print(f"Error detail: {error_detail}") print(f"Full response: {response.text}") response.raise_for_status() data = response.json() except httpx.HTTPStatusError as e: print(f"Server error status: {e.response.status_code}") print(f"Server error response: {e.response.text}") try: error_json = e.response.json() print(f"Parsed error: {error_json}") except: print("Could not parse error response as JSON") raise assert data["success"] is True assert len(data["results"]) == 1 result = data["results"][0] await assert_crawl_result_structure(result) assert result["success"] is True assert "markdown" in result assert isinstance(result["markdown"], dict) assert "raw_markdown" in result["markdown"] assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown assert "Moby-Dick" in result["markdown"]["raw_markdown"] # Fit markdown content might be different/shorter due to pruning assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"]) async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient): """Test /crawl with MarkdownGenerator using BM25ContentFilter.""" payload = { "urls": [SIMPLE_HTML_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "markdown_generator": { "type": "DefaultMarkdownGenerator", "params": { "content_filter": { "type": "BM25ContentFilter", "params": { "user_query": "Herman Melville", # Query for BM25 "bm25_threshold": 0.1, # Lower threshold to increase matches "language": "english" # Valid parameters } } } } } } } try: print(f"Payload for BM25 test: {json.dumps(payload)}") response = await async_client.post("/crawl", json=payload) print(f"Response status: {response.status_code}") if response.status_code >= 400: error_detail = response.json().get('detail', 'No detail provided') print(f"Error detail: {error_detail}") print(f"Full response: {response.text}") response.raise_for_status() data = response.json() except httpx.HTTPStatusError as e: print(f"Server error status: {e.response.status_code}") print(f"Server error response: {e.response.text}") try: error_json = e.response.json() print(f"Parsed error: {error_json}") except: print("Could not parse error response as JSON") raise assert data["success"] is True assert len(data["results"]) == 1 result = data["results"][0] await assert_crawl_result_structure(result) assert result["success"] is True assert "markdown" in result assert isinstance(result["markdown"], dict) assert "raw_markdown" in result["markdown"] assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown # Print values for debug print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}") print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}") # Either fit_markdown has content (possibly including our query terms) # or it might be empty if no good BM25 matches were found # Don't assert specific content since it can be environment-dependent # 4. Deep Crawling async def test_deep_crawl(self, async_client: httpx.AsyncClient): """Test /crawl with a deep crawl strategy.""" payload = { "urls": [DEEP_CRAWL_URL], # Start URL "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "stream": False, "cache_mode": CacheMode.BYPASS.value, "deep_crawl_strategy": { "type": "BFSDeepCrawlStrategy", "params": { "max_depth": 1, # Limit depth for testing speed "max_pages": 5, # Limit pages to crawl "filter_chain": { "type": "FilterChain", "params": { "filters": [ { "type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]} }, { "type": "DomainFilter", "params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains } ] } }, "url_scorer": { "type": "CompositeScorer", "params": { "scorers": [ { "type": "KeywordRelevanceScorer", "params": {"keywords": ["documentation", "tutorial"]} }, { "type": "PathDepthScorer", "params": {"weight": 0.5, "optimal_depth": 2} } ] } } } } } } } try: print(f"Sending deep crawl request to server...") response = await async_client.post("/crawl", json=payload) print(f"Response status: {response.status_code}") if response.status_code >= 400: error_detail = response.json().get('detail', 'No detail provided') print(f"Error detail: {error_detail}") print(f"Full response: {response.text}") response.raise_for_status() data = response.json() except httpx.HTTPStatusError as e: print(f"Server error status: {e.response.status_code}") print(f"Server error response: {e.response.text}") try: error_json = e.response.json() print(f"Parsed error: {error_json}") except: print("Could not parse error response as JSON") raise assert data["success"] is True assert isinstance(data["results"], list) # Expect more than 1 result due to deep crawl (start URL + crawled links) assert len(data["results"]) > 1 assert len(data["results"]) <= 6 # Start URL + max_links=5 start_url_found = False crawled_urls_found = False for result in data["results"]: await assert_crawl_result_structure(result) assert result["success"] is True # Print URL for debugging print(f"Crawled URL: {result['url']}") # Allow URLs that contain python.org (including subdomains like docs.python.org) assert "python.org" in result["url"] if result["url"] == DEEP_CRAWL_URL: start_url_found = True else: crawled_urls_found = True assert start_url_found assert crawled_urls_found # 5. Extraction without LLM (JSON/CSS) async def test_json_css_extraction(self, async_client: httpx.AsyncClient): """Test /crawl with JsonCssExtractionStrategy.""" payload = { "urls": [SCRAPE_TARGET_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "cache_mode": CacheMode.BYPASS.value, "extraction_strategy": { "type": "JsonCssExtractionStrategy", "params": { "schema": { "type": "dict", # IMPORTANT: Wrap schema dict with type/value structure "value": { "name": "BookList", "baseSelector": "ol.row li.col-xs-6", # Select each book item "fields": [ {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"}, {"name": "price", "selector": "article.product_pod .price_color", "type": "text"}, {"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"} ] } } } } } } } try: print(f"Sending deep crawl request to server...") response = await async_client.post("/crawl", json=payload) print(f"Response status: {response.status_code}") if response.status_code >= 400: error_detail = response.json().get('detail', 'No detail provided') print(f"Error detail: {error_detail}") print(f"Full response: {response.text}") response.raise_for_status() data = response.json() except httpx.HTTPStatusError as e: print(f"Server error status: {e.response.status_code}") print(f"Server error response: {e.response.text}") try: error_json = e.response.json() print(f"Parsed error: {error_json}") except: print("Could not parse error response as JSON") raise assert data["success"] is True assert len(data["results"]) == 1 result = data["results"][0] await assert_crawl_result_structure(result) assert result["success"] is True assert "extracted_content" in result assert result["extracted_content"] is not None # Extracted content should be a JSON string representing a list of dicts try: extracted_data = json.loads(result["extracted_content"]) assert isinstance(extracted_data, list) assert len(extracted_data) > 0 # Should find some books # Check structure of the first extracted item first_item = extracted_data[0] assert "title" in first_item assert "price" in first_item assert "rating" in first_item assert "star-rating" in first_item["rating"] # e.g., "star-rating Three" except (json.JSONDecodeError, AssertionError) as e: pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}") # 6. Extraction with LLM async def test_llm_extraction(self, async_client: httpx.AsyncClient): """ Test /crawl with LLMExtractionStrategy. NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY) configured via .llm.env or environment variables. This test uses the default provider configured in the server's config.yml. """ payload = { "urls": [SIMPLE_HTML_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "cache_mode": CacheMode.BYPASS.value, "extraction_strategy": { "type": "LLMExtractionStrategy", "params": { "instruction": "Extract the main title and the author mentioned in the text into JSON.", # LLMConfig is implicitly defined by server's config.yml and .llm.env # If you needed to override provider/token PER REQUEST: "llm_config": { "type": "LLMConfig", "params": { "provider": "openai/gpt-4o", # Example override "api_token": os.getenv("OPENAI_API_KEY") # Example override } }, "schema": { # Optional: Provide a schema for structured output "type": "dict", # IMPORTANT: Wrap schema dict "value": { "title": "Book Info", "type": "object", "properties": { "title": {"type": "string", "description": "The main title of the work"}, "author": {"type": "string", "description": "The author of the work"} }, "required": ["title", "author"] } } } } } } } try: response = await async_client.post("/crawl", json=payload) response.raise_for_status() # Will raise if server returns 500 (e.g., bad API key) data = response.json() except httpx.HTTPStatusError as e: # Catch potential server errors (like 500 due to missing/invalid API keys) pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and ensure API keys are correctly configured for the server.") except httpx.RequestError as e: pytest.fail(f"LLM extraction request failed: {e}.") assert data["success"] is True assert len(data["results"]) == 1 result = data["results"][0] await assert_crawl_result_structure(result) assert result["success"] is True assert "extracted_content" in result assert result["extracted_content"] is not None # Extracted content should be JSON (because we provided a schema) try: extracted_data = json.loads(result["extracted_content"]) print(f"\nLLM Extracted Data: {extracted_data}") # Print for verification # Handle both dict and list formats (server returns a list) if isinstance(extracted_data, list): assert len(extracted_data) > 0 extracted_item = extracted_data[0] # Take first item assert isinstance(extracted_item, dict) assert "title" in extracted_item assert "author" in extracted_item assert "Moby-Dick" in extracted_item.get("title", "") assert "Herman Melville" in extracted_item.get("author", "") else: assert isinstance(extracted_data, dict) assert "title" in extracted_data assert "author" in extracted_data assert "Moby-Dick" in extracted_data.get("title", "") assert "Herman Melville" in extracted_data.get("author", "") except (json.JSONDecodeError, AssertionError) as e: pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}") except Exception as e: # Catch any other unexpected error pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}") # 7. Error Handling Tests async def test_invalid_url_handling(self, async_client: httpx.AsyncClient): """Test error handling for invalid URLs.""" payload = { "urls": ["invalid-url", "https://nonexistent-domain-12345.com"], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": CacheMode.BYPASS.value}} } response = await async_client.post("/crawl", json=payload) # Should return 200 with failed results, not 500 print(f"Status code: {response.status_code}") print(f"Response: {response.text}") assert response.status_code == 500 data = response.json() assert data["detail"].startswith("Crawl request failed:") async def test_mixed_success_failure_urls(self, async_client: httpx.AsyncClient): """Test handling of mixed success/failure URLs.""" payload = { "urls": [ SIMPLE_HTML_URL, # Should succeed "https://nonexistent-domain-12345.com", # Should fail "https://invalid-url-with-special-chars-!@#$%^&*()", # Should fail ], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "cache_mode": CacheMode.BYPASS.value, "markdown_generator": { "type": "DefaultMarkdownGenerator", "params": { "content_filter": { "type": "PruningContentFilter", "params": {"threshold": 0.5} } } } } } } response = await async_client.post("/crawl", json=payload) assert response.status_code == 200 data = response.json() assert data["success"] is True assert len(data["results"]) == 3 success_count = 0 failure_count = 0 for result in data["results"]: if result["success"]: success_count += 1 else: failure_count += 1 assert "error_message" in result assert len(result["error_message"]) > 0 assert success_count >= 1 # At least one should succeed assert failure_count >= 1 # At least one should fail async def test_streaming_mixed_urls(self, async_client: httpx.AsyncClient): """Test streaming with mixed success/failure URLs.""" payload = { "urls": [ SIMPLE_HTML_URL, # Should succeed "https://nonexistent-domain-12345.com", # Should fail ], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "stream": True, "cache_mode": CacheMode.BYPASS.value } } } async with async_client.stream("POST", "/crawl/stream", json=payload) as response: response.raise_for_status() results = await process_streaming_response(response) assert len(results) == 2 success_count = 0 failure_count = 0 for result in results: if result["success"]: success_count += 1 assert result["url"] == SIMPLE_HTML_URL else: failure_count += 1 assert "error_message" in result assert result["error_message"] is not None assert success_count == 1 assert failure_count == 1 async def test_markdown_endpoint_error_handling(self, async_client: httpx.AsyncClient): """Test error handling for markdown endpoint.""" # Test invalid URL invalid_payload = {"url": "invalid-url", "f": "fit"} response = await async_client.post("/md", json=invalid_payload) # Should return 400 for invalid URL format assert response.status_code == 400 # Test non-existent URL nonexistent_payload = {"url": "https://nonexistent-domain-12345.com", "f": "fit"} response = await async_client.post("/md", json=nonexistent_payload) # Should return 500 for crawl failure assert response.status_code == 500 async def test_html_endpoint_error_handling(self, async_client: httpx.AsyncClient): """Test error handling for HTML endpoint.""" # Test invalid URL invalid_payload = {"url": "invalid-url"} response = await async_client.post("/html", json=invalid_payload) # Should return 500 for crawl failure assert response.status_code == 500 async def test_screenshot_endpoint_error_handling(self, async_client: httpx.AsyncClient): """Test error handling for screenshot endpoint.""" # Test invalid URL invalid_payload = {"url": "invalid-url"} response = await async_client.post("/screenshot", json=invalid_payload) # Should return 500 for crawl failure assert response.status_code == 500 async def test_pdf_endpoint_error_handling(self, async_client: httpx.AsyncClient): """Test error handling for PDF endpoint.""" # Test invalid URL invalid_payload = {"url": "invalid-url"} response = await async_client.post("/pdf", json=invalid_payload) # Should return 500 for crawl failure assert response.status_code == 500 async def test_execute_js_endpoint_error_handling(self, async_client: httpx.AsyncClient): """Test error handling for execute_js endpoint.""" # Test invalid URL invalid_payload = {"url": "invalid-url", "scripts": ["return document.title;"]} response = await async_client.post("/execute_js", json=invalid_payload) # Should return 500 for crawl failure assert response.status_code == 500 async def test_llm_endpoint_error_handling(self, async_client: httpx.AsyncClient): """Test error handling for LLM endpoint.""" # Test missing query parameter response = await async_client.get("/llm/https://example.com") assert response.status_code == 422 # FastAPI validation error, not 400 # Test invalid URL response = await async_client.get("/llm/invalid-url?q=test") # Should return 500 for crawl failure assert response.status_code == 500 async def test_ask_endpoint_error_handling(self, async_client: httpx.AsyncClient): """Test error handling for ask endpoint.""" # Test invalid context_type response = await async_client.get("/ask?context_type=invalid") assert response.status_code == 422 # Validation error # Test invalid score_ratio response = await async_client.get("/ask?score_ratio=2.0") # > 1.0 assert response.status_code == 422 # Validation error # Test invalid max_results response = await async_client.get("/ask?max_results=0") # < 1 assert response.status_code == 422 # Validation error async def test_config_dump_error_handling(self, async_client: httpx.AsyncClient): """Test error handling for config dump endpoint.""" # Test invalid code invalid_payload = {"code": "invalid_code"} response = await async_client.post("/config/dump", json=invalid_payload) assert response.status_code == 400 # Test nested function calls (not allowed) nested_payload = {"code": "CrawlerRunConfig(BrowserConfig())"} response = await async_client.post("/config/dump", json=nested_payload) assert response.status_code == 400 async def test_malformed_request_handling(self, async_client: httpx.AsyncClient): """Test handling of malformed requests.""" # Test missing required fields malformed_payload = {"urls": []} # Missing browser_config and crawler_config response = await async_client.post("/crawl", json=malformed_payload) print(f"Response: {response.text}") assert response.status_code == 422 # Validation error # Test empty URLs list empty_urls_payload = { "urls": [], "browser_config": {"type": "BrowserConfig", "params": {}}, "crawler_config": {"type": "CrawlerRunConfig", "params": {}} } response = await async_client.post("/crawl", json=empty_urls_payload) assert response.status_code == 422 # "At least one URL required" if __name__ == "__main__": # Define arguments for pytest programmatically # -v: verbose output # -s: show print statements immediately (useful for debugging) # __file__: tells pytest to run tests in the current file pytest_args = ["-v", "-s", __file__] # You can add more pytest arguments here if needed, for example: # '-k test_llm_extraction': Run only the LLM test function # pytest_args.append("-k test_llm_extraction") print(f"Running pytest with args: {pytest_args}") # Execute pytest exit_code = pytest.main(pytest_args) print(f"Pytest finished with exit code: {exit_code}")