# ==== File: test_rest_api_deep_crawl.py ==== import pytest import pytest_asyncio import httpx import json import asyncio import os from typing import List, Dict, Any, AsyncGenerator from dotenv import load_dotenv load_dotenv() # Load environment variables from .env file if present # --- Test Configuration --- BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Ensure this points to your running server BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Ensure this points to your running server DEEP_CRAWL_BASE_URL = "https://docs.crawl4ai.com/samples/deepcrawl/" DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com" # Used for domain filter # --- Helper Functions --- def load_proxies_from_env() -> List[Dict]: """Load proxies from PROXIES environment variable""" proxies = [] proxies_str = os.getenv("PROXIES", "") if not proxies_str: print("PROXIES environment variable not set or empty.") return proxies try: proxy_list = proxies_str.split(",") for proxy in proxy_list: proxy = proxy.strip() if not proxy: continue parts = proxy.split(":") if len(parts) == 4: ip, port, username, password = parts proxies.append({ "server": f"http://{ip}:{port}", # Assuming http, adjust if needed "username": username, "password": password, "ip": ip # Store original IP if available }) elif len(parts) == 2: # ip:port only ip, port = parts proxies.append({ "server": f"http://{ip}:{port}", "ip": ip }) else: print(f"Skipping invalid proxy string format: {proxy}") except Exception as e: print(f"Error loading proxies from environment: {e}") return proxies async def check_server_health(client: httpx.AsyncClient): """Check if the server is healthy before running tests.""" try: response = await client.get("/health") response.raise_for_status() print(f"\nServer healthy: {response.json()}") return True except (httpx.RequestError, httpx.HTTPStatusError) as e: pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False) async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False): """Asserts the basic structure of a single crawl result.""" assert isinstance(result, dict) assert "url" in result assert "success" in result assert "html" in result # Basic crawls should return HTML assert "metadata" in result assert isinstance(result["metadata"], dict) assert "depth" in result["metadata"] # Deep crawls add depth if check_ssl: assert "ssl_certificate" in result # Check if SSL info is present assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]: """Processes an NDJSON streaming response.""" results = [] completed = False async for line in response.aiter_lines(): if line: try: data = json.loads(line) if data.get("status") == "completed": completed = True break # Stop processing after completion marker elif data.get("url"): # Ensure it looks like a result object results.append(data) else: print(f"Received non-result JSON line: {data}") # Log other status messages if needed except json.JSONDecodeError: pytest.fail(f"Failed to decode JSON line: {line}") assert completed, "Streaming response did not end with a completion marker." return results # --- Pytest Fixtures --- @pytest_asyncio.fixture(scope="function") async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]: """Provides an async HTTP client""" # Increased timeout for potentially longer deep crawls async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client: yield client # No explicit close needed with 'async with' # --- Test Class --- @pytest.mark.asyncio class TestDeepCrawlEndpoints: @pytest_asyncio.fixture(autouse=True) async def check_health_before_tests(self, async_client: httpx.AsyncClient): """Fixture to ensure server is healthy before each test in the class.""" await check_server_health(async_client) # 1. Basic Deep Crawl # async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient): # """Test BFS deep crawl with limited depth and pages.""" # max_depth = 1 # max_pages = 3 # start_url + 2 more # payload = { # "urls": [DEEP_CRAWL_BASE_URL], # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, # "crawler_config": { # "type": "CrawlerRunConfig", # "params": { # "stream": False, # "cache_mode": "BYPASS", # Use string value for CacheMode # "deep_crawl_strategy": { # "type": "BFSDeepCrawlStrategy", # "params": { # "max_depth": max_depth, # "max_pages": max_pages, # # Minimal filters for basic test # "filter_chain": { # "type": "FilterChain", # "params": { # "filters": [ # { # "type": "DomainFilter", # "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]} # } # ] # } # } # } # } # } # } # } # response = await async_client.post("/crawl", json=payload) # response.raise_for_status() # data = response.json() # assert data["success"] is True # assert isinstance(data["results"], list) # assert len(data["results"]) > 1 # Should be more than just the start URL # assert len(data["results"]) <= max_pages # Respect max_pages # found_depth_0 = False # found_depth_1 = False # for result in data["results"]: # await assert_crawl_result_structure(result) # assert result["success"] is True # assert DEEP_CRAWL_DOMAIN in result["url"] # depth = result["metadata"]["depth"] # assert depth <= max_depth # if depth == 0: found_depth_0 = True # if depth == 1: found_depth_1 = True # assert found_depth_0 # assert found_depth_1 # # 2. Deep Crawl with Filtering # async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient): # """Test BFS deep crawl with content type and domain filters.""" # max_depth = 1 # max_pages = 5 # payload = { # "urls": [DEEP_CRAWL_BASE_URL], # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, # "crawler_config": { # "type": "CrawlerRunConfig", # "params": { # "stream": False, # "cache_mode": "BYPASS", # "deep_crawl_strategy": { # "type": "BFSDeepCrawlStrategy", # "params": { # "max_depth": max_depth, # "max_pages": max_pages, # "filter_chain": { # "type": "FilterChain", # "params": { # "filters": [ # { # "type": "DomainFilter", # "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]} # }, # { # "type": "ContentTypeFilter", # "params": {"allowed_types": ["text/html"]} # }, # # Example: Exclude specific paths using regex # { # "type": "URLPatternFilter", # "params": { # "patterns": ["*/category-3/*"], # Block category 3 # "reverse": True # Block if match # } # } # ] # } # } # } # } # } # } # } # response = await async_client.post("/crawl", json=payload) # response.raise_for_status() # data = response.json() # assert data["success"] is True # assert len(data["results"]) > 0 # assert len(data["results"]) <= max_pages # for result in data["results"]: # await assert_crawl_result_structure(result) # assert result["success"] is True # assert DEEP_CRAWL_DOMAIN in result["url"] # assert "category-3" not in result["url"] # Check if filter worked # assert result["metadata"]["depth"] <= max_depth # # 3. Deep Crawl with Scoring # async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient): # """Test BFS deep crawl with URL scoring.""" # max_depth = 1 # max_pages = 4 # payload = { # "urls": [DEEP_CRAWL_BASE_URL], # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, # "crawler_config": { # "type": "CrawlerRunConfig", # "params": { # "stream": False, # "cache_mode": "BYPASS", # "deep_crawl_strategy": { # "type": "BFSDeepCrawlStrategy", # "params": { # "max_depth": max_depth, # "max_pages": max_pages, # "filter_chain": { # Keep basic domain filter # "type": "FilterChain", # "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]} # }, # "url_scorer": { # Add scorer # "type": "CompositeScorer", # "params": { # "scorers": [ # { # Favor pages with 'product' in the URL # "type": "KeywordRelevanceScorer", # "params": {"keywords": ["product"], "weight": 1.0} # }, # { # Penalize deep paths slightly # "type": "PathDepthScorer", # "params": {"optimal_depth": 2, "weight": -0.2} # } # ] # } # }, # # Set a threshold if needed: "score_threshold": 0.1 # } # } # } # } # } # response = await async_client.post("/crawl", json=payload) # response.raise_for_status() # data = response.json() # assert data["success"] is True # assert len(data["results"]) > 0 # assert len(data["results"]) <= max_pages # # Check if results seem biased towards products (harder to assert strictly without knowing exact scores) # product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0) # print(f"Product URLs found among depth > 0 results: {product_urls_found}") # # We expect scoring to prioritize product pages if available within limits # # assert product_urls_found # This might be too strict depending on site structure and limits # for result in data["results"]: # await assert_crawl_result_structure(result) # assert result["success"] is True # assert result["metadata"]["depth"] <= max_depth # # 4. Deep Crawl with CSS Extraction # async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient): # """Test BFS deep crawl combined with JsonCssExtractionStrategy.""" # max_depth = 6 # Go deep enough to reach product pages # max_pages = 20 # # Schema to extract product details # product_schema = { # "name": "ProductDetails", # "baseSelector": "div.container", # Base for product page # "fields": [ # {"name": "product_title", "selector": "h1", "type": "text"}, # {"name": "price", "selector": ".product-price", "type": "text"}, # {"name": "description", "selector": ".product-description p", "type": "text"}, # {"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[ # {"name": "spec_name", "selector": ".spec-name", "type": "text"}, # {"name": "spec_value", "selector": ".spec-value", "type": "text"} # ]} # ] # } # payload = { # "urls": [DEEP_CRAWL_BASE_URL], # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, # "crawler_config": { # "type": "CrawlerRunConfig", # "params": { # "stream": False, # "cache_mode": "BYPASS", # "extraction_strategy": { # Apply extraction to ALL crawled pages # "type": "JsonCssExtractionStrategy", # "params": {"schema": {"type": "dict", "value": product_schema}} # }, # "deep_crawl_strategy": { # "type": "BFSDeepCrawlStrategy", # "params": { # "max_depth": max_depth, # "max_pages": max_pages, # "filter_chain": { # Only crawl HTML on our domain # "type": "FilterChain", # "params": { # "filters": [ # {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}, # {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}} # ] # } # } # # Optional: Add scoring to prioritize product pages for extraction # } # } # } # } # } # response = await async_client.post("/crawl", json=payload) # response.raise_for_status() # data = response.json() # assert data["success"] is True # assert len(data["results"]) > 0 # # assert len(data["results"]) <= max_pages # found_extracted_product = False # for result in data["results"]: # await assert_crawl_result_structure(result) # assert result["success"] is True # assert "extracted_content" in result # if "product_" in result["url"]: # Check product pages specifically # assert result["extracted_content"] is not None # try: # extracted = json.loads(result["extracted_content"]) # # Schema returns list even if one base match # assert isinstance(extracted, list) # if extracted: # item = extracted[0] # assert "product_title" in item and item["product_title"] # assert "price" in item and item["price"] # # Specs might be empty list if not found # assert "specs" in item and isinstance(item["specs"], list) # found_extracted_product = True # print(f"Extracted product: {item.get('product_title')}") # except (json.JSONDecodeError, AssertionError, IndexError) as e: # pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}") # # else: # # # Non-product pages might have None or empty list depending on schema match # # assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == [] # assert found_extracted_product, "Did not find any pages where product data was successfully extracted." # # 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup) # async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient): # """Test BFS deep crawl combined with LLMExtractionStrategy.""" # max_depth = 1 # Limit depth to keep LLM calls manageable # max_pages = 3 # payload = { # "urls": [DEEP_CRAWL_BASE_URL], # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, # "crawler_config": { # "type": "CrawlerRunConfig", # "params": { # "stream": False, # "cache_mode": "BYPASS", # "extraction_strategy": { # Apply LLM extraction to crawled pages # "type": "LLMExtractionStrategy", # "params": { # "instruction": "Extract the main H1 title and the text content of the first paragraph.", # "llm_config": { # Example override, rely on server default if possible # "type": "LLMConfig", # "params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing # }, # "schema": { # Expected JSON output # "type": "dict", # "value": { # "title": "PageContent", "type": "object", # "properties": { # "h1_title": {"type": "string"}, # "first_paragraph": {"type": "string"} # } # } # } # } # }, # "deep_crawl_strategy": { # "type": "BFSDeepCrawlStrategy", # "params": { # "max_depth": max_depth, # "max_pages": max_pages, # "filter_chain": { # "type": "FilterChain", # "params": { # "filters": [ # {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}, # {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}} # ] # } # } # } # } # } # } # } # try: # response = await async_client.post("/crawl", json=payload) # response.raise_for_status() # data = response.json() # except httpx.HTTPStatusError as e: # pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.") # except httpx.RequestError as e: # pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.") # assert data["success"] is True # assert len(data["results"]) > 0 # assert len(data["results"]) <= max_pages # found_llm_extraction = False # for result in data["results"]: # await assert_crawl_result_structure(result) # assert result["success"] is True # assert "extracted_content" in result # assert result["extracted_content"] is not None # try: # extracted = json.loads(result["extracted_content"]) # if isinstance(extracted, list): extracted = extracted[0] # Handle list output # assert isinstance(extracted, dict) # assert "h1_title" in extracted # Check keys based on schema # assert "first_paragraph" in extracted # found_llm_extraction = True # print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'") # except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e: # pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}") # assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page." # # 6. Deep Crawl with SSL Certificate Fetching # async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient): # """Test BFS deep crawl with fetch_ssl_certificate enabled.""" # max_depth = 0 # Only fetch for start URL to keep test fast # max_pages = 1 # payload = { # "urls": [DEEP_CRAWL_BASE_URL], # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, # "crawler_config": { # "type": "CrawlerRunConfig", # "params": { # "stream": False, # "cache_mode": "BYPASS", # "fetch_ssl_certificate": True, # <-- Enable SSL fetching # "deep_crawl_strategy": { # "type": "BFSDeepCrawlStrategy", # "params": { # "max_depth": max_depth, # "max_pages": max_pages, # } # } # } # } # } # response = await async_client.post("/crawl", json=payload) # response.raise_for_status() # data = response.json() # assert data["success"] is True # assert len(data["results"]) == 1 # result = data["results"][0] # await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field # assert result["success"] is True # # Check if SSL info was actually retrieved # if result["ssl_certificate"]: # # Assert directly using dictionary keys # assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict # assert "issuer" in result["ssl_certificate"] # assert "subject" in result["ssl_certificate"] # # --- MODIFIED ASSERTIONS --- # assert "not_before" in result["ssl_certificate"] # Check for the actual key # assert "not_after" in result["ssl_certificate"] # Check for the actual key # # --- END MODIFICATIONS --- # assert "fingerprint" in result["ssl_certificate"] # Check another key # # This print statement using .get() already works correctly with dictionaries # print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}") # print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}") # else: # # This part remains the same # print("SSL Certificate was null in the result.") # 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var) async def test_deep_crawl_with_proxies(self, async_client: httpx.AsyncClient): """Test BFS deep crawl using proxy rotation.""" proxies = load_proxies_from_env() if not proxies: pytest.skip("Skipping proxy test: PROXIES environment variable not set or empty.") print(f"\nTesting with {len(proxies)} proxies loaded from environment.") max_depth = 1 max_pages = 3 payload = { "urls": [DEEP_CRAWL_BASE_URL], # Use the dummy site # Use a BrowserConfig that *might* pick up proxy if set, but rely on CrawlerRunConfig "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "stream": False, "cache_mode": "BYPASS", "proxy_rotation_strategy": { # <-- Define the strategy "type": "RoundRobinProxyStrategy", "params": { # Convert ProxyConfig dicts back to the serialized format expected by server "proxies": [{"type": "ProxyConfig", "params": p} for p in proxies] } }, "deep_crawl_strategy": { "type": "BFSDeepCrawlStrategy", "params": { "max_depth": max_depth, "max_pages": max_pages, "filter_chain": { "type": "FilterChain", "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]} } } } } } } try: response = await async_client.post("/crawl", json=payload) response.raise_for_status() data = response.json() except httpx.HTTPStatusError as e: # Proxies often cause connection errors, catch them pytest.fail(f"Proxy deep crawl failed: {e}. Response: {e.response.text}. Are proxies valid and accessible by the server?") except httpx.RequestError as e: pytest.fail(f"Proxy deep crawl request failed: {e}. Are proxies valid and accessible?") assert data["success"] is True assert len(data["results"]) > 0 assert len(data["results"]) <= max_pages # Primary assertion is that the crawl succeeded *with* proxy config print(f"Proxy deep crawl completed successfully for {len(data['results'])} pages.") # Verifying specific proxy usage requires server logs or custom headers/responses # --- Main Execution Block (for running script directly) --- if __name__ == "__main__": pytest_args = ["-v", "-s", __file__] # Example: Run only proxy test # pytest_args.append("-k test_deep_crawl_with_proxies") print(f"Running pytest with args: {pytest_args}") exit_code = pytest.main(pytest_args) print(f"Pytest finished with exit code: {exit_code}")