Modify the test file

This commit is contained in:
UncleCode
2025-04-15 22:28:01 +08:00
parent 230f22da86
commit 5206c6f2d6

View File

@@ -119,411 +119,411 @@ class TestDeepCrawlEndpoints:
await check_server_health(async_client) await check_server_health(async_client)
# 1. Basic Deep Crawl # 1. Basic Deep Crawl
async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
    """Check that a shallow BFS deep crawl honours its depth and page limits.

    Posts a non-streaming crawl request to ``/crawl`` restricted to
    ``DEEP_CRAWL_DOMAIN`` and verifies that every result succeeded, stays on
    that domain, and that pages at both depth 0 and depth 1 were returned.
    """
    depth_limit = 1
    page_limit = 3  # the start URL plus two more pages

    # Assemble the request inside-out so each nesting level stays readable.
    # Only a domain filter is used: this is the minimal filter setup.
    domain_chain = {
        "type": "FilterChain",
        "params": {
            "filters": [
                {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
            ],
        },
    }
    bfs_strategy = {
        "type": "BFSDeepCrawlStrategy",
        "params": {
            "max_depth": depth_limit,
            "max_pages": page_limit,
            "filter_chain": domain_chain,
        },
    }
    run_params = {
        "stream": False,
        "cache_mode": "BYPASS",  # the cache mode is sent as its string value
        "deep_crawl_strategy": bfs_strategy,
    }
    payload = {
        "urls": [DEEP_CRAWL_BASE_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {"type": "CrawlerRunConfig", "params": run_params},
    }

    response = await async_client.post("/crawl", json=payload)
    response.raise_for_status()
    body = response.json()

    assert body["success"] is True
    pages = body["results"]
    assert isinstance(pages, list)
    # More than just the start URL, but never beyond the configured page cap.
    assert 1 < len(pages) <= page_limit

    depths_seen = set()
    for page in pages:
        await assert_crawl_result_structure(page)
        assert page["success"] is True
        assert DEEP_CRAWL_DOMAIN in page["url"]
        page_depth = page["metadata"]["depth"]
        assert page_depth <= depth_limit
        depths_seen.add(page_depth)

    # Both the start page and at least one linked page must have been crawled.
    assert 0 in depths_seen
    assert 1 in depths_seen
# # 2. Deep Crawl with Filtering # 2. Deep Crawl with Filtering
async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
    """Check a BFS deep crawl that combines domain, content-type and URL filters.

    Posts a non-streaming crawl request to ``/crawl`` and verifies that no
    returned URL contains ``category-3`` and that every result stays within
    the configured domain and depth.
    """
    depth_limit = 1
    page_limit = 5

    # One named dict per filter keeps the chain easy to scan.
    same_domain = {
        "type": "DomainFilter",
        "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]},
    }
    html_only = {
        "type": "ContentTypeFilter",
        "params": {"allowed_types": ["text/html"]},
    }
    # Wildcard pattern with "reverse" set: URLs that match are blocked.
    skip_category_3 = {
        "type": "URLPatternFilter",
        "params": {
            "patterns": ["*/category-3/*"],
            "reverse": True,
        },
    }
    bfs_strategy = {
        "type": "BFSDeepCrawlStrategy",
        "params": {
            "max_depth": depth_limit,
            "max_pages": page_limit,
            "filter_chain": {
                "type": "FilterChain",
                "params": {"filters": [same_domain, html_only, skip_category_3]},
            },
        },
    }
    payload = {
        "urls": [DEEP_CRAWL_BASE_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": False,
                "cache_mode": "BYPASS",
                "deep_crawl_strategy": bfs_strategy,
            },
        },
    }

    response = await async_client.post("/crawl", json=payload)
    response.raise_for_status()
    body = response.json()

    assert body["success"] is True
    pages = body["results"]
    assert 0 < len(pages) <= page_limit

    for page in pages:
        await assert_crawl_result_structure(page)
        assert page["success"] is True
        assert DEEP_CRAWL_DOMAIN in page["url"]
        assert "category-3" not in page["url"]  # the URL pattern filter took effect
        assert page["metadata"]["depth"] <= depth_limit
# # 3. Deep Crawl with Scoring # 3. Deep Crawl with Scoring
async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
    """Check a BFS deep crawl that ranks URLs with a composite scorer.

    Posts a non-streaming crawl request to ``/crawl`` with a keyword scorer
    and a path-depth scorer. Whether product URLs were found is only printed,
    not asserted; the hard checks are the page cap and per-result validity.
    """
    depth_limit = 1
    page_limit = 4

    # Basic domain restriction, same as the unscored crawl.
    domain_chain = {
        "type": "FilterChain",
        "params": {"filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]},
    }
    # Favour URLs containing "product".
    keyword_scorer = {
        "type": "KeywordRelevanceScorer",
        "params": {"keywords": ["product"], "weight": 1.0},
    }
    # Slightly penalise deep paths (note the negative weight).
    path_depth_scorer = {
        "type": "PathDepthScorer",
        "params": {"optimal_depth": 2, "weight": -0.2},
    }
    bfs_strategy = {
        "type": "BFSDeepCrawlStrategy",
        "params": {
            "max_depth": depth_limit,
            "max_pages": page_limit,
            "filter_chain": domain_chain,
            "url_scorer": {
                "type": "CompositeScorer",
                "params": {"scorers": [keyword_scorer, path_depth_scorer]},
            },
            # A "score_threshold" entry (e.g. 0.1) could be added here if needed.
        },
    }
    payload = {
        "urls": [DEEP_CRAWL_BASE_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": False,
                "cache_mode": "BYPASS",
                "deep_crawl_strategy": bfs_strategy,
            },
        },
    }

    response = await async_client.post("/crawl", json=payload)
    response.raise_for_status()
    body = response.json()

    assert body["success"] is True
    pages = body["results"]
    assert 0 < len(pages) <= page_limit

    # Informational only: a strict assertion on product URLs could be too
    # strict depending on the site structure and the crawl limits.
    product_urls_found = False
    for page in pages:
        if page["metadata"]["depth"] > 0 and "product_" in page["url"]:
            product_urls_found = True
            break
    print(f"Product URLs found among depth > 0 results: {product_urls_found}")

    for page in pages:
        await assert_crawl_result_structure(page)
        assert page["success"] is True
        assert page["metadata"]["depth"] <= depth_limit
# # 4. Deep Crawl with CSS Extraction # 4. Deep Crawl with CSS Extraction
async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
    """Check a BFS deep crawl combined with JsonCssExtractionStrategy.

    Posts a non-streaming crawl request to ``/crawl`` with a CSS extraction
    schema, then validates the extracted JSON on every result whose URL
    contains ``product_``. The test fails if no product data was extracted.
    """
    depth_limit = 6  # deep enough to reach product pages
    page_limit = 20

    # Schema describing the product details to pull out of a page.
    spec_fields = [
        {"name": "spec_name", "selector": ".spec-name", "type": "text"},
        {"name": "spec_value", "selector": ".spec-value", "type": "text"},
    ]
    product_schema = {
        "name": "ProductDetails",
        "baseSelector": "div.container",  # base element for a product page
        "fields": [
            {"name": "product_title", "selector": "h1", "type": "text"},
            {"name": "price", "selector": ".product-price", "type": "text"},
            {"name": "description", "selector": ".product-description p", "type": "text"},
            {"name": "specs", "selector": ".product-specs li", "type": "list", "fields": spec_fields},
        ],
    }

    # The extraction strategy is applied to ALL crawled pages.
    css_extraction = {
        "type": "JsonCssExtractionStrategy",
        "params": {"schema": {"type": "dict", "value": product_schema}},
    }
    # Only crawl HTML pages on the test domain.
    html_on_domain_chain = {
        "type": "FilterChain",
        "params": {
            "filters": [
                {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
                {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}},
            ],
        },
    }
    bfs_strategy = {
        "type": "BFSDeepCrawlStrategy",
        "params": {
            "max_depth": depth_limit,
            "max_pages": page_limit,
            "filter_chain": html_on_domain_chain,
            # Optional: a scorer could be added to prioritise product pages.
        },
    }
    payload = {
        "urls": [DEEP_CRAWL_BASE_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": False,
                "cache_mode": "BYPASS",
                "extraction_strategy": css_extraction,
                "deep_crawl_strategy": bfs_strategy,
            },
        },
    }

    response = await async_client.post("/crawl", json=payload)
    response.raise_for_status()
    body = response.json()

    assert body["success"] is True
    pages = body["results"]
    # NOTE: the upper bound on the number of results is not asserted here.
    assert len(pages) > 0

    found_extracted_product = False
    for page in pages:
        await assert_crawl_result_structure(page)
        assert page["success"] is True
        assert "extracted_content" in page

        # Only product pages are validated; nothing is asserted about the
        # extracted content of other pages.
        if "product_" not in page["url"]:
            continue

        assert page["extracted_content"] is not None
        try:
            records = json.loads(page["extracted_content"])
            # The schema yields a list even when only one base element matched.
            assert isinstance(records, list)
            if not records:
                continue
            item = records[0]
            assert "product_title" in item and item["product_title"]
            assert "price" in item and item["price"]
            # "specs" may be an empty list when no spec rows were found.
            assert "specs" in item and isinstance(item["specs"], list)
            found_extracted_product = True
            print(f"Extracted product: {item.get('product_title')}")
        except (json.JSONDecodeError, AssertionError, IndexError) as exc:
            # Re-raise as a test failure carrying the URL and the raw content.
            pytest.fail(f"Extraction validation failed for {page['url']}: {exc}\nContent: {page['extracted_content']}")

    assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
# # 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup) # 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
    """Check a BFS deep crawl combined with LLMExtractionStrategy.

    Posts a non-streaming crawl request to ``/crawl`` with an LLM extraction
    strategy and verifies that every result carries JSON containing the
    ``h1_title`` and ``first_paragraph`` keys. HTTP and transport errors are
    reported through ``pytest.fail`` with a hint about the LLM setup.
    """
    depth_limit = 1  # keep the number of LLM calls manageable
    page_limit = 3

    # JSON schema the LLM output is expected to follow.
    page_content_schema = {
        "title": "PageContent", "type": "object",
        "properties": {
            "h1_title": {"type": "string"},
            "first_paragraph": {"type": "string"},
        },
    }
    # The extraction strategy is applied to the crawled pages.
    llm_extraction = {
        "type": "LLMExtractionStrategy",
        "params": {
            "instruction": "Extract the main H1 title and the text content of the first paragraph.",
            # Example provider override, using a cheaper model for testing.
            "llm_config": {
                "type": "LLMConfig",
                "params": {"provider": "openai/gpt-4.1-mini"},
            },
            "schema": {"type": "dict", "value": page_content_schema},
        },
    }
    html_on_domain_chain = {
        "type": "FilterChain",
        "params": {
            "filters": [
                {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
                {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}},
            ],
        },
    }
    bfs_strategy = {
        "type": "BFSDeepCrawlStrategy",
        "params": {
            "max_depth": depth_limit,
            "max_pages": page_limit,
            "filter_chain": html_on_domain_chain,
        },
    }
    payload = {
        "urls": [DEEP_CRAWL_BASE_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": False,
                "cache_mode": "BYPASS",
                "extraction_strategy": llm_extraction,
                "deep_crawl_strategy": bfs_strategy,
            },
        },
    }

    try:
        response = await async_client.post("/crawl", json=payload)
        response.raise_for_status()
        body = response.json()
    except httpx.HTTPStatusError as status_err:
        pytest.fail(f"Deep Crawl + LLM extraction request failed: {status_err}. Response: {status_err.response.text}. Check server logs and LLM API key setup.")
    except httpx.RequestError as request_err:
        pytest.fail(f"Deep Crawl + LLM extraction request failed: {request_err}.")

    assert body["success"] is True
    pages = body["results"]
    assert 0 < len(pages) <= page_limit

    found_llm_extraction = False
    for page in pages:
        await assert_crawl_result_structure(page)
        assert page["success"] is True
        assert "extracted_content" in page
        assert page["extracted_content"] is not None
        try:
            parsed = json.loads(page["extracted_content"])
            # The output may be wrapped in a list; validate its first element.
            record = parsed[0] if isinstance(parsed, list) else parsed
            assert isinstance(record, dict)
            assert "h1_title" in record  # keys come from the schema above
            assert "first_paragraph" in record
            found_llm_extraction = True
            print(f"LLM extracted from {page['url']}: Title='{record.get('h1_title')}'")
        except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as exc:
            # Re-raise as a test failure carrying the URL and the raw content.
            pytest.fail(f"LLM extraction validation failed for {page['url']}: {exc}\nContent: {page['extracted_content']}")

    assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
# # 6. Deep Crawl with SSL Certificate Fetching # 6. Deep Crawl with SSL Certificate Fetching
async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
    """Check a BFS deep crawl with ``fetch_ssl_certificate`` enabled.

    Posts a non-streaming crawl request to ``/crawl`` limited to the start
    URL and, when the single result carries a certificate, verifies that it
    is a dict exposing the expected keys. A missing (falsy) certificate is
    only reported with a print and does not fail the test.
    """
    depth_limit = 0  # only the start URL, to keep the test fast
    page_limit = 1

    bfs_strategy = {
        "type": "BFSDeepCrawlStrategy",
        "params": {
            "max_depth": depth_limit,
            "max_pages": page_limit,
        },
    }
    payload = {
        "urls": [DEEP_CRAWL_BASE_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": False,
                "cache_mode": "BYPASS",
                "fetch_ssl_certificate": True,  # the option under test
                "deep_crawl_strategy": bfs_strategy,
            },
        },
    }

    response = await async_client.post("/crawl", json=payload)
    response.raise_for_status()
    body = response.json()

    assert body["success"] is True
    assert len(body["results"]) == 1
    page = body["results"][0]

    # check_ssl=True asks the helper to check the SSL field as well.
    await assert_crawl_result_structure(page, check_ssl=True)
    assert page["success"] is True

    certificate = page["ssl_certificate"]
    if not certificate:
        print("SSL Certificate was null in the result.")
        return

    # The certificate is a plain dict, so it is checked by key.
    assert isinstance(certificate, dict)
    for expected_key in ("issuer", "subject", "not_before", "not_after", "fingerprint"):
        assert expected_key in certificate

    print(f"SSL Issuer Org: {certificate.get('issuer', {}).get('O', 'N/A')}")
    print(f"SSL Valid From: {certificate.get('not_before', 'N/A')}")
# 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var) # 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)