Modify the test file
This commit is contained in:
@@ -119,411 +119,411 @@ class TestDeepCrawlEndpoints:
|
|||||||
await check_server_health(async_client)
|
await check_server_health(async_client)
|
||||||
|
|
||||||
# 1. Basic Deep Crawl
|
# 1. Basic Deep Crawl
|
||||||
# async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
|
async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
|
||||||
# """Test BFS deep crawl with limited depth and pages."""
|
"""Test BFS deep crawl with limited depth and pages."""
|
||||||
# max_depth = 1
|
max_depth = 1
|
||||||
# max_pages = 3 # start_url + 2 more
|
max_pages = 3 # start_url + 2 more
|
||||||
# payload = {
|
payload = {
|
||||||
# "urls": [DEEP_CRAWL_BASE_URL],
|
"urls": [DEEP_CRAWL_BASE_URL],
|
||||||
# "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
# "crawler_config": {
|
"crawler_config": {
|
||||||
# "type": "CrawlerRunConfig",
|
"type": "CrawlerRunConfig",
|
||||||
# "params": {
|
"params": {
|
||||||
# "stream": False,
|
"stream": False,
|
||||||
# "cache_mode": "BYPASS", # Use string value for CacheMode
|
"cache_mode": "BYPASS", # Use string value for CacheMode
|
||||||
# "deep_crawl_strategy": {
|
"deep_crawl_strategy": {
|
||||||
# "type": "BFSDeepCrawlStrategy",
|
"type": "BFSDeepCrawlStrategy",
|
||||||
# "params": {
|
"params": {
|
||||||
# "max_depth": max_depth,
|
"max_depth": max_depth,
|
||||||
# "max_pages": max_pages,
|
"max_pages": max_pages,
|
||||||
# # Minimal filters for basic test
|
# Minimal filters for basic test
|
||||||
# "filter_chain": {
|
"filter_chain": {
|
||||||
# "type": "FilterChain",
|
"type": "FilterChain",
|
||||||
# "params": {
|
"params": {
|
||||||
# "filters": [
|
"filters": [
|
||||||
# {
|
{
|
||||||
# "type": "DomainFilter",
|
"type": "DomainFilter",
|
||||||
# "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
|
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
|
||||||
# }
|
}
|
||||||
# ]
|
]
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# response = await async_client.post("/crawl", json=payload)
|
response = await async_client.post("/crawl", json=payload)
|
||||||
# response.raise_for_status()
|
response.raise_for_status()
|
||||||
# data = response.json()
|
data = response.json()
|
||||||
|
|
||||||
# assert data["success"] is True
|
assert data["success"] is True
|
||||||
# assert isinstance(data["results"], list)
|
assert isinstance(data["results"], list)
|
||||||
# assert len(data["results"]) > 1 # Should be more than just the start URL
|
assert len(data["results"]) > 1 # Should be more than just the start URL
|
||||||
# assert len(data["results"]) <= max_pages # Respect max_pages
|
assert len(data["results"]) <= max_pages # Respect max_pages
|
||||||
|
|
||||||
# found_depth_0 = False
|
found_depth_0 = False
|
||||||
# found_depth_1 = False
|
found_depth_1 = False
|
||||||
# for result in data["results"]:
|
for result in data["results"]:
|
||||||
# await assert_crawl_result_structure(result)
|
await assert_crawl_result_structure(result)
|
||||||
# assert result["success"] is True
|
assert result["success"] is True
|
||||||
# assert DEEP_CRAWL_DOMAIN in result["url"]
|
assert DEEP_CRAWL_DOMAIN in result["url"]
|
||||||
# depth = result["metadata"]["depth"]
|
depth = result["metadata"]["depth"]
|
||||||
# assert depth <= max_depth
|
assert depth <= max_depth
|
||||||
# if depth == 0: found_depth_0 = True
|
if depth == 0: found_depth_0 = True
|
||||||
# if depth == 1: found_depth_1 = True
|
if depth == 1: found_depth_1 = True
|
||||||
|
|
||||||
# assert found_depth_0
|
assert found_depth_0
|
||||||
# assert found_depth_1
|
assert found_depth_1
|
||||||
|
|
||||||
# # 2. Deep Crawl with Filtering
|
# 2. Deep Crawl with Filtering
|
||||||
# async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
|
async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
|
||||||
# """Test BFS deep crawl with content type and domain filters."""
|
"""Test BFS deep crawl with content type and domain filters."""
|
||||||
# max_depth = 1
|
max_depth = 1
|
||||||
# max_pages = 5
|
max_pages = 5
|
||||||
# payload = {
|
payload = {
|
||||||
# "urls": [DEEP_CRAWL_BASE_URL],
|
"urls": [DEEP_CRAWL_BASE_URL],
|
||||||
# "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
# "crawler_config": {
|
"crawler_config": {
|
||||||
# "type": "CrawlerRunConfig",
|
"type": "CrawlerRunConfig",
|
||||||
# "params": {
|
"params": {
|
||||||
# "stream": False,
|
"stream": False,
|
||||||
# "cache_mode": "BYPASS",
|
"cache_mode": "BYPASS",
|
||||||
# "deep_crawl_strategy": {
|
"deep_crawl_strategy": {
|
||||||
# "type": "BFSDeepCrawlStrategy",
|
"type": "BFSDeepCrawlStrategy",
|
||||||
# "params": {
|
"params": {
|
||||||
# "max_depth": max_depth,
|
"max_depth": max_depth,
|
||||||
# "max_pages": max_pages,
|
"max_pages": max_pages,
|
||||||
# "filter_chain": {
|
"filter_chain": {
|
||||||
# "type": "FilterChain",
|
"type": "FilterChain",
|
||||||
# "params": {
|
"params": {
|
||||||
# "filters": [
|
"filters": [
|
||||||
# {
|
{
|
||||||
# "type": "DomainFilter",
|
"type": "DomainFilter",
|
||||||
# "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
|
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
|
||||||
# },
|
},
|
||||||
# {
|
{
|
||||||
# "type": "ContentTypeFilter",
|
"type": "ContentTypeFilter",
|
||||||
# "params": {"allowed_types": ["text/html"]}
|
"params": {"allowed_types": ["text/html"]}
|
||||||
# },
|
},
|
||||||
# # Example: Exclude specific paths using regex
|
# Example: Exclude specific paths using regex
|
||||||
# {
|
{
|
||||||
# "type": "URLPatternFilter",
|
"type": "URLPatternFilter",
|
||||||
# "params": {
|
"params": {
|
||||||
# "patterns": ["*/category-3/*"], # Block category 3
|
"patterns": ["*/category-3/*"], # Block category 3
|
||||||
# "reverse": True # Block if match
|
"reverse": True # Block if match
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# ]
|
]
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# response = await async_client.post("/crawl", json=payload)
|
response = await async_client.post("/crawl", json=payload)
|
||||||
# response.raise_for_status()
|
response.raise_for_status()
|
||||||
# data = response.json()
|
data = response.json()
|
||||||
|
|
||||||
# assert data["success"] is True
|
assert data["success"] is True
|
||||||
# assert len(data["results"]) > 0
|
assert len(data["results"]) > 0
|
||||||
# assert len(data["results"]) <= max_pages
|
assert len(data["results"]) <= max_pages
|
||||||
|
|
||||||
# for result in data["results"]:
|
for result in data["results"]:
|
||||||
# await assert_crawl_result_structure(result)
|
await assert_crawl_result_structure(result)
|
||||||
# assert result["success"] is True
|
assert result["success"] is True
|
||||||
# assert DEEP_CRAWL_DOMAIN in result["url"]
|
assert DEEP_CRAWL_DOMAIN in result["url"]
|
||||||
# assert "category-3" not in result["url"] # Check if filter worked
|
assert "category-3" not in result["url"] # Check if filter worked
|
||||||
# assert result["metadata"]["depth"] <= max_depth
|
assert result["metadata"]["depth"] <= max_depth
|
||||||
|
|
||||||
# # 3. Deep Crawl with Scoring
|
# 3. Deep Crawl with Scoring
|
||||||
# async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
|
async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
|
||||||
# """Test BFS deep crawl with URL scoring."""
|
"""Test BFS deep crawl with URL scoring."""
|
||||||
# max_depth = 1
|
max_depth = 1
|
||||||
# max_pages = 4
|
max_pages = 4
|
||||||
# payload = {
|
payload = {
|
||||||
# "urls": [DEEP_CRAWL_BASE_URL],
|
"urls": [DEEP_CRAWL_BASE_URL],
|
||||||
# "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
# "crawler_config": {
|
"crawler_config": {
|
||||||
# "type": "CrawlerRunConfig",
|
"type": "CrawlerRunConfig",
|
||||||
# "params": {
|
"params": {
|
||||||
# "stream": False,
|
"stream": False,
|
||||||
# "cache_mode": "BYPASS",
|
"cache_mode": "BYPASS",
|
||||||
# "deep_crawl_strategy": {
|
"deep_crawl_strategy": {
|
||||||
# "type": "BFSDeepCrawlStrategy",
|
"type": "BFSDeepCrawlStrategy",
|
||||||
# "params": {
|
"params": {
|
||||||
# "max_depth": max_depth,
|
"max_depth": max_depth,
|
||||||
# "max_pages": max_pages,
|
"max_pages": max_pages,
|
||||||
# "filter_chain": { # Keep basic domain filter
|
"filter_chain": { # Keep basic domain filter
|
||||||
# "type": "FilterChain",
|
"type": "FilterChain",
|
||||||
# "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
|
"params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
|
||||||
# },
|
},
|
||||||
# "url_scorer": { # Add scorer
|
"url_scorer": { # Add scorer
|
||||||
# "type": "CompositeScorer",
|
"type": "CompositeScorer",
|
||||||
# "params": {
|
"params": {
|
||||||
# "scorers": [
|
"scorers": [
|
||||||
# { # Favor pages with 'product' in the URL
|
{ # Favor pages with 'product' in the URL
|
||||||
# "type": "KeywordRelevanceScorer",
|
"type": "KeywordRelevanceScorer",
|
||||||
# "params": {"keywords": ["product"], "weight": 1.0}
|
"params": {"keywords": ["product"], "weight": 1.0}
|
||||||
# },
|
},
|
||||||
# { # Penalize deep paths slightly
|
{ # Penalize deep paths slightly
|
||||||
# "type": "PathDepthScorer",
|
"type": "PathDepthScorer",
|
||||||
# "params": {"optimal_depth": 2, "weight": -0.2}
|
"params": {"optimal_depth": 2, "weight": -0.2}
|
||||||
# }
|
}
|
||||||
# ]
|
]
|
||||||
# }
|
}
|
||||||
# },
|
},
|
||||||
# # Set a threshold if needed: "score_threshold": 0.1
|
# Set a threshold if needed: "score_threshold": 0.1
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# response = await async_client.post("/crawl", json=payload)
|
response = await async_client.post("/crawl", json=payload)
|
||||||
# response.raise_for_status()
|
response.raise_for_status()
|
||||||
# data = response.json()
|
data = response.json()
|
||||||
|
|
||||||
# assert data["success"] is True
|
assert data["success"] is True
|
||||||
# assert len(data["results"]) > 0
|
assert len(data["results"]) > 0
|
||||||
# assert len(data["results"]) <= max_pages
|
assert len(data["results"]) <= max_pages
|
||||||
|
|
||||||
# # Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
|
# Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
|
||||||
# product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
|
product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
|
||||||
# print(f"Product URLs found among depth > 0 results: {product_urls_found}")
|
print(f"Product URLs found among depth > 0 results: {product_urls_found}")
|
||||||
# # We expect scoring to prioritize product pages if available within limits
|
# We expect scoring to prioritize product pages if available within limits
|
||||||
# # assert product_urls_found # This might be too strict depending on site structure and limits
|
# assert product_urls_found # This might be too strict depending on site structure and limits
|
||||||
|
|
||||||
# for result in data["results"]:
|
for result in data["results"]:
|
||||||
# await assert_crawl_result_structure(result)
|
await assert_crawl_result_structure(result)
|
||||||
# assert result["success"] is True
|
assert result["success"] is True
|
||||||
# assert result["metadata"]["depth"] <= max_depth
|
assert result["metadata"]["depth"] <= max_depth
|
||||||
|
|
||||||
# # 4. Deep Crawl with CSS Extraction
|
# 4. Deep Crawl with CSS Extraction
|
||||||
# async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
|
async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
|
||||||
# """Test BFS deep crawl combined with JsonCssExtractionStrategy."""
|
"""Test BFS deep crawl combined with JsonCssExtractionStrategy."""
|
||||||
# max_depth = 6 # Go deep enough to reach product pages
|
max_depth = 6 # Go deep enough to reach product pages
|
||||||
# max_pages = 20
|
max_pages = 20
|
||||||
# # Schema to extract product details
|
# Schema to extract product details
|
||||||
# product_schema = {
|
product_schema = {
|
||||||
# "name": "ProductDetails",
|
"name": "ProductDetails",
|
||||||
# "baseSelector": "div.container", # Base for product page
|
"baseSelector": "div.container", # Base for product page
|
||||||
# "fields": [
|
"fields": [
|
||||||
# {"name": "product_title", "selector": "h1", "type": "text"},
|
{"name": "product_title", "selector": "h1", "type": "text"},
|
||||||
# {"name": "price", "selector": ".product-price", "type": "text"},
|
{"name": "price", "selector": ".product-price", "type": "text"},
|
||||||
# {"name": "description", "selector": ".product-description p", "type": "text"},
|
{"name": "description", "selector": ".product-description p", "type": "text"},
|
||||||
# {"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[
|
{"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[
|
||||||
# {"name": "spec_name", "selector": ".spec-name", "type": "text"},
|
{"name": "spec_name", "selector": ".spec-name", "type": "text"},
|
||||||
# {"name": "spec_value", "selector": ".spec-value", "type": "text"}
|
{"name": "spec_value", "selector": ".spec-value", "type": "text"}
|
||||||
# ]}
|
]}
|
||||||
# ]
|
]
|
||||||
# }
|
}
|
||||||
# payload = {
|
payload = {
|
||||||
# "urls": [DEEP_CRAWL_BASE_URL],
|
"urls": [DEEP_CRAWL_BASE_URL],
|
||||||
# "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
# "crawler_config": {
|
"crawler_config": {
|
||||||
# "type": "CrawlerRunConfig",
|
"type": "CrawlerRunConfig",
|
||||||
# "params": {
|
"params": {
|
||||||
# "stream": False,
|
"stream": False,
|
||||||
# "cache_mode": "BYPASS",
|
"cache_mode": "BYPASS",
|
||||||
# "extraction_strategy": { # Apply extraction to ALL crawled pages
|
"extraction_strategy": { # Apply extraction to ALL crawled pages
|
||||||
# "type": "JsonCssExtractionStrategy",
|
"type": "JsonCssExtractionStrategy",
|
||||||
# "params": {"schema": {"type": "dict", "value": product_schema}}
|
"params": {"schema": {"type": "dict", "value": product_schema}}
|
||||||
# },
|
},
|
||||||
# "deep_crawl_strategy": {
|
"deep_crawl_strategy": {
|
||||||
# "type": "BFSDeepCrawlStrategy",
|
"type": "BFSDeepCrawlStrategy",
|
||||||
# "params": {
|
"params": {
|
||||||
# "max_depth": max_depth,
|
"max_depth": max_depth,
|
||||||
# "max_pages": max_pages,
|
"max_pages": max_pages,
|
||||||
# "filter_chain": { # Only crawl HTML on our domain
|
"filter_chain": { # Only crawl HTML on our domain
|
||||||
# "type": "FilterChain",
|
"type": "FilterChain",
|
||||||
# "params": {
|
"params": {
|
||||||
# "filters": [
|
"filters": [
|
||||||
# {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
|
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
|
||||||
# {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
|
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
|
||||||
# ]
|
]
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# # Optional: Add scoring to prioritize product pages for extraction
|
# Optional: Add scoring to prioritize product pages for extraction
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# response = await async_client.post("/crawl", json=payload)
|
response = await async_client.post("/crawl", json=payload)
|
||||||
# response.raise_for_status()
|
response.raise_for_status()
|
||||||
# data = response.json()
|
data = response.json()
|
||||||
|
|
||||||
# assert data["success"] is True
|
assert data["success"] is True
|
||||||
# assert len(data["results"]) > 0
|
assert len(data["results"]) > 0
|
||||||
# # assert len(data["results"]) <= max_pages
|
# assert len(data["results"]) <= max_pages
|
||||||
|
|
||||||
# found_extracted_product = False
|
found_extracted_product = False
|
||||||
# for result in data["results"]:
|
for result in data["results"]:
|
||||||
# await assert_crawl_result_structure(result)
|
await assert_crawl_result_structure(result)
|
||||||
# assert result["success"] is True
|
assert result["success"] is True
|
||||||
# assert "extracted_content" in result
|
assert "extracted_content" in result
|
||||||
# if "product_" in result["url"]: # Check product pages specifically
|
if "product_" in result["url"]: # Check product pages specifically
|
||||||
# assert result["extracted_content"] is not None
|
assert result["extracted_content"] is not None
|
||||||
# try:
|
try:
|
||||||
# extracted = json.loads(result["extracted_content"])
|
extracted = json.loads(result["extracted_content"])
|
||||||
# # Schema returns list even if one base match
|
# Schema returns list even if one base match
|
||||||
# assert isinstance(extracted, list)
|
assert isinstance(extracted, list)
|
||||||
# if extracted:
|
if extracted:
|
||||||
# item = extracted[0]
|
item = extracted[0]
|
||||||
# assert "product_title" in item and item["product_title"]
|
assert "product_title" in item and item["product_title"]
|
||||||
# assert "price" in item and item["price"]
|
assert "price" in item and item["price"]
|
||||||
# # Specs might be empty list if not found
|
# Specs might be empty list if not found
|
||||||
# assert "specs" in item and isinstance(item["specs"], list)
|
assert "specs" in item and isinstance(item["specs"], list)
|
||||||
# found_extracted_product = True
|
found_extracted_product = True
|
||||||
# print(f"Extracted product: {item.get('product_title')}")
|
print(f"Extracted product: {item.get('product_title')}")
|
||||||
# except (json.JSONDecodeError, AssertionError, IndexError) as e:
|
except (json.JSONDecodeError, AssertionError, IndexError) as e:
|
||||||
# pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
|
pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
|
||||||
# # else:
|
# else:
|
||||||
# # # Non-product pages might have None or empty list depending on schema match
|
# # Non-product pages might have None or empty list depending on schema match
|
||||||
# # assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []
|
# assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []
|
||||||
|
|
||||||
# assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
|
assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
|
||||||
|
|
||||||
# # 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
|
# 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
|
||||||
# async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
|
async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
|
||||||
# """Test BFS deep crawl combined with LLMExtractionStrategy."""
|
"""Test BFS deep crawl combined with LLMExtractionStrategy."""
|
||||||
# max_depth = 1 # Limit depth to keep LLM calls manageable
|
max_depth = 1 # Limit depth to keep LLM calls manageable
|
||||||
# max_pages = 3
|
max_pages = 3
|
||||||
# payload = {
|
payload = {
|
||||||
# "urls": [DEEP_CRAWL_BASE_URL],
|
"urls": [DEEP_CRAWL_BASE_URL],
|
||||||
# "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
# "crawler_config": {
|
"crawler_config": {
|
||||||
# "type": "CrawlerRunConfig",
|
"type": "CrawlerRunConfig",
|
||||||
# "params": {
|
"params": {
|
||||||
# "stream": False,
|
"stream": False,
|
||||||
# "cache_mode": "BYPASS",
|
"cache_mode": "BYPASS",
|
||||||
# "extraction_strategy": { # Apply LLM extraction to crawled pages
|
"extraction_strategy": { # Apply LLM extraction to crawled pages
|
||||||
# "type": "LLMExtractionStrategy",
|
"type": "LLMExtractionStrategy",
|
||||||
# "params": {
|
"params": {
|
||||||
# "instruction": "Extract the main H1 title and the text content of the first paragraph.",
|
"instruction": "Extract the main H1 title and the text content of the first paragraph.",
|
||||||
# "llm_config": { # Example override, rely on server default if possible
|
"llm_config": { # Example override, rely on server default if possible
|
||||||
# "type": "LLMConfig",
|
"type": "LLMConfig",
|
||||||
# "params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing
|
"params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing
|
||||||
# },
|
},
|
||||||
# "schema": { # Expected JSON output
|
"schema": { # Expected JSON output
|
||||||
# "type": "dict",
|
"type": "dict",
|
||||||
# "value": {
|
"value": {
|
||||||
# "title": "PageContent", "type": "object",
|
"title": "PageContent", "type": "object",
|
||||||
# "properties": {
|
"properties": {
|
||||||
# "h1_title": {"type": "string"},
|
"h1_title": {"type": "string"},
|
||||||
# "first_paragraph": {"type": "string"}
|
"first_paragraph": {"type": "string"}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# },
|
},
|
||||||
# "deep_crawl_strategy": {
|
"deep_crawl_strategy": {
|
||||||
# "type": "BFSDeepCrawlStrategy",
|
"type": "BFSDeepCrawlStrategy",
|
||||||
# "params": {
|
"params": {
|
||||||
# "max_depth": max_depth,
|
"max_depth": max_depth,
|
||||||
# "max_pages": max_pages,
|
"max_pages": max_pages,
|
||||||
# "filter_chain": {
|
"filter_chain": {
|
||||||
# "type": "FilterChain",
|
"type": "FilterChain",
|
||||||
# "params": {
|
"params": {
|
||||||
# "filters": [
|
"filters": [
|
||||||
# {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
|
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
|
||||||
# {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
|
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
|
||||||
# ]
|
]
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
|
|
||||||
# try:
|
try:
|
||||||
# response = await async_client.post("/crawl", json=payload)
|
response = await async_client.post("/crawl", json=payload)
|
||||||
# response.raise_for_status()
|
response.raise_for_status()
|
||||||
# data = response.json()
|
data = response.json()
|
||||||
# except httpx.HTTPStatusError as e:
|
except httpx.HTTPStatusError as e:
|
||||||
# pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
|
pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
|
||||||
# except httpx.RequestError as e:
|
except httpx.RequestError as e:
|
||||||
# pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")
|
pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")
|
||||||
|
|
||||||
|
|
||||||
# assert data["success"] is True
|
assert data["success"] is True
|
||||||
# assert len(data["results"]) > 0
|
assert len(data["results"]) > 0
|
||||||
# assert len(data["results"]) <= max_pages
|
assert len(data["results"]) <= max_pages
|
||||||
|
|
||||||
# found_llm_extraction = False
|
found_llm_extraction = False
|
||||||
# for result in data["results"]:
|
for result in data["results"]:
|
||||||
# await assert_crawl_result_structure(result)
|
await assert_crawl_result_structure(result)
|
||||||
# assert result["success"] is True
|
assert result["success"] is True
|
||||||
# assert "extracted_content" in result
|
assert "extracted_content" in result
|
||||||
# assert result["extracted_content"] is not None
|
assert result["extracted_content"] is not None
|
||||||
# try:
|
try:
|
||||||
# extracted = json.loads(result["extracted_content"])
|
extracted = json.loads(result["extracted_content"])
|
||||||
# if isinstance(extracted, list): extracted = extracted[0] # Handle list output
|
if isinstance(extracted, list): extracted = extracted[0] # Handle list output
|
||||||
# assert isinstance(extracted, dict)
|
assert isinstance(extracted, dict)
|
||||||
# assert "h1_title" in extracted # Check keys based on schema
|
assert "h1_title" in extracted # Check keys based on schema
|
||||||
# assert "first_paragraph" in extracted
|
assert "first_paragraph" in extracted
|
||||||
# found_llm_extraction = True
|
found_llm_extraction = True
|
||||||
# print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
|
print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
|
||||||
# except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
|
except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
|
||||||
# pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
|
pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
|
||||||
|
|
||||||
# assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
|
assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
|
||||||
|
|
||||||
|
|
||||||
# # 6. Deep Crawl with SSL Certificate Fetching
|
# 6. Deep Crawl with SSL Certificate Fetching
|
||||||
# async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
|
async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
|
||||||
# """Test BFS deep crawl with fetch_ssl_certificate enabled."""
|
"""Test BFS deep crawl with fetch_ssl_certificate enabled."""
|
||||||
# max_depth = 0 # Only fetch for start URL to keep test fast
|
max_depth = 0 # Only fetch for start URL to keep test fast
|
||||||
# max_pages = 1
|
max_pages = 1
|
||||||
# payload = {
|
payload = {
|
||||||
# "urls": [DEEP_CRAWL_BASE_URL],
|
"urls": [DEEP_CRAWL_BASE_URL],
|
||||||
# "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
# "crawler_config": {
|
"crawler_config": {
|
||||||
# "type": "CrawlerRunConfig",
|
"type": "CrawlerRunConfig",
|
||||||
# "params": {
|
"params": {
|
||||||
# "stream": False,
|
"stream": False,
|
||||||
# "cache_mode": "BYPASS",
|
"cache_mode": "BYPASS",
|
||||||
# "fetch_ssl_certificate": True, # <-- Enable SSL fetching
|
"fetch_ssl_certificate": True, # <-- Enable SSL fetching
|
||||||
# "deep_crawl_strategy": {
|
"deep_crawl_strategy": {
|
||||||
# "type": "BFSDeepCrawlStrategy",
|
"type": "BFSDeepCrawlStrategy",
|
||||||
# "params": {
|
"params": {
|
||||||
# "max_depth": max_depth,
|
"max_depth": max_depth,
|
||||||
# "max_pages": max_pages,
|
"max_pages": max_pages,
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
# response = await async_client.post("/crawl", json=payload)
|
response = await async_client.post("/crawl", json=payload)
|
||||||
# response.raise_for_status()
|
response.raise_for_status()
|
||||||
# data = response.json()
|
data = response.json()
|
||||||
|
|
||||||
# assert data["success"] is True
|
assert data["success"] is True
|
||||||
# assert len(data["results"]) == 1
|
assert len(data["results"]) == 1
|
||||||
# result = data["results"][0]
|
result = data["results"][0]
|
||||||
|
|
||||||
# await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field
|
await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field
|
||||||
# assert result["success"] is True
|
assert result["success"] is True
|
||||||
# # Check if SSL info was actually retrieved
|
# Check if SSL info was actually retrieved
|
||||||
# if result["ssl_certificate"]:
|
if result["ssl_certificate"]:
|
||||||
# # Assert directly using dictionary keys
|
# Assert directly using dictionary keys
|
||||||
# assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict
|
assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict
|
||||||
# assert "issuer" in result["ssl_certificate"]
|
assert "issuer" in result["ssl_certificate"]
|
||||||
# assert "subject" in result["ssl_certificate"]
|
assert "subject" in result["ssl_certificate"]
|
||||||
# # --- MODIFIED ASSERTIONS ---
|
# --- MODIFIED ASSERTIONS ---
|
||||||
# assert "not_before" in result["ssl_certificate"] # Check for the actual key
|
assert "not_before" in result["ssl_certificate"] # Check for the actual key
|
||||||
# assert "not_after" in result["ssl_certificate"] # Check for the actual key
|
assert "not_after" in result["ssl_certificate"] # Check for the actual key
|
||||||
# # --- END MODIFICATIONS ---
|
# --- END MODIFICATIONS ---
|
||||||
# assert "fingerprint" in result["ssl_certificate"] # Check another key
|
assert "fingerprint" in result["ssl_certificate"] # Check another key
|
||||||
|
|
||||||
# # This print statement using .get() already works correctly with dictionaries
|
# This print statement using .get() already works correctly with dictionaries
|
||||||
# print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
|
print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
|
||||||
# print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
|
print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
|
||||||
# else:
|
else:
|
||||||
# # This part remains the same
|
# This part remains the same
|
||||||
# print("SSL Certificate was null in the result.")
|
print("SSL Certificate was null in the result.")
|
||||||
|
|
||||||
|
|
||||||
# 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)
|
# 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)
|
||||||
|
|||||||
Reference in New Issue
Block a user