refactor(proxy): move ProxyConfig to async_configs and improve LLM token handling
Moved ProxyConfig class from proxy_strategy.py to async_configs.py for better organization. Improved LLM token handling with new PROVIDER_MODELS_PREFIXES. Added test cases for deep crawling and proxy rotation. Removed docker_config from BrowserConfig as it's handled separately. BREAKING CHANGE: ProxyConfig import path changed from crawl4ai.proxy_strategy to crawl4ai
This commit is contained in:
596
tests/docker/test_rest_api_deep_crawl.py
Normal file
596
tests/docker/test_rest_api_deep_crawl.py
Normal file
@@ -0,0 +1,596 @@
|
||||
# ==== File: test_rest_api_deep_crawl.py ====
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import httpx
|
||||
import json
|
||||
import asyncio
|
||||
import os
|
||||
from typing import List, Dict, Any, AsyncGenerator
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()  # Load environment variables from a .env file, if one is present

# --- Test Configuration ---
# Root URL of the running server under test; override it with the
# CRAWL4AI_TEST_URL environment variable. This module previously assigned
# BASE_URL twice in a row: the first assignment (default
# http://localhost:11235) was immediately overwritten and therefore dead, so
# only the effective default is kept here.
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020")
DEEP_CRAWL_BASE_URL = "https://docs.crawl4ai.com/samples/deepcrawl/"
DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com"  # Used for the domain filter
|
||||
|
||||
# --- Helper Functions ---
|
||||
def load_proxies_from_env() -> List[Dict]:
    """Parse the PROXIES environment variable into proxy config dicts.

    PROXIES is a comma-separated list; each entry is either ``ip:port`` or
    ``ip:port:username:password``. Blank entries are ignored. Entries with any
    other number of fields, or with an empty field, are reported on stdout
    and skipped.

    Returns:
        One dict per valid entry with the keys ``server`` (an ``http://``
        URL) and ``ip``, plus ``username`` and ``password`` when credentials
        were supplied. An empty list when PROXIES is unset or empty.
    """
    proxies_str = os.getenv("PROXIES", "")
    if not proxies_str:
        print("PROXIES environment variable not set or empty.")
        return []

    proxies: List[Dict] = []
    for entry in proxies_str.split(","):
        entry = entry.strip()
        if not entry:
            continue

        parts = entry.split(":")
        # Reject empty fields as well as wrong field counts: an entry such as
        # ":8080" would otherwise yield the unusable server "http://:8080".
        if len(parts) not in (2, 4) or not all(parts):
            print(f"Skipping invalid proxy string format: {entry}")
            continue

        ip, port = parts[0], parts[1]
        proxy = {"server": f"http://{ip}:{port}"}  # Assuming http, adjust if needed
        if len(parts) == 4:
            proxy["username"] = parts[2]
            proxy["password"] = parts[3]
        proxy["ip"] = ip  # Keep the original IP alongside the server URL
        proxies.append(proxy)

    return proxies
|
||||
|
||||
|
||||
async def check_server_health(client: httpx.AsyncClient):
    """Confirm the server answers ``GET /health`` with a success status.

    Prints the decoded health payload and returns True on success. A request
    error or an error status aborts the current test through ``pytest.fail``
    (without a traceback), naming the BASE_URL that was targeted.
    """
    try:
        health = await client.get("/health")
        health.raise_for_status()
        payload = health.json()
    except (httpx.RequestError, httpx.HTTPStatusError) as exc:
        pytest.fail(f"Server health check failed: {exc}. Is the server running at {BASE_URL}?", pytrace=False)
    else:
        print(f"\nServer healthy: {payload}")
        return True
|
||||
|
||||
async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False):
    """Assert that ``result`` has the basic shape of one crawl result.

    Args:
        result: A single decoded crawl result.
        check_ssl: When true, additionally require an ``ssl_certificate``
            entry that is either a dict or None.

    Raises:
        AssertionError: If any structural expectation is not met.
    """
    assert isinstance(result, dict)
    # "html" is required because basic crawls should return HTML.
    for required_key in ("url", "success", "html", "metadata"):
        assert required_key in result

    metadata = result["metadata"]
    assert isinstance(metadata, dict)
    assert "depth" in metadata  # Deep crawls add depth

    if not check_ssl:
        return

    assert "ssl_certificate" in result  # SSL info must be present, even if null
    certificate = result["ssl_certificate"]
    assert certificate is None or isinstance(certificate, dict)
|
||||
|
||||
|
||||
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
    """Collect crawl results from an NDJSON streaming response.

    Empty lines are ignored. Reading stops at the first object whose
    ``status`` is ``"completed"``. Objects carrying a truthy ``url`` are
    collected as results; any other object is only printed. A line that is
    not valid JSON fails the test through ``pytest.fail``.

    Returns:
        The collected result objects, in arrival order.

    Raises:
        AssertionError: If the stream ends without a completion marker.
    """
    collected = []
    saw_completion = False

    async for line in response.aiter_lines():
        if not line:
            continue

        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            pytest.fail(f"Failed to decode JSON line: {line}")

        if record.get("status") == "completed":
            saw_completion = True
            break  # Nothing after the completion marker is processed

        if record.get("url"):  # Looks like a result object
            collected.append(record)
        else:
            print(f"Received non-result JSON line: {record}")  # Other status messages

    assert saw_completion, "Streaming response did not end with a completion marker."
    return collected
|
||||
|
||||
|
||||
# --- Pytest Fixtures ---
|
||||
@pytest_asyncio.fixture(scope="function")
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
    """Yield an ``httpx.AsyncClient`` rooted at BASE_URL for a single test.

    The client is closed when the ``async with`` block exits, so no explicit
    teardown is needed.
    """
    # Deep crawls can run long, hence the generous 300 second timeout.
    session = httpx.AsyncClient(base_url=BASE_URL, timeout=300.0)
    async with session as client:
        yield client
|
||||
|
||||
# --- Test Class ---
|
||||
@pytest.mark.asyncio
|
||||
class TestDeepCrawlEndpoints:
|
||||
|
||||
@pytest_asyncio.fixture(autouse=True)
async def check_health_before_tests(self, async_client: httpx.AsyncClient):
    """Autouse fixture: run the server health check before every test of this class.

    Delegates to ``check_server_health`` using the per-test ``async_client``.
    """
    await check_server_health(async_client)
|
||||
|
||||
# 1. Basic Deep Crawl
|
||||
# async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
|
||||
# """Test BFS deep crawl with limited depth and pages."""
|
||||
# max_depth = 1
|
||||
# max_pages = 3 # start_url + 2 more
|
||||
# payload = {
|
||||
# "urls": [DEEP_CRAWL_BASE_URL],
|
||||
# "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
# "crawler_config": {
|
||||
# "type": "CrawlerRunConfig",
|
||||
# "params": {
|
||||
# "stream": False,
|
||||
# "cache_mode": "BYPASS", # Use string value for CacheMode
|
||||
# "deep_crawl_strategy": {
|
||||
# "type": "BFSDeepCrawlStrategy",
|
||||
# "params": {
|
||||
# "max_depth": max_depth,
|
||||
# "max_pages": max_pages,
|
||||
# # Minimal filters for basic test
|
||||
# "filter_chain": {
|
||||
# "type": "FilterChain",
|
||||
# "params": {
|
||||
# "filters": [
|
||||
# {
|
||||
# "type": "DomainFilter",
|
||||
# "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
|
||||
# }
|
||||
# ]
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# response = await async_client.post("/crawl", json=payload)
|
||||
# response.raise_for_status()
|
||||
# data = response.json()
|
||||
|
||||
# assert data["success"] is True
|
||||
# assert isinstance(data["results"], list)
|
||||
# assert len(data["results"]) > 1 # Should be more than just the start URL
|
||||
# assert len(data["results"]) <= max_pages # Respect max_pages
|
||||
|
||||
# found_depth_0 = False
|
||||
# found_depth_1 = False
|
||||
# for result in data["results"]:
|
||||
# await assert_crawl_result_structure(result)
|
||||
# assert result["success"] is True
|
||||
# assert DEEP_CRAWL_DOMAIN in result["url"]
|
||||
# depth = result["metadata"]["depth"]
|
||||
# assert depth <= max_depth
|
||||
# if depth == 0: found_depth_0 = True
|
||||
# if depth == 1: found_depth_1 = True
|
||||
|
||||
# assert found_depth_0
|
||||
# assert found_depth_1
|
||||
|
||||
# # 2. Deep Crawl with Filtering
|
||||
# async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
|
||||
# """Test BFS deep crawl with content type and domain filters."""
|
||||
# max_depth = 1
|
||||
# max_pages = 5
|
||||
# payload = {
|
||||
# "urls": [DEEP_CRAWL_BASE_URL],
|
||||
# "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
# "crawler_config": {
|
||||
# "type": "CrawlerRunConfig",
|
||||
# "params": {
|
||||
# "stream": False,
|
||||
# "cache_mode": "BYPASS",
|
||||
# "deep_crawl_strategy": {
|
||||
# "type": "BFSDeepCrawlStrategy",
|
||||
# "params": {
|
||||
# "max_depth": max_depth,
|
||||
# "max_pages": max_pages,
|
||||
# "filter_chain": {
|
||||
# "type": "FilterChain",
|
||||
# "params": {
|
||||
# "filters": [
|
||||
# {
|
||||
# "type": "DomainFilter",
|
||||
# "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
|
||||
# },
|
||||
# {
|
||||
# "type": "ContentTypeFilter",
|
||||
# "params": {"allowed_types": ["text/html"]}
|
||||
# },
|
||||
# # Example: Exclude specific paths using regex
|
||||
# {
|
||||
# "type": "URLPatternFilter",
|
||||
# "params": {
|
||||
# "patterns": ["*/category-3/*"], # Block category 3
|
||||
# "reverse": True # Block if match
|
||||
# }
|
||||
# }
|
||||
# ]
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# response = await async_client.post("/crawl", json=payload)
|
||||
# response.raise_for_status()
|
||||
# data = response.json()
|
||||
|
||||
# assert data["success"] is True
|
||||
# assert len(data["results"]) > 0
|
||||
# assert len(data["results"]) <= max_pages
|
||||
|
||||
# for result in data["results"]:
|
||||
# await assert_crawl_result_structure(result)
|
||||
# assert result["success"] is True
|
||||
# assert DEEP_CRAWL_DOMAIN in result["url"]
|
||||
# assert "category-3" not in result["url"] # Check if filter worked
|
||||
# assert result["metadata"]["depth"] <= max_depth
|
||||
|
||||
# # 3. Deep Crawl with Scoring
|
||||
# async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
|
||||
# """Test BFS deep crawl with URL scoring."""
|
||||
# max_depth = 1
|
||||
# max_pages = 4
|
||||
# payload = {
|
||||
# "urls": [DEEP_CRAWL_BASE_URL],
|
||||
# "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
# "crawler_config": {
|
||||
# "type": "CrawlerRunConfig",
|
||||
# "params": {
|
||||
# "stream": False,
|
||||
# "cache_mode": "BYPASS",
|
||||
# "deep_crawl_strategy": {
|
||||
# "type": "BFSDeepCrawlStrategy",
|
||||
# "params": {
|
||||
# "max_depth": max_depth,
|
||||
# "max_pages": max_pages,
|
||||
# "filter_chain": { # Keep basic domain filter
|
||||
# "type": "FilterChain",
|
||||
# "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
|
||||
# },
|
||||
# "url_scorer": { # Add scorer
|
||||
# "type": "CompositeScorer",
|
||||
# "params": {
|
||||
# "scorers": [
|
||||
# { # Favor pages with 'product' in the URL
|
||||
# "type": "KeywordRelevanceScorer",
|
||||
# "params": {"keywords": ["product"], "weight": 1.0}
|
||||
# },
|
||||
# { # Penalize deep paths slightly
|
||||
# "type": "PathDepthScorer",
|
||||
# "params": {"optimal_depth": 2, "weight": -0.2}
|
||||
# }
|
||||
# ]
|
||||
# }
|
||||
# },
|
||||
# # Set a threshold if needed: "score_threshold": 0.1
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# response = await async_client.post("/crawl", json=payload)
|
||||
# response.raise_for_status()
|
||||
# data = response.json()
|
||||
|
||||
# assert data["success"] is True
|
||||
# assert len(data["results"]) > 0
|
||||
# assert len(data["results"]) <= max_pages
|
||||
|
||||
# # Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
|
||||
# product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
|
||||
# print(f"Product URLs found among depth > 0 results: {product_urls_found}")
|
||||
# # We expect scoring to prioritize product pages if available within limits
|
||||
# # assert product_urls_found # This might be too strict depending on site structure and limits
|
||||
|
||||
# for result in data["results"]:
|
||||
# await assert_crawl_result_structure(result)
|
||||
# assert result["success"] is True
|
||||
# assert result["metadata"]["depth"] <= max_depth
|
||||
|
||||
# # 4. Deep Crawl with CSS Extraction
|
||||
# async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
|
||||
# """Test BFS deep crawl combined with JsonCssExtractionStrategy."""
|
||||
# max_depth = 6 # Go deep enough to reach product pages
|
||||
# max_pages = 20
|
||||
# # Schema to extract product details
|
||||
# product_schema = {
|
||||
# "name": "ProductDetails",
|
||||
# "baseSelector": "div.container", # Base for product page
|
||||
# "fields": [
|
||||
# {"name": "product_title", "selector": "h1", "type": "text"},
|
||||
# {"name": "price", "selector": ".product-price", "type": "text"},
|
||||
# {"name": "description", "selector": ".product-description p", "type": "text"},
|
||||
# {"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[
|
||||
# {"name": "spec_name", "selector": ".spec-name", "type": "text"},
|
||||
# {"name": "spec_value", "selector": ".spec-value", "type": "text"}
|
||||
# ]}
|
||||
# ]
|
||||
# }
|
||||
# payload = {
|
||||
# "urls": [DEEP_CRAWL_BASE_URL],
|
||||
# "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
# "crawler_config": {
|
||||
# "type": "CrawlerRunConfig",
|
||||
# "params": {
|
||||
# "stream": False,
|
||||
# "cache_mode": "BYPASS",
|
||||
# "extraction_strategy": { # Apply extraction to ALL crawled pages
|
||||
# "type": "JsonCssExtractionStrategy",
|
||||
# "params": {"schema": {"type": "dict", "value": product_schema}}
|
||||
# },
|
||||
# "deep_crawl_strategy": {
|
||||
# "type": "BFSDeepCrawlStrategy",
|
||||
# "params": {
|
||||
# "max_depth": max_depth,
|
||||
# "max_pages": max_pages,
|
||||
# "filter_chain": { # Only crawl HTML on our domain
|
||||
# "type": "FilterChain",
|
||||
# "params": {
|
||||
# "filters": [
|
||||
# {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
|
||||
# {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
|
||||
# ]
|
||||
# }
|
||||
# }
|
||||
# # Optional: Add scoring to prioritize product pages for extraction
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# response = await async_client.post("/crawl", json=payload)
|
||||
# response.raise_for_status()
|
||||
# data = response.json()
|
||||
|
||||
# assert data["success"] is True
|
||||
# assert len(data["results"]) > 0
|
||||
# # assert len(data["results"]) <= max_pages
|
||||
|
||||
# found_extracted_product = False
|
||||
# for result in data["results"]:
|
||||
# await assert_crawl_result_structure(result)
|
||||
# assert result["success"] is True
|
||||
# assert "extracted_content" in result
|
||||
# if "product_" in result["url"]: # Check product pages specifically
|
||||
# assert result["extracted_content"] is not None
|
||||
# try:
|
||||
# extracted = json.loads(result["extracted_content"])
|
||||
# # Schema returns list even if one base match
|
||||
# assert isinstance(extracted, list)
|
||||
# if extracted:
|
||||
# item = extracted[0]
|
||||
# assert "product_title" in item and item["product_title"]
|
||||
# assert "price" in item and item["price"]
|
||||
# # Specs might be empty list if not found
|
||||
# assert "specs" in item and isinstance(item["specs"], list)
|
||||
# found_extracted_product = True
|
||||
# print(f"Extracted product: {item.get('product_title')}")
|
||||
# except (json.JSONDecodeError, AssertionError, IndexError) as e:
|
||||
# pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
|
||||
# # else:
|
||||
# # # Non-product pages might have None or empty list depending on schema match
|
||||
# # assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []
|
||||
|
||||
# assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
|
||||
|
||||
# # 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
|
||||
# async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
|
||||
# """Test BFS deep crawl combined with LLMExtractionStrategy."""
|
||||
# max_depth = 1 # Limit depth to keep LLM calls manageable
|
||||
# max_pages = 3
|
||||
# payload = {
|
||||
# "urls": [DEEP_CRAWL_BASE_URL],
|
||||
# "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
# "crawler_config": {
|
||||
# "type": "CrawlerRunConfig",
|
||||
# "params": {
|
||||
# "stream": False,
|
||||
# "cache_mode": "BYPASS",
|
||||
# "extraction_strategy": { # Apply LLM extraction to crawled pages
|
||||
# "type": "LLMExtractionStrategy",
|
||||
# "params": {
|
||||
# "instruction": "Extract the main H1 title and the text content of the first paragraph.",
|
||||
# "llm_config": { # Example override, rely on server default if possible
|
||||
# "type": "LLMConfig",
|
||||
# "params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing
|
||||
# },
|
||||
# "schema": { # Expected JSON output
|
||||
# "type": "dict",
|
||||
# "value": {
|
||||
# "title": "PageContent", "type": "object",
|
||||
# "properties": {
|
||||
# "h1_title": {"type": "string"},
|
||||
# "first_paragraph": {"type": "string"}
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# },
|
||||
# "deep_crawl_strategy": {
|
||||
# "type": "BFSDeepCrawlStrategy",
|
||||
# "params": {
|
||||
# "max_depth": max_depth,
|
||||
# "max_pages": max_pages,
|
||||
# "filter_chain": {
|
||||
# "type": "FilterChain",
|
||||
# "params": {
|
||||
# "filters": [
|
||||
# {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
|
||||
# {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
|
||||
# ]
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
|
||||
# try:
|
||||
# response = await async_client.post("/crawl", json=payload)
|
||||
# response.raise_for_status()
|
||||
# data = response.json()
|
||||
# except httpx.HTTPStatusError as e:
|
||||
# pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
|
||||
# except httpx.RequestError as e:
|
||||
# pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")
|
||||
|
||||
|
||||
# assert data["success"] is True
|
||||
# assert len(data["results"]) > 0
|
||||
# assert len(data["results"]) <= max_pages
|
||||
|
||||
# found_llm_extraction = False
|
||||
# for result in data["results"]:
|
||||
# await assert_crawl_result_structure(result)
|
||||
# assert result["success"] is True
|
||||
# assert "extracted_content" in result
|
||||
# assert result["extracted_content"] is not None
|
||||
# try:
|
||||
# extracted = json.loads(result["extracted_content"])
|
||||
# if isinstance(extracted, list): extracted = extracted[0] # Handle list output
|
||||
# assert isinstance(extracted, dict)
|
||||
# assert "h1_title" in extracted # Check keys based on schema
|
||||
# assert "first_paragraph" in extracted
|
||||
# found_llm_extraction = True
|
||||
# print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
|
||||
# except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
|
||||
# pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
|
||||
|
||||
# assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
|
||||
|
||||
|
||||
# # 6. Deep Crawl with SSL Certificate Fetching
|
||||
# async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
|
||||
# """Test BFS deep crawl with fetch_ssl_certificate enabled."""
|
||||
# max_depth = 0 # Only fetch for start URL to keep test fast
|
||||
# max_pages = 1
|
||||
# payload = {
|
||||
# "urls": [DEEP_CRAWL_BASE_URL],
|
||||
# "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
# "crawler_config": {
|
||||
# "type": "CrawlerRunConfig",
|
||||
# "params": {
|
||||
# "stream": False,
|
||||
# "cache_mode": "BYPASS",
|
||||
# "fetch_ssl_certificate": True, # <-- Enable SSL fetching
|
||||
# "deep_crawl_strategy": {
|
||||
# "type": "BFSDeepCrawlStrategy",
|
||||
# "params": {
|
||||
# "max_depth": max_depth,
|
||||
# "max_pages": max_pages,
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# response = await async_client.post("/crawl", json=payload)
|
||||
# response.raise_for_status()
|
||||
# data = response.json()
|
||||
|
||||
# assert data["success"] is True
|
||||
# assert len(data["results"]) == 1
|
||||
# result = data["results"][0]
|
||||
|
||||
# await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field
|
||||
# assert result["success"] is True
|
||||
# # Check if SSL info was actually retrieved
|
||||
# if result["ssl_certificate"]:
|
||||
# # Assert directly using dictionary keys
|
||||
# assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict
|
||||
# assert "issuer" in result["ssl_certificate"]
|
||||
# assert "subject" in result["ssl_certificate"]
|
||||
# # --- MODIFIED ASSERTIONS ---
|
||||
# assert "not_before" in result["ssl_certificate"] # Check for the actual key
|
||||
# assert "not_after" in result["ssl_certificate"] # Check for the actual key
|
||||
# # --- END MODIFICATIONS ---
|
||||
# assert "fingerprint" in result["ssl_certificate"] # Check another key
|
||||
|
||||
# # This print statement using .get() already works correctly with dictionaries
|
||||
# print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
|
||||
# print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
|
||||
# else:
|
||||
# # This part remains the same
|
||||
# print("SSL Certificate was null in the result.")
|
||||
|
||||
|
||||
# 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)
|
||||
async def test_deep_crawl_with_proxies(self, async_client: httpx.AsyncClient):
    """Run a shallow BFS deep crawl through a round-robin proxy rotation strategy.

    Skipped when no proxies can be loaded from the PROXIES environment
    variable. The test only asserts that the crawl succeeds and respects
    ``max_pages``; it does not verify which proxy served each request.
    """
    proxies = load_proxies_from_env()
    if not proxies:
        pytest.skip("Skipping proxy test: PROXIES environment variable not set or empty.")

    print(f"\nTesting with {len(proxies)} proxies loaded from environment.")

    max_depth = 1
    max_pages = 3

    # Wrap each proxy dict in the serialized {"type", "params"} form sent to the server.
    proxy_rotation = {
        "type": "RoundRobinProxyStrategy",
        "params": {
            "proxies": [{"type": "ProxyConfig", "params": p} for p in proxies],
        },
    }
    domain_filter_chain = {
        "type": "FilterChain",
        "params": {
            "filters": [
                {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
            ],
        },
    }
    deep_crawl = {
        "type": "BFSDeepCrawlStrategy",
        "params": {
            "max_depth": max_depth,
            "max_pages": max_pages,
            "filter_chain": domain_filter_chain,
        },
    }
    payload = {
        "urls": [DEEP_CRAWL_BASE_URL],  # Use the dummy site
        # The proxy is configured on the crawler run config, not on the browser config.
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": False,
                "cache_mode": "BYPASS",
                "proxy_rotation_strategy": proxy_rotation,
                "deep_crawl_strategy": deep_crawl,
            },
        },
    }

    try:
        response = await async_client.post("/crawl", json=payload)
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        # Proxies often cause connection errors; report them as test failures.
        pytest.fail(f"Proxy deep crawl failed: {e}. Response: {e.response.text}. Are proxies valid and accessible by the server?")
    except httpx.RequestError as e:
        pytest.fail(f"Proxy deep crawl request failed: {e}. Are proxies valid and accessible?")

    assert data["success"] is True
    results = data["results"]
    assert len(results) > 0
    assert len(results) <= max_pages
    # Primary assertion is that the crawl succeeded *with* the proxy config.
    print(f"Proxy deep crawl completed successfully for {len(results)} pages.")

    # Verifying specific proxy usage requires server logs or custom headers/responses.
|
||||
|
||||
|
||||
# --- Main Execution Block (for running script directly) ---
|
||||
if __name__ == "__main__":
    # Convenience entry point: run this test module through pytest when the
    # file is executed directly.
    pytest_args = ["-v", "-s", __file__]
    # Example: Run only proxy test
    # pytest_args.append("-k test_deep_crawl_with_proxies")
    print(f"Running pytest with args: {pytest_args}")
    exit_code = pytest.main(pytest_args)
    print(f"Pytest finished with exit code: {exit_code}")
    # Propagate pytest's result as the process exit status; without this a
    # direct run exits 0 even when tests fail.
    raise SystemExit(exit_code)
|
||||
335
tests/general/generate_dummy_site.py
Normal file
335
tests/general/generate_dummy_site.py
Normal file
@@ -0,0 +1,335 @@
|
||||
# ==== File: generate_dummy_site.py ====
|
||||
|
||||
import os
|
||||
import random
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
|
||||
# --- Configuration ---
|
||||
# Number of top-level categories.
NUM_CATEGORIES: int = 3
# Sub-categories per category; the site therefore has
# NUM_CATEGORIES * NUM_SUBCATEGORIES_PER_CAT level-2 categories in total.
NUM_SUBCATEGORIES_PER_CAT: int = 2
# Products listed on each level-3 page.
NUM_PRODUCTS_PER_SUBCAT: int = 5
# Explicitly chosen target depth.
MAX_DEPTH_TARGET: int = 5
|
||||
|
||||
# --- Helper Functions ---
|
||||
|
||||
def generate_lorem(words=20):
    """Return ``words`` randomly chosen lorem-ipsum words as one sentence.

    The words are joined with single spaces, the result is passed through
    ``str.capitalize`` and a trailing period is appended.
    """
    vocabulary = (
        "lorem", "ipsum", "dolor", "sit", "amet", "consectetur",
        "adipiscing", "elit", "sed", "do", "eiusmod", "tempor",
        "incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua",
    )
    picked = [random.choice(vocabulary) for _ in range(words)]
    sentence = " ".join(picked)
    return f"{sentence.capitalize()}."
|
||||
|
||||
# Inline stylesheet shared by every generated page. The class names give each
# kind of page element a distinct, identifiable hook.
_PAGE_CSS = """
<style>
body {
font-family: sans-serif;
padding: 20px;
background-color: #1e1e1e;
color: #d1d1d1;
}

.container {
max-width: 960px;
margin: auto;
background: #2c2c2c;
padding: 20px;
border-radius: 5px;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.5);
}

h1, h2 {
color: #ccc;
}

a {
color: #9bcdff;
text-decoration: none;
}

a:hover {
text-decoration: underline;
}

ul {
list-style: none;
padding-left: 0;
}

li {
margin-bottom: 10px;
}

.category-link,
.subcategory-link,
.product-link,
.details-link,
.reviews-link {
display: block;
padding: 8px;
background-color: #3a3a3a;
border-radius: 3px;
}

.product-preview {
border: 1px solid #444;
padding: 10px;
margin-bottom: 10px;
border-radius: 4px;
background-color: #2a2a2a;
}

.product-title {
color: #d1d1d1;
}

.product-price {
font-weight: bold;
color: #85e085;
}

.product-description,
.product-specs,
.product-reviews {
margin-top: 15px;
line-height: 1.6;
}

.product-specs li {
margin-bottom: 5px;
font-size: 0.9em;
}

.spec-name {
font-weight: bold;
}

.breadcrumbs {
margin-bottom: 20px;
font-size: 0.9em;
color: #888;
}

.breadcrumbs a {
color: #9bcdff;
}
</style>
"""


def create_html_page(filepath: Path, title: str, body_content: str, breadcrumbs: "list | None" = None, head_extras: str = ""):
    """Write a complete HTML page with inline CSS to ``filepath``.

    Missing parent directories are created first. ``title``, ``body_content``
    and ``head_extras`` are inserted into the markup verbatim (no escaping).

    Args:
        filepath: Destination of the HTML file.
        title: Page title, used in ``<title>`` (suffixed with " - FakeShop")
            and as the page's ``<h1>``.
        body_content: HTML placed inside the container after the ``<h1>``.
        breadcrumbs: Optional list of ``{"name": ..., "link": ...}`` dicts
            rendered as links in front of the current title. ``None`` or an
            empty list produces no breadcrumb navigation.
        head_extras: Extra markup inserted into ``<head>`` before the CSS.
    """
    filepath.parent.mkdir(parents=True, exist_ok=True)

    # Breadcrumb trail: one link per ancestor, followed by the current title.
    breadcrumb_html = ""
    if breadcrumbs:
        links_html = " » ".join(f'<a href="{bc["link"]}">{bc["name"]}</a>' for bc in breadcrumbs)
        breadcrumb_html = f"<nav class='breadcrumbs'>{links_html} » {title}</nav>"

    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{title} - FakeShop</title>
{head_extras}
{_PAGE_CSS}
</head>
<body>
<div class="container">
{breadcrumb_html}
<h1>{title}</h1>
{body_content}
</div>
</body>
</html>"""
    filepath.write_text(html_content, encoding="utf-8")
|
||||
|
||||
def generate_site(base_dir: Path, site_name: str = "FakeShop", base_path: str = ""):
    """Generate the dummy multi-level retail website under ``base_dir``.

    Page hierarchy (every in-page link is relative to the page it sits on):
    homepage -> category -> sub-category (product list) -> product ->
    product details / product reviews.

    Args:
        base_dir: Directory the site is written into; created if missing.
        site_name: Shop name shown in the homepage welcome heading.
        base_path: URL path prefix the site will be hosted under (e.g.
            ``"samples/deepcrawl"``). It only affects the absolute links
            stored in the breadcrumb lists. Empty, or consisting solely of
            slashes, means the site is hosted at the root.
    """
    base_dir.mkdir(parents=True, exist_ok=True)

    # Normalise the prefix to "/a/b" (leading slash, no trailing slash).
    # Strip *before* testing for emptiness so that "/" maps to the root
    # instead of producing protocol-relative links such as "//index.html".
    stripped_base = base_path.strip("/")
    full_base_path = f"/{stripped_base}" if stripped_base else ""

    print(f"Using base path for links: '{full_base_path}'")

    # --- Level 0: Homepage ---
    # Breadcrumb links are absolute (prefixed with the base path).
    breadcrumbs_home = [{"name": "Home", "link": f"{full_base_path}/index.html"}]

    category_items = []
    for i in range(NUM_CATEGORIES):
        cat_name = f"Category-{i+1}"
        cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
        # Relative to the homepage's own directory.
        category_items.append(
            f'<li><a class="category-link" href="{cat_folder_name}/index.html">{cat_name}</a>'
            f" - {generate_lorem(10)}</li>"
        )
    home_body = (
        f"<h2>Welcome to {site_name}!</h2><p>Your one-stop shop for imaginary items.</p>"
        "<h3>Categories:</h3>\n<ul>" + "".join(category_items) + "</ul>"
    )
    # No breadcrumbs *on* the homepage itself.
    create_html_page(base_dir / "index.html", "Homepage", home_body, [])

    # --- Levels 1-5 ---
    for i in range(NUM_CATEGORIES):
        cat_name = f"Category-{i+1}"
        cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
        cat_dir = base_dir / cat_folder_name
        cat_url_prefix = f"{full_base_path}/{cat_folder_name}"
        breadcrumbs_cat = breadcrumbs_home + [
            {"name": cat_name, "link": f"{cat_url_prefix}/index.html"}
        ]

        # --- Level 1: Category Page ---
        subcat_items = []
        for j in range(NUM_SUBCATEGORIES_PER_CAT):
            subcat_name = f"{cat_name}-Sub-{j+1}"
            subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
            # Relative to the category page.
            subcat_items.append(
                f'<li><a class="subcategory-link" href="{subcat_folder_name}/index.html">{subcat_name}</a>'
                f" - {generate_lorem(8)}</li>"
            )
        cat_body = (
            f"<p>{generate_lorem(15)} for {cat_name}.</p><h3>Sub-Categories:</h3>\n<ul>"
            + "".join(subcat_items)
            + "</ul>"
        )
        # Each page receives its *parent's* breadcrumb trail.
        create_html_page(cat_dir / "index.html", cat_name, cat_body, breadcrumbs_home)

        for j in range(NUM_SUBCATEGORIES_PER_CAT):
            subcat_name = f"{cat_name}-Sub-{j+1}"
            subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
            subcat_dir = cat_dir / subcat_folder_name
            subcat_url_prefix = f"{cat_url_prefix}/{subcat_folder_name}"
            breadcrumbs_subcat = breadcrumbs_cat + [
                {"name": subcat_name, "link": f"{subcat_url_prefix}/index.html"}
            ]

            # --- Level 2: Sub-Category Page (Product List) ---
            product_items = []
            for k in range(NUM_PRODUCTS_PER_SUBCAT):
                prod_id = f"P{i+1}{j+1}{k+1:03d}"  # e.g., P11001
                prod_name = f"{subcat_name} Product {k+1} ({prod_id})"
                prod_filename = f"product_{prod_id}.html"
                details_filename = f"product_{prod_id}_details.html"
                reviews_filename = f"product_{prod_id}_reviews.html"

                # Draw the price once so the list preview and the product
                # page show the same amount.
                prod_price = random.uniform(10, 500)

                # Preview on the list page (link stays relative).
                product_items.append(f"""
                <li>
                    <div class="product-preview">
                        <a class="product-link" href="{prod_filename}"><strong>{prod_name}</strong></a>
                        <p>{generate_lorem(10)}</p>
                        <span class="product-price">£{prod_price:.2f}</span>
                    </div>
                </li>""")

                # --- Level 3: Product Page ---
                prod_desc = generate_lorem(40)
                prod_specs = {
                    f"Spec {s+1}": generate_lorem(3) for s in range(random.randint(3, 6))
                }
                prod_reviews_count = random.randint(0, 150)
                spec_items = "".join(
                    f'<li><span class="spec-name">{name}</span>: <span class="spec-value">{value}</span></li>'
                    for name, value in prod_specs.items()
                )

                prod_body = f"""
                <p class="product-price">Price: £{prod_price:.2f}</p>
                <div class="product-description">
                    <h2>Description</h2>
                    <p>{prod_desc}</p>
                </div>
                <div class="product-specs">
                    <h2>Specifications</h2>
                    <ul>
                        {spec_items}
                    </ul>
                </div>
                <div class="product-reviews">
                    <h2>Reviews</h2>
                    <p>Total Reviews: <span class="review-count">{prod_reviews_count}</span></p>
                </div>
                <hr>
                <p>
                    <a class="details-link" href="{details_filename}">View More Details</a> |
                    <a class="reviews-link" href="{reviews_filename}">See All Reviews</a>
                </p>
                """
                create_html_page(subcat_dir / prod_filename, prod_name, prod_body, breadcrumbs_subcat)

                # Trail handed to the two leaf pages below (their parent is
                # the product page).
                breadcrumbs_prod = breadcrumbs_subcat + [
                    {"name": prod_name, "link": f"{subcat_url_prefix}/{prod_filename}"}
                ]

                # --- Level 4: Product Details Page ---
                details_body = (
                    f"<p>This page contains extremely detailed information about {prod_name}.</p>"
                    f"{generate_lorem(100)}"
                )
                create_html_page(
                    subcat_dir / details_filename,
                    f"{prod_name} - Details",
                    details_body,
                    breadcrumbs_prod,
                )

                # --- Level 5: Product Reviews Page ---
                review_items = "".join(
                    f"<li>Review {r+1}: {generate_lorem(random.randint(15, 50))}</li>"
                    for r in range(prod_reviews_count)
                )
                reviews_body = (
                    f"<p>All {prod_reviews_count} reviews for {prod_name} are listed here.</p>"
                    f"<ul>{review_items}</ul>"
                )
                create_html_page(
                    subcat_dir / reviews_filename,
                    f"{prod_name} - Reviews",
                    reviews_body,
                    breadcrumbs_prod,
                )

            subcat_body = (
                f"<p>Explore products in {subcat_name}. {generate_lorem(12)}</p>"
                "<h3>Products:</h3>\n<ul class='product-list'>"
                + "".join(product_items)
                + "</ul>"  # Close product-list ul
            )
            create_html_page(subcat_dir / "index.html", subcat_name, subcat_body, breadcrumbs_cat)
|
||||
|
||||
|
||||
# --- Main Execution ---
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the options, generate the site, then
    # print a summary and a hint on how to browse the result locally.
    cli = argparse.ArgumentParser(description="Generate a dummy multi-level retail website.")

    # (short flag, long flag, default, help text) for every string option.
    option_table = (
        ("-o", "--output-dir", "dummy_retail_site", "Directory to generate the website in."),
        ("-n", "--site-name", "FakeShop", "Name of the fake shop."),
        (
            "-b",
            "--base-path",
            "",
            "Base path for hosting the site (e.g., 'samples/deepcrawl'). Leave empty if hosted at the root.",
        ),
    )
    for short_flag, long_flag, default_value, help_text in option_table:
        cli.add_argument(short_flag, long_flag, type=str, default=default_value, help=help_text)
    # Optional: Add more args to configure counts if needed

    opts = cli.parse_args()

    output_directory = Path(opts.output_dir)
    site_name = opts.site_name
    base_path = opts.base_path

    print(f"Generating dummy site '{site_name}' in '{output_directory}'...")
    # The base path is forwarded so the generator can build prefixed links.
    generate_site(output_directory, site_name, base_path)

    html_page_count = sum(1 for _ in output_directory.rglob('*.html'))
    print(f"\nCreated {html_page_count} HTML pages.")
    print("Dummy site generation complete.")
    print(f"To serve locally (example): python -m http.server --directory {output_directory} 8000")

    # NOTE(review): the suggested command serves output_directory at "/",
    # while the prefixed URL below assumes the site is mounted under
    # base_path - confirm which hosting layout is intended.
    if base_path:
        entry_url = f"http://localhost:8000/{base_path.strip('/')}/index.html"
    else:
        entry_url = "http://localhost:8000/index.html"
    print(f"Access the site at: {entry_url}")
|
||||
Reference in New Issue
Block a user