Apply Ruff Corrections
This commit is contained in:
@@ -6,63 +6,80 @@ import base64
|
||||
import os
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
class Crawl4AiTester:
|
||||
def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
|
||||
self.base_url = base_url
|
||||
self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" # Check environment variable as fallback
|
||||
self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
|
||||
|
||||
def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
|
||||
self.api_token = (
|
||||
api_token or os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
|
||||
) # Check environment variable as fallback
|
||||
self.headers = (
|
||||
{"Authorization": f"Bearer {self.api_token}"} if self.api_token else {}
|
||||
)
|
||||
|
||||
def submit_and_wait(
|
||||
self, request_data: Dict[str, Any], timeout: int = 300
|
||||
) -> Dict[str, Any]:
|
||||
# Submit crawl job
|
||||
response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
|
||||
response = requests.post(
|
||||
f"{self.base_url}/crawl", json=request_data, headers=self.headers
|
||||
)
|
||||
if response.status_code == 403:
|
||||
raise Exception("API token is invalid or missing")
|
||||
task_id = response.json()["task_id"]
|
||||
print(f"Task ID: {task_id}")
|
||||
|
||||
|
||||
# Poll for result
|
||||
start_time = time.time()
|
||||
while True:
|
||||
if time.time() - start_time > timeout:
|
||||
raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
|
||||
|
||||
result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
|
||||
raise TimeoutError(
|
||||
f"Task {task_id} did not complete within {timeout} seconds"
|
||||
)
|
||||
|
||||
result = requests.get(
|
||||
f"{self.base_url}/task/{task_id}", headers=self.headers
|
||||
)
|
||||
status = result.json()
|
||||
|
||||
|
||||
if status["status"] == "failed":
|
||||
print("Task failed:", status.get("error"))
|
||||
raise Exception(f"Task failed: {status.get('error')}")
|
||||
|
||||
|
||||
if status["status"] == "completed":
|
||||
return status
|
||||
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
|
||||
def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
|
||||
response = requests.post(
|
||||
f"{self.base_url}/crawl_sync",
|
||||
json=request_data,
|
||||
headers=self.headers,
|
||||
timeout=60,
|
||||
)
|
||||
if response.status_code == 408:
|
||||
raise TimeoutError("Task did not complete within server timeout")
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Directly crawl without using task queue"""
|
||||
response = requests.post(
|
||||
f"{self.base_url}/crawl_direct",
|
||||
json=request_data,
|
||||
headers=self.headers
|
||||
f"{self.base_url}/crawl_direct", json=request_data, headers=self.headers
|
||||
)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
def test_docker_deployment(version="basic"):
|
||||
tester = Crawl4AiTester(
|
||||
base_url="http://localhost:11235" ,
|
||||
base_url="http://localhost:11235",
|
||||
# base_url="https://api.crawl4ai.com" # just for example
|
||||
# api_token="test" # just for example
|
||||
)
|
||||
print(f"Testing Crawl4AI Docker {version} version")
|
||||
|
||||
|
||||
# Health check with timeout and retry
|
||||
max_retries = 5
|
||||
for i in range(max_retries):
|
||||
@@ -70,19 +87,19 @@ def test_docker_deployment(version="basic"):
|
||||
health = requests.get(f"{tester.base_url}/health", timeout=10)
|
||||
print("Health check:", health.json())
|
||||
break
|
||||
except requests.exceptions.RequestException as e:
|
||||
except requests.exceptions.RequestException:
|
||||
if i == max_retries - 1:
|
||||
print(f"Failed to connect after {max_retries} attempts")
|
||||
sys.exit(1)
|
||||
print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
|
||||
time.sleep(5)
|
||||
|
||||
|
||||
# Test cases based on version
|
||||
test_basic_crawl_direct(tester)
|
||||
test_basic_crawl(tester)
|
||||
test_basic_crawl(tester)
|
||||
test_basic_crawl_sync(tester)
|
||||
|
||||
|
||||
if version in ["full", "transformer"]:
|
||||
test_cosine_extraction(tester)
|
||||
|
||||
@@ -92,49 +109,52 @@ def test_docker_deployment(version="basic"):
|
||||
test_llm_extraction(tester)
|
||||
test_llm_with_ollama(tester)
|
||||
test_screenshot(tester)
|
||||
|
||||
|
||||
|
||||
def test_basic_crawl(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Basic Crawl ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 10,
|
||||
"session_id": "test"
|
||||
"priority": 10,
|
||||
"session_id": "test",
|
||||
}
|
||||
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||
assert result["result"]["success"]
|
||||
assert len(result["result"]["markdown"]) > 0
|
||||
|
||||
|
||||
def test_basic_crawl_sync(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Basic Crawl (Sync) ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 10,
|
||||
"session_id": "test"
|
||||
"session_id": "test",
|
||||
}
|
||||
|
||||
|
||||
result = tester.submit_sync(request)
|
||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||
assert result['status'] == 'completed'
|
||||
assert result['result']['success']
|
||||
assert len(result['result']['markdown']) > 0
|
||||
|
||||
assert result["status"] == "completed"
|
||||
assert result["result"]["success"]
|
||||
assert len(result["result"]["markdown"]) > 0
|
||||
|
||||
|
||||
def test_basic_crawl_direct(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Basic Crawl (Direct) ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 10,
|
||||
# "session_id": "test"
|
||||
"cache_mode": "bypass" # or "enabled", "disabled", "read_only", "write_only"
|
||||
"cache_mode": "bypass", # or "enabled", "disabled", "read_only", "write_only"
|
||||
}
|
||||
|
||||
|
||||
result = tester.crawl_direct(request)
|
||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||
assert result['result']['success']
|
||||
assert len(result['result']['markdown']) > 0
|
||||
|
||||
assert result["result"]["success"]
|
||||
assert len(result["result"]["markdown"]) > 0
|
||||
|
||||
|
||||
def test_js_execution(tester: Crawl4AiTester):
|
||||
print("\n=== Testing JS Execution ===")
|
||||
request = {
|
||||
@@ -144,32 +164,29 @@ def test_js_execution(tester: Crawl4AiTester):
|
||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||
],
|
||||
"wait_for": "article.tease-card:nth-child(10)",
|
||||
"crawler_params": {
|
||||
"headless": True
|
||||
}
|
||||
"crawler_params": {"headless": True},
|
||||
}
|
||||
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
print(f"JS execution result length: {len(result['result']['markdown'])}")
|
||||
assert result["result"]["success"]
|
||||
|
||||
|
||||
def test_css_selector(tester: Crawl4AiTester):
|
||||
print("\n=== Testing CSS Selector ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 7,
|
||||
"css_selector": ".wide-tease-item__description",
|
||||
"crawler_params": {
|
||||
"headless": True
|
||||
},
|
||||
"extra": {"word_count_threshold": 10}
|
||||
|
||||
"crawler_params": {"headless": True},
|
||||
"extra": {"word_count_threshold": 10},
|
||||
}
|
||||
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
print(f"CSS selector result length: {len(result['result']['markdown'])}")
|
||||
assert result["result"]["success"]
|
||||
|
||||
|
||||
def test_structured_extraction(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Structured Extraction ===")
|
||||
schema = {
|
||||
@@ -190,21 +207,16 @@ def test_structured_extraction(tester: Crawl4AiTester):
|
||||
"name": "price",
|
||||
"selector": "td:nth-child(2)",
|
||||
"type": "text",
|
||||
}
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
request = {
|
||||
"urls": "https://www.coinbase.com/explore",
|
||||
"priority": 9,
|
||||
"extraction_config": {
|
||||
"type": "json_css",
|
||||
"params": {
|
||||
"schema": schema
|
||||
}
|
||||
}
|
||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||
}
|
||||
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
extracted = json.loads(result["result"]["extracted_content"])
|
||||
print(f"Extracted {len(extracted)} items")
|
||||
@@ -212,6 +224,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
|
||||
assert result["result"]["success"]
|
||||
assert len(extracted) > 0
|
||||
|
||||
|
||||
def test_llm_extraction(tester: Crawl4AiTester):
|
||||
print("\n=== Testing LLM Extraction ===")
|
||||
schema = {
|
||||
@@ -219,20 +232,20 @@ def test_llm_extraction(tester: Crawl4AiTester):
|
||||
"properties": {
|
||||
"model_name": {
|
||||
"type": "string",
|
||||
"description": "Name of the OpenAI model."
|
||||
"description": "Name of the OpenAI model.",
|
||||
},
|
||||
"input_fee": {
|
||||
"type": "string",
|
||||
"description": "Fee for input token for the OpenAI model."
|
||||
"description": "Fee for input token for the OpenAI model.",
|
||||
},
|
||||
"output_fee": {
|
||||
"type": "string",
|
||||
"description": "Fee for output token for the OpenAI model."
|
||||
}
|
||||
"description": "Fee for output token for the OpenAI model.",
|
||||
},
|
||||
},
|
||||
"required": ["model_name", "input_fee", "output_fee"]
|
||||
"required": ["model_name", "input_fee", "output_fee"],
|
||||
}
|
||||
|
||||
|
||||
request = {
|
||||
"urls": "https://openai.com/api/pricing",
|
||||
"priority": 8,
|
||||
@@ -243,12 +256,12 @@ def test_llm_extraction(tester: Crawl4AiTester):
|
||||
"api_token": os.getenv("OPENAI_API_KEY"),
|
||||
"schema": schema,
|
||||
"extraction_type": "schema",
|
||||
"instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
|
||||
}
|
||||
"instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""",
|
||||
},
|
||||
},
|
||||
"crawler_params": {"word_count_threshold": 1}
|
||||
"crawler_params": {"word_count_threshold": 1},
|
||||
}
|
||||
|
||||
|
||||
try:
|
||||
result = tester.submit_and_wait(request)
|
||||
extracted = json.loads(result["result"]["extracted_content"])
|
||||
@@ -258,6 +271,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
|
||||
except Exception as e:
|
||||
print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")
|
||||
|
||||
|
||||
def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
print("\n=== Testing LLM with Ollama ===")
|
||||
schema = {
|
||||
@@ -265,20 +279,20 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
"properties": {
|
||||
"article_title": {
|
||||
"type": "string",
|
||||
"description": "The main title of the news article"
|
||||
"description": "The main title of the news article",
|
||||
},
|
||||
"summary": {
|
||||
"type": "string",
|
||||
"description": "A brief summary of the article content"
|
||||
"description": "A brief summary of the article content",
|
||||
},
|
||||
"main_topics": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "Main topics or themes discussed in the article"
|
||||
}
|
||||
}
|
||||
"description": "Main topics or themes discussed in the article",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 8,
|
||||
@@ -288,13 +302,13 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
"provider": "ollama/llama2",
|
||||
"schema": schema,
|
||||
"extraction_type": "schema",
|
||||
"instruction": "Extract the main article information including title, summary, and main topics."
|
||||
}
|
||||
"instruction": "Extract the main article information including title, summary, and main topics.",
|
||||
},
|
||||
},
|
||||
"extra": {"word_count_threshold": 1},
|
||||
"crawler_params": {"verbose": True}
|
||||
"crawler_params": {"verbose": True},
|
||||
}
|
||||
|
||||
|
||||
try:
|
||||
result = tester.submit_and_wait(request)
|
||||
extracted = json.loads(result["result"]["extracted_content"])
|
||||
@@ -303,6 +317,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
except Exception as e:
|
||||
print(f"Ollama extraction test failed: {str(e)}")
|
||||
|
||||
|
||||
def test_cosine_extraction(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Cosine Extraction ===")
|
||||
request = {
|
||||
@@ -314,11 +329,11 @@ def test_cosine_extraction(tester: Crawl4AiTester):
|
||||
"semantic_filter": "business finance economy",
|
||||
"word_count_threshold": 10,
|
||||
"max_dist": 0.2,
|
||||
"top_k": 3
|
||||
}
|
||||
}
|
||||
"top_k": 3,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
try:
|
||||
result = tester.submit_and_wait(request)
|
||||
extracted = json.loads(result["result"]["extracted_content"])
|
||||
@@ -328,30 +343,30 @@ def test_cosine_extraction(tester: Crawl4AiTester):
|
||||
except Exception as e:
|
||||
print(f"Cosine extraction test failed: {str(e)}")
|
||||
|
||||
|
||||
def test_screenshot(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Screenshot ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 5,
|
||||
"screenshot": True,
|
||||
"crawler_params": {
|
||||
"headless": True
|
||||
}
|
||||
"crawler_params": {"headless": True},
|
||||
}
|
||||
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
print("Screenshot captured:", bool(result["result"]["screenshot"]))
|
||||
|
||||
|
||||
if result["result"]["screenshot"]:
|
||||
# Save screenshot
|
||||
screenshot_data = base64.b64decode(result["result"]["screenshot"])
|
||||
with open("test_screenshot.jpg", "wb") as f:
|
||||
f.write(screenshot_data)
|
||||
print("Screenshot saved as test_screenshot.jpg")
|
||||
|
||||
|
||||
assert result["result"]["success"]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
version = sys.argv[1] if len(sys.argv) > 1 else "basic"
|
||||
# version = "full"
|
||||
test_docker_deployment(version)
|
||||
test_docker_deployment(version)
|
||||
|
||||
Reference in New Issue
Block a user