Apply Ruff Corrections

Author: UncleCode
Date: 2025-01-13 19:19:58 +08:00
Parent: c3370ec5da
Commit: 8ec12d7d68

84 changed files with 6861 additions and 5076 deletions
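The hunks below are mechanical restyling of the kind Ruff enforces: single-quoted strings become double-quoted, an unused exception binding is dropped (`except ... as e:` becomes `except ...:`), over-long calls are split one argument per line, and multi-line literals gain a trailing "magic" comma so that later additions touch only one diff line. The commit message does not record the exact invocation; something like `ruff check --fix` followed by `ruff format` would produce changes of this shape. As an illustration only (this snippet is not part of the commit), the dict style changes like so:

# Illustration, not from the commit: the literal style Ruff's formatter produces.

# Before: single quotes, one long line, no trailing comma.
request = {'urls': 'https://www.nbcnews.com/business', 'priority': 10, 'session_id': 'test'}

# After: double quotes, one key per line, trailing comma on the last entry.
request = {
    "urls": "https://www.nbcnews.com/business",
    "priority": 10,
    "session_id": "test",
}
print(request)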

@@ -6,53 +6,72 @@ import base64
 import os
 from typing import Dict, Any
 
+
 class Crawl4AiTester:
     def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
         self.base_url = base_url
-        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback
-        self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
+        self.api_token = api_token or os.getenv(
+            "CRAWL4AI_API_TOKEN"
+        )  # Check environment variable as fallback
+        self.headers = (
+            {"Authorization": f"Bearer {self.api_token}"} if self.api_token else {}
+        )
 
-    def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
+    def submit_and_wait(
+        self, request_data: Dict[str, Any], timeout: int = 300
+    ) -> Dict[str, Any]:
         # Submit crawl job
-        response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
+        response = requests.post(
+            f"{self.base_url}/crawl", json=request_data, headers=self.headers
+        )
 
         if response.status_code == 403:
             raise Exception("API token is invalid or missing")
 
         task_id = response.json()["task_id"]
         print(f"Task ID: {task_id}")
 
         # Poll for result
         start_time = time.time()
         while True:
             if time.time() - start_time > timeout:
-                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
-            result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
+                raise TimeoutError(
+                    f"Task {task_id} did not complete within {timeout} seconds"
+                )
+            result = requests.get(
+                f"{self.base_url}/task/{task_id}", headers=self.headers
+            )
             status = result.json()
 
             if status["status"] == "failed":
                 print("Task failed:", status.get("error"))
                 raise Exception(f"Task failed: {status.get('error')}")
 
             if status["status"] == "completed":
                 return status
 
             time.sleep(2)
 
     def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
-        response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
+        response = requests.post(
+            f"{self.base_url}/crawl_sync",
+            json=request_data,
+            headers=self.headers,
+            timeout=60,
+        )
         if response.status_code == 408:
             raise TimeoutError("Task did not complete within server timeout")
         response.raise_for_status()
         return response.json()
 
+
 def test_docker_deployment(version="basic"):
     tester = Crawl4AiTester(
         # base_url="http://localhost:11235" ,
         base_url="https://crawl4ai-sby74.ondigitalocean.app",
-        api_token="test"
+        api_token="test",
     )
     print(f"Testing Crawl4AI Docker {version} version")
 
     # Health check with timeout and retry
     max_retries = 5
     for i in range(max_retries):
@@ -60,18 +79,18 @@ def test_docker_deployment(version="basic"):
             health = requests.get(f"{tester.base_url}/health", timeout=10)
             print("Health check:", health.json())
             break
-        except requests.exceptions.RequestException as e:
+        except requests.exceptions.RequestException:
             if i == max_retries - 1:
                 print(f"Failed to connect after {max_retries} attempts")
                 sys.exit(1)
 
             print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
             time.sleep(5)
 
     # Test cases based on version
     test_basic_crawl(tester)
     test_basic_crawl_sync(tester)
 
     # if version in ["full", "transformer"]:
     #     test_cosine_extraction(tester)
@@ -81,35 +100,37 @@ def test_docker_deployment(version="basic"):
     # test_llm_extraction(tester)
     # test_llm_with_ollama(tester)
     # test_screenshot(tester)
 
 
 def test_basic_crawl(tester: Crawl4AiTester):
     print("\n=== Testing Basic Crawl ===")
     request = {
         "urls": "https://www.nbcnews.com/business",
-        "priority": 10,
-        "session_id": "test"
+        "priority": 10,
+        "session_id": "test",
     }
 
     result = tester.submit_and_wait(request)
     print(f"Basic crawl result length: {len(result['result']['markdown'])}")
     assert result["result"]["success"]
     assert len(result["result"]["markdown"]) > 0
 
+
 def test_basic_crawl_sync(tester: Crawl4AiTester):
     print("\n=== Testing Basic Crawl (Sync) ===")
     request = {
         "urls": "https://www.nbcnews.com/business",
         "priority": 10,
-        "session_id": "test"
+        "session_id": "test",
     }
 
     result = tester.submit_sync(request)
     print(f"Basic crawl result length: {len(result['result']['markdown'])}")
-    assert result['status'] == 'completed'
-    assert result['result']['success']
-    assert len(result['result']['markdown']) > 0
+    assert result["status"] == "completed"
+    assert result["result"]["success"]
+    assert len(result["result"]["markdown"]) > 0
 
+
 def test_js_execution(tester: Crawl4AiTester):
     print("\n=== Testing JS Execution ===")
     request = {
@@ -119,32 +140,29 @@ def test_js_execution(tester: Crawl4AiTester):
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
],
"wait_for": "article.tease-card:nth-child(10)",
"crawler_params": {
"headless": True
}
"crawler_params": {"headless": True},
}
result = tester.submit_and_wait(request)
print(f"JS execution result length: {len(result['result']['markdown'])}")
assert result["result"]["success"]
def test_css_selector(tester: Crawl4AiTester):
print("\n=== Testing CSS Selector ===")
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 7,
"css_selector": ".wide-tease-item__description",
"crawler_params": {
"headless": True
},
"extra": {"word_count_threshold": 10}
"crawler_params": {"headless": True},
"extra": {"word_count_threshold": 10},
}
result = tester.submit_and_wait(request)
print(f"CSS selector result length: {len(result['result']['markdown'])}")
assert result["result"]["success"]
def test_structured_extraction(tester: Crawl4AiTester):
print("\n=== Testing Structured Extraction ===")
schema = {
@@ -165,21 +183,16 @@ def test_structured_extraction(tester: Crawl4AiTester):
"name": "price",
"selector": "td:nth-child(2)",
"type": "text",
}
},
],
}
request = {
"urls": "https://www.coinbase.com/explore",
"priority": 9,
"extraction_config": {
"type": "json_css",
"params": {
"schema": schema
}
}
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
}
result = tester.submit_and_wait(request)
extracted = json.loads(result["result"]["extracted_content"])
print(f"Extracted {len(extracted)} items")
@@ -187,6 +200,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
assert result["result"]["success"]
assert len(extracted) > 0
def test_llm_extraction(tester: Crawl4AiTester):
print("\n=== Testing LLM Extraction ===")
schema = {
@@ -194,20 +208,20 @@ def test_llm_extraction(tester: Crawl4AiTester):
"properties": {
"model_name": {
"type": "string",
"description": "Name of the OpenAI model."
"description": "Name of the OpenAI model.",
},
"input_fee": {
"type": "string",
"description": "Fee for input token for the OpenAI model."
"description": "Fee for input token for the OpenAI model.",
},
"output_fee": {
"type": "string",
"description": "Fee for output token for the OpenAI model."
}
"description": "Fee for output token for the OpenAI model.",
},
},
"required": ["model_name", "input_fee", "output_fee"]
"required": ["model_name", "input_fee", "output_fee"],
}
request = {
"urls": "https://openai.com/api/pricing",
"priority": 8,
@@ -218,12 +232,12 @@ def test_llm_extraction(tester: Crawl4AiTester):
"api_token": os.getenv("OPENAI_API_KEY"),
"schema": schema,
"extraction_type": "schema",
"instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
}
"instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""",
},
},
"crawler_params": {"word_count_threshold": 1}
"crawler_params": {"word_count_threshold": 1},
}
try:
result = tester.submit_and_wait(request)
extracted = json.loads(result["result"]["extracted_content"])
@@ -233,6 +247,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
     except Exception as e:
         print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")
 
+
 def test_llm_with_ollama(tester: Crawl4AiTester):
     print("\n=== Testing LLM with Ollama ===")
     schema = {
@@ -240,20 +255,20 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
"properties": {
"article_title": {
"type": "string",
"description": "The main title of the news article"
"description": "The main title of the news article",
},
"summary": {
"type": "string",
"description": "A brief summary of the article content"
"description": "A brief summary of the article content",
},
"main_topics": {
"type": "array",
"items": {"type": "string"},
"description": "Main topics or themes discussed in the article"
}
}
"description": "Main topics or themes discussed in the article",
},
},
}
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 8,
@@ -263,13 +278,13 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
"provider": "ollama/llama2",
"schema": schema,
"extraction_type": "schema",
"instruction": "Extract the main article information including title, summary, and main topics."
}
"instruction": "Extract the main article information including title, summary, and main topics.",
},
},
"extra": {"word_count_threshold": 1},
"crawler_params": {"verbose": True}
"crawler_params": {"verbose": True},
}
try:
result = tester.submit_and_wait(request)
extracted = json.loads(result["result"]["extracted_content"])
@@ -278,6 +293,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
     except Exception as e:
         print(f"Ollama extraction test failed: {str(e)}")
 
+
 def test_cosine_extraction(tester: Crawl4AiTester):
     print("\n=== Testing Cosine Extraction ===")
     request = {
@@ -289,11 +305,11 @@ def test_cosine_extraction(tester: Crawl4AiTester):
"semantic_filter": "business finance economy",
"word_count_threshold": 10,
"max_dist": 0.2,
"top_k": 3
}
}
"top_k": 3,
},
},
}
try:
result = tester.submit_and_wait(request)
extracted = json.loads(result["result"]["extracted_content"])
@@ -303,30 +319,30 @@ def test_cosine_extraction(tester: Crawl4AiTester):
     except Exception as e:
         print(f"Cosine extraction test failed: {str(e)}")
 
+
 def test_screenshot(tester: Crawl4AiTester):
     print("\n=== Testing Screenshot ===")
     request = {
         "urls": "https://www.nbcnews.com/business",
         "priority": 5,
         "screenshot": True,
-        "crawler_params": {
-            "headless": True
-        }
+        "crawler_params": {"headless": True},
     }
 
     result = tester.submit_and_wait(request)
     print("Screenshot captured:", bool(result["result"]["screenshot"]))
 
     if result["result"]["screenshot"]:
         # Save screenshot
         screenshot_data = base64.b64decode(result["result"]["screenshot"])
         with open("test_screenshot.jpg", "wb") as f:
             f.write(screenshot_data)
         print("Screenshot saved as test_screenshot.jpg")
 
     assert result["result"]["success"]
 
+
 if __name__ == "__main__":
     version = sys.argv[1] if len(sys.argv) > 1 else "basic"
     # version = "full"
     test_docker_deployment(version)
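For orientation, here is a minimal sketch of driving the reformatted tester directly rather than through test_docker_deployment. It assumes the definitions above are in scope and that a Crawl4AI server is reachable at the default http://localhost:11235, with CRAWL4AI_API_TOKEN exported if the deployment enforces auth; the request shape mirrors test_basic_crawl. This snippet is not part of the commit.

# Sketch only: assumes the Crawl4AiTester class above is in scope and a
# local Crawl4AI server is running on the default port.
tester = Crawl4AiTester()  # picks up CRAWL4AI_API_TOKEN from the environment
result = tester.submit_and_wait(
    {"urls": "https://www.nbcnews.com/business", "priority": 10},
    timeout=120,
)
print(result["result"]["markdown"][:300])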