Apply Ruff Corrections

2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions
--- a/docs/examples/docker_example.py
+++ b/docs/examples/docker_example.py
@@ -6,63 +6,80 @@ import base64
 import os
 from typing import Dict, Any

+
 class Crawl4AiTester:
    def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
        self.base_url = base_url
-        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"  # Check environment variable as fallback
-        self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
-        
-    def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
+        self.api_token = (
+            api_token or os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
+        )  # Check environment variable as fallback
+        self.headers = (
+            {"Authorization": f"Bearer {self.api_token}"} if self.api_token else {}
+        )
+
+    def submit_and_wait(
+        self, request_data: Dict[str, Any], timeout: int = 300
+    ) -> Dict[str, Any]:
        # Submit crawl job
-        response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
+        response = requests.post(
+            f"{self.base_url}/crawl", json=request_data, headers=self.headers
+        )
        if response.status_code == 403:
            raise Exception("API token is invalid or missing")
        task_id = response.json()["task_id"]
        print(f"Task ID: {task_id}")
-        
+
        # Poll for result
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
-                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
-                
-            result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
+                raise TimeoutError(
+                    f"Task {task_id} did not complete within {timeout} seconds"
+                )
+
+            result = requests.get(
+                f"{self.base_url}/task/{task_id}", headers=self.headers
+            )
            status = result.json()
-            
+
            if status["status"] == "failed":
                print("Task failed:", status.get("error"))
                raise Exception(f"Task failed: {status.get('error')}")
-                
+
            if status["status"] == "completed":
                return status
-                
+
            time.sleep(2)
-            
+
    def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
-        response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
+        response = requests.post(
+            f"{self.base_url}/crawl_sync",
+            json=request_data,
+            headers=self.headers,
+            timeout=60,
+        )
        if response.status_code == 408:
            raise TimeoutError("Task did not complete within server timeout")
        response.raise_for_status()
        return response.json()
-    
+
    def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """Directly crawl without using task queue"""
        response = requests.post(
-            f"{self.base_url}/crawl_direct", 
-            json=request_data, 
-            headers=self.headers
+            f"{self.base_url}/crawl_direct", json=request_data, headers=self.headers
        )
        response.raise_for_status()
        return response.json()

+
 def test_docker_deployment(version="basic"):
    tester = Crawl4AiTester(
-        base_url="http://localhost:11235" ,
+        base_url="http://localhost:11235",
        # base_url="https://api.crawl4ai.com" # just for example
        # api_token="test" # just for example
    )
    print(f"Testing Crawl4AI Docker {version} version")
-    
+
    # Health check with timeout and retry
    max_retries = 5
    for i in range(max_retries):
@@ -70,19 +87,19 @@ def test_docker_deployment(version="basic"):
            health = requests.get(f"{tester.base_url}/health", timeout=10)
            print("Health check:", health.json())
            break
-        except requests.exceptions.RequestException as e:
+        except requests.exceptions.RequestException:
            if i == max_retries - 1:
                print(f"Failed to connect after {max_retries} attempts")
                sys.exit(1)
            print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
            time.sleep(5)
-    
+
    # Test cases based on version
    test_basic_crawl_direct(tester)
    test_basic_crawl(tester)
    test_basic_crawl(tester)
    test_basic_crawl_sync(tester)
-    
+
    if version in ["full", "transformer"]:
        test_cosine_extraction(tester)

@@ -92,49 +109,52 @@ def test_docker_deployment(version="basic"):
    test_llm_extraction(tester)
    test_llm_with_ollama(tester)
    test_screenshot(tester)
-    
+

 def test_basic_crawl(tester: Crawl4AiTester):
    print("\n=== Testing Basic Crawl ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
-        "priority": 10, 
-        "session_id": "test"
+        "priority": 10,
+        "session_id": "test",
    }
-    
+
    result = tester.submit_and_wait(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]
    assert len(result["result"]["markdown"]) > 0

+
 def test_basic_crawl_sync(tester: Crawl4AiTester):
    print("\n=== Testing Basic Crawl (Sync) ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10,
-        "session_id": "test"
+        "session_id": "test",
    }
-    
+
    result = tester.submit_sync(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
-    assert result['status'] == 'completed'
-    assert result['result']['success']
-    assert len(result['result']['markdown']) > 0
-    
+    assert result["status"] == "completed"
+    assert result["result"]["success"]
+    assert len(result["result"]["markdown"]) > 0
+
+
 def test_basic_crawl_direct(tester: Crawl4AiTester):
    print("\n=== Testing Basic Crawl (Direct) ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10,
        # "session_id": "test"
-        "cache_mode": "bypass"  # or "enabled", "disabled", "read_only", "write_only"
+        "cache_mode": "bypass",  # or "enabled", "disabled", "read_only", "write_only"
    }
-    
+
    result = tester.crawl_direct(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
-    assert result['result']['success']
-    assert len(result['result']['markdown']) > 0
-    
+    assert result["result"]["success"]
+    assert len(result["result"]["markdown"]) > 0
+
+
 def test_js_execution(tester: Crawl4AiTester):
    print("\n=== Testing JS Execution ===")
    request = {
@@ -144,32 +164,29 @@ def test_js_execution(tester: Crawl4AiTester):
            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
        ],
        "wait_for": "article.tease-card:nth-child(10)",
-        "crawler_params": {
-            "headless": True
-        }
+        "crawler_params": {"headless": True},
    }
-    
+
    result = tester.submit_and_wait(request)
    print(f"JS execution result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]

+
 def test_css_selector(tester: Crawl4AiTester):
    print("\n=== Testing CSS Selector ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 7,
        "css_selector": ".wide-tease-item__description",
-        "crawler_params": {
-            "headless": True
-        },
-        "extra": {"word_count_threshold": 10}
-        
+        "crawler_params": {"headless": True},
+        "extra": {"word_count_threshold": 10},
    }
-    
+
    result = tester.submit_and_wait(request)
    print(f"CSS selector result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]

+
 def test_structured_extraction(tester: Crawl4AiTester):
    print("\n=== Testing Structured Extraction ===")
    schema = {
@@ -190,21 +207,16 @@ def test_structured_extraction(tester: Crawl4AiTester):
                "name": "price",
                "selector": "td:nth-child(2)",
                "type": "text",
-            }
+            },
        ],
    }
-    
+
    request = {
        "urls": "https://www.coinbase.com/explore",
        "priority": 9,
-        "extraction_config": {
-            "type": "json_css",
-            "params": {
-                "schema": schema
-            }
-        }
+        "extraction_config": {"type": "json_css", "params": {"schema": schema}},
    }
-    
+
    result = tester.submit_and_wait(request)
    extracted = json.loads(result["result"]["extracted_content"])
    print(f"Extracted {len(extracted)} items")
@@ -212,6 +224,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
    assert result["result"]["success"]
    assert len(extracted) > 0

+
 def test_llm_extraction(tester: Crawl4AiTester):
    print("\n=== Testing LLM Extraction ===")
    schema = {
@@ -219,20 +232,20 @@ def test_llm_extraction(tester: Crawl4AiTester):
        "properties": {
            "model_name": {
                "type": "string",
-                "description": "Name of the OpenAI model."
+                "description": "Name of the OpenAI model.",
            },
            "input_fee": {
                "type": "string",
-                "description": "Fee for input token for the OpenAI model."
+                "description": "Fee for input token for the OpenAI model.",
            },
            "output_fee": {
                "type": "string",
-                "description": "Fee for output token for the OpenAI model."
-            }
+                "description": "Fee for output token for the OpenAI model.",
+            },
        },
-        "required": ["model_name", "input_fee", "output_fee"]
+        "required": ["model_name", "input_fee", "output_fee"],
    }
-    
+
    request = {
        "urls": "https://openai.com/api/pricing",
        "priority": 8,
@@ -243,12 +256,12 @@ def test_llm_extraction(tester: Crawl4AiTester):
                "api_token": os.getenv("OPENAI_API_KEY"),
                "schema": schema,
                "extraction_type": "schema",
-                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
-            }
+                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""",
+            },
        },
-        "crawler_params": {"word_count_threshold": 1}
+        "crawler_params": {"word_count_threshold": 1},
    }
-    
+
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
@@ -258,6 +271,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
    except Exception as e:
        print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")

+
 def test_llm_with_ollama(tester: Crawl4AiTester):
    print("\n=== Testing LLM with Ollama ===")
    schema = {
@@ -265,20 +279,20 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
        "properties": {
            "article_title": {
                "type": "string",
-                "description": "The main title of the news article"
+                "description": "The main title of the news article",
            },
            "summary": {
                "type": "string",
-                "description": "A brief summary of the article content"
+                "description": "A brief summary of the article content",
            },
            "main_topics": {
                "type": "array",
                "items": {"type": "string"},
-                "description": "Main topics or themes discussed in the article"
-            }
-        }
+                "description": "Main topics or themes discussed in the article",
+            },
+        },
    }
-    
+
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
@@ -288,13 +302,13 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
                "provider": "ollama/llama2",
                "schema": schema,
                "extraction_type": "schema",
-                "instruction": "Extract the main article information including title, summary, and main topics."
-            }
+                "instruction": "Extract the main article information including title, summary, and main topics.",
+            },
        },
        "extra": {"word_count_threshold": 1},
-        "crawler_params": {"verbose": True}
+        "crawler_params": {"verbose": True},
    }
-    
+
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
@@ -303,6 +317,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
    except Exception as e:
        print(f"Ollama extraction test failed: {str(e)}")

+
 def test_cosine_extraction(tester: Crawl4AiTester):
    print("\n=== Testing Cosine Extraction ===")
    request = {
@@ -314,11 +329,11 @@ def test_cosine_extraction(tester: Crawl4AiTester):
                "semantic_filter": "business finance economy",
                "word_count_threshold": 10,
                "max_dist": 0.2,
-                "top_k": 3
-            }
-        }
+                "top_k": 3,
+            },
+        },
    }
-    
+
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
@@ -328,30 +343,30 @@ def test_cosine_extraction(tester: Crawl4AiTester):
    except Exception as e:
        print(f"Cosine extraction test failed: {str(e)}")

+
 def test_screenshot(tester: Crawl4AiTester):
    print("\n=== Testing Screenshot ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 5,
        "screenshot": True,
-        "crawler_params": {
-            "headless": True
-        }
+        "crawler_params": {"headless": True},
    }
-    
+
    result = tester.submit_and_wait(request)
    print("Screenshot captured:", bool(result["result"]["screenshot"]))
-    
+
    if result["result"]["screenshot"]:
        # Save screenshot
        screenshot_data = base64.b64decode(result["result"]["screenshot"])
        with open("test_screenshot.jpg", "wb") as f:
            f.write(screenshot_data)
        print("Screenshot saved as test_screenshot.jpg")
-    
+
    assert result["result"]["success"]

+
 if __name__ == "__main__":
    version = sys.argv[1] if len(sys.argv) > 1 else "basic"
    # version = "full"
-    test_docker_deployment(version)
+    test_docker_deployment(version)