Apply Ruff Corrections

2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -3,8 +3,8 @@ import aiohttp
 import json
 import time
 import os
-from typing import Optional, Dict, Any
-from pydantic import BaseModel, HttpUrl
+from typing import Dict, Any
+

 class NBCNewsAPITest:
    def __init__(self, base_url: str = "http://localhost:8000"):
@@ -20,7 +20,9 @@ class NBCNewsAPITest:
            await self.session.close()

    async def submit_crawl(self, request_data: Dict[str, Any]) -> str:
-        async with self.session.post(f"{self.base_url}/crawl", json=request_data) as response:
+        async with self.session.post(
+            f"{self.base_url}/crawl", json=request_data
+        ) as response:
            result = await response.json()
            return result["task_id"]

@@ -28,11 +30,15 @@ class NBCNewsAPITest:
        async with self.session.get(f"{self.base_url}/task/{task_id}") as response:
            return await response.json()

-    async def wait_for_task(self, task_id: str, timeout: int = 300, poll_interval: int = 2) -> Dict[str, Any]:
+    async def wait_for_task(
+        self, task_id: str, timeout: int = 300, poll_interval: int = 2
+    ) -> Dict[str, Any]:
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
-                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
+                raise TimeoutError(
+                    f"Task {task_id} did not complete within {timeout} seconds"
+                )

            status = await self.get_task_status(task_id)
            if status["status"] in ["completed", "failed"]:
@@ -44,13 +50,11 @@ class NBCNewsAPITest:
        async with self.session.get(f"{self.base_url}/health") as response:
            return await response.json()

+
 async def test_basic_crawl():
    print("\n=== Testing Basic Crawl ===")
    async with NBCNewsAPITest() as api:
-        request = {
-            "urls": "https://www.nbcnews.com/business",
-            "priority": 10
-        }
+        request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
        print(f"Basic crawl result length: {len(result['result']['markdown'])}")
@@ -58,6 +62,7 @@ async def test_basic_crawl():
        assert "result" in result
        assert result["result"]["success"]

+
 async def test_js_execution():
    print("\n=== Testing JS Execution ===")
    async with NBCNewsAPITest() as api:
@@ -68,9 +73,7 @@ async def test_js_execution():
                "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
            ],
            "wait_for": "article.tease-card:nth-child(10)",
-            "crawler_params": {
-                "headless": True
-            }
+            "crawler_params": {"headless": True},
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
@@ -78,13 +81,14 @@ async def test_js_execution():
        assert result["status"] == "completed"
        assert result["result"]["success"]

+
 async def test_css_selector():
    print("\n=== Testing CSS Selector ===")
    async with NBCNewsAPITest() as api:
        request = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 7,
-            "css_selector": ".wide-tease-item__description"
+            "css_selector": ".wide-tease-item__description",
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
@@ -92,6 +96,7 @@ async def test_css_selector():
        assert result["status"] == "completed"
        assert result["result"]["success"]

+
 async def test_structured_extraction():
    print("\n=== Testing Structured Extraction ===")
    async with NBCNewsAPITest() as api:
@@ -99,34 +104,25 @@ async def test_structured_extraction():
            "name": "NBC News Articles",
            "baseSelector": "article.tease-card",
            "fields": [
-                {
-                    "name": "title",
-                    "selector": "h2",
-                    "type": "text"
-                },
+                {"name": "title", "selector": "h2", "type": "text"},
                {
                    "name": "description",
                    "selector": ".tease-card__description",
-                    "type": "text"
+                    "type": "text",
                },
                {
                    "name": "link",
                    "selector": "a",
                    "type": "attribute",
-                    "attribute": "href"
-                }
-            ]
+                    "attribute": "href",
+                },
+            ],
        }
-        
+
        request = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 9,
-            "extraction_config": {
-                "type": "json_css",
-                "params": {
-                    "schema": schema
-                }
-            }
+            "extraction_config": {"type": "json_css", "params": {"schema": schema}},
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
@@ -136,6 +132,7 @@ async def test_structured_extraction():
        assert result["result"]["success"]
        assert len(extracted) > 0

+
 async def test_batch_crawl():
    print("\n=== Testing Batch Crawl ===")
    async with NBCNewsAPITest() as api:
@@ -143,12 +140,10 @@ async def test_batch_crawl():
            "urls": [
                "https://www.nbcnews.com/business",
                "https://www.nbcnews.com/business/consumer",
-                "https://www.nbcnews.com/business/economy"
+                "https://www.nbcnews.com/business/economy",
            ],
            "priority": 6,
-            "crawler_params": {
-                "headless": True
-            }
+            "crawler_params": {"headless": True},
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
@@ -157,6 +152,7 @@ async def test_batch_crawl():
        assert "results" in result
        assert len(result["results"]) == 3

+
 async def test_llm_extraction():
    print("\n=== Testing LLM Extraction with Ollama ===")
    async with NBCNewsAPITest() as api:
@@ -165,19 +161,19 @@ async def test_llm_extraction():
            "properties": {
                "article_title": {
                    "type": "string",
-                    "description": "The main title of the news article"
+                    "description": "The main title of the news article",
                },
                "summary": {
                    "type": "string",
-                    "description": "A brief summary of the article content"
+                    "description": "A brief summary of the article content",
                },
                "main_topics": {
                    "type": "array",
                    "items": {"type": "string"},
-                    "description": "Main topics or themes discussed in the article"
-                }
+                    "description": "Main topics or themes discussed in the article",
+                },
            },
-            "required": ["article_title", "summary", "main_topics"]
+            "required": ["article_title", "summary", "main_topics"],
        }

        request = {
@@ -191,26 +187,24 @@ async def test_llm_extraction():
                    "schema": schema,
                    "extraction_type": "schema",
                    "instruction": """Extract the main article information including title, a brief summary, and main topics discussed. 
-                    Focus on the primary business news article on the page."""
-                }
+                    Focus on the primary business news article on the page.""",
+                },
            },
-            "crawler_params": {
-                "headless": True,
-                "word_count_threshold": 1
-            }
+            "crawler_params": {"headless": True, "word_count_threshold": 1},
        }
-        
+
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
-        
+
        if result["status"] == "completed":
            extracted = json.loads(result["result"]["extracted_content"])
-            print(f"Extracted article analysis:")
+            print("Extracted article analysis:")
            print(json.dumps(extracted, indent=2))
-        
+
        assert result["status"] == "completed"
        assert result["result"]["success"]

+
 async def test_screenshot():
    print("\n=== Testing Screenshot ===")
    async with NBCNewsAPITest() as api:
@@ -218,9 +212,7 @@ async def test_screenshot():
            "urls": "https://www.nbcnews.com/business",
            "priority": 5,
            "screenshot": True,
-            "crawler_params": {
-                "headless": True
-            }
+            "crawler_params": {"headless": True},
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
@@ -229,6 +221,7 @@ async def test_screenshot():
        assert result["result"]["success"]
        assert result["result"]["screenshot"] is not None

+
 async def test_priority_handling():
    print("\n=== Testing Priority Handling ===")
    async with NBCNewsAPITest() as api:
@@ -236,7 +229,7 @@ async def test_priority_handling():
        low_priority = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 1,
-            "crawler_params": {"headless": True}
+            "crawler_params": {"headless": True},
        }
        low_task_id = await api.submit_crawl(low_priority)

@@ -244,7 +237,7 @@ async def test_priority_handling():
        high_priority = {
            "urls": "https://www.nbcnews.com/business/consumer",
            "priority": 10,
-            "crawler_params": {"headless": True}
+            "crawler_params": {"headless": True},
        }
        high_task_id = await api.submit_crawl(high_priority)

@@ -256,6 +249,7 @@ async def test_priority_handling():
        assert high_result["status"] == "completed"
        assert low_result["status"] == "completed"

+
 async def main():
    try:
        # Start with health check
@@ -277,5 +271,6 @@ async def main():
        print(f"Test failed: {str(e)}")
        raise

+
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())