refactor(config): enhance serialization and config handling
- Add ignore_default_value option to to_serializable_dict. - Add viewport dict support in BrowserConfig. - Replace FastFilterChain with FilterChain. - Add deprecation warnings for unwanted properties. - Clean up unused imports. - Rename example files for consistency. - Add comprehensive Docker configuration tutorial. BREAKING CHANGE: FastFilterChain has been replaced with FilterChain.
This commit is contained in:
214
docs/examples/docker_python_rest_api.py
Normal file
214
docs/examples/docker_python_rest_api.py
Normal file
@@ -0,0 +1,214 @@
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Optional
|
||||
from urllib.parse import quote
|
||||
|
||||
async def get_token(session, email: str = "test@example.com") -> str:
    """Request a JWT from the server's /token endpoint.

    Posts the given email to the token endpoint, logs the raw response,
    and returns the ``access_token`` string on HTTP 200. Any failure
    (network error or non-200 status) is printed and re-raised.
    """
    token_url = "http://localhost:8000/token"
    body = {"email": email}
    print(f"\nFetching token from {token_url} with email: {email}")
    try:
        async with session.post(token_url, json=body) as response:
            status = response.status
            data = await response.json()
            print(f"Token Response Status: {status}")
            print(f"Token Response: {json.dumps(data, indent=2)}")
            # Guard clause: anything other than 200 is a hard failure.
            if status != 200:
                raise Exception(f"Failed to get token: {data.get('detail', 'Unknown error')}")
            return data["access_token"]
    except Exception as e:
        print(f"Error fetching token: {str(e)}")
        raise
||||
|
||||
async def test_endpoint(
|
||||
session,
|
||||
endpoint: str,
|
||||
url: str,
|
||||
token: str,
|
||||
params: Optional[dict] = None,
|
||||
expected_status: int = 200
|
||||
) -> Optional[dict]:
|
||||
"""Test an endpoint with token and print results."""
|
||||
params = params or {}
|
||||
param_str = "&".join(f"{k}={v}" for k, v in params.items())
|
||||
full_url = f"http://localhost:8000/{endpoint}/{quote(url)}"
|
||||
if param_str:
|
||||
full_url += f"?{param_str}"
|
||||
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
print(f"\nTesting: {full_url}")
|
||||
|
||||
try:
|
||||
async with session.get(full_url, headers=headers) as response:
|
||||
status = response.status
|
||||
try:
|
||||
data = await response.json()
|
||||
except:
|
||||
data = await response.text()
|
||||
|
||||
print(f"Status: {status} (Expected: {expected_status})")
|
||||
if isinstance(data, dict):
|
||||
print(f"Response: {json.dumps(data, indent=2)}")
|
||||
else:
|
||||
print(f"Response: {data[:500]}...") # First 500 chars
|
||||
assert status == expected_status, f"Expected {expected_status}, got {status}"
|
||||
return data
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
||||
return None
|
||||
|
||||
|
||||
async def test_stream_crawl(session, token: str):
|
||||
"""Test the /crawl/stream endpoint with multiple URLs."""
|
||||
url = "http://localhost:8000/crawl/stream"
|
||||
payload = {
|
||||
"urls": [
|
||||
"https://example.com",
|
||||
"https://example.com/page1", # Replicated example.com with variation
|
||||
"https://example.com/page2", # Replicated example.com with variation
|
||||
"https://example.com/page3", # Replicated example.com with variation
|
||||
# "https://www.python.org",
|
||||
# "https://news.ycombinator.com/news"
|
||||
],
|
||||
"browser_config": {"headless": True, "viewport": {"width": 1200}},
|
||||
"crawler_config": {"stream": True, "cache_mode": "aggressive"}
|
||||
}
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
print(f"\nTesting Streaming Crawl: {url}")
|
||||
print(f"Payload: {json.dumps(payload, indent=2)}")
|
||||
|
||||
try:
|
||||
async with session.post(url, json=payload, headers=headers) as response:
|
||||
status = response.status
|
||||
print(f"Status: {status} (Expected: 200)")
|
||||
assert status == 200, f"Expected 200, got {status}"
|
||||
|
||||
# Read streaming response line-by-line (NDJSON)
|
||||
async for line in response.content:
|
||||
if line:
|
||||
data = json.loads(line.decode('utf-8').strip())
|
||||
print(f"Streamed Result: {json.dumps(data, indent=2)}")
|
||||
except Exception as e:
|
||||
print(f"Error in streaming crawl test: {str(e)}")
|
||||
|
||||
async def run_tests():
    """Drive the full REST API smoke test against http://localhost:8000.

    Exercises, in order: the /crawl endpoint, the /crawl/stream endpoint,
    the /md (markdown) endpoint (currently disabled — see NOTE below),
    the /llm endpoint with and without a schema, and three expected-error
    cases. Requires the third-party ``aiohttp`` package; it is imported
    lazily here so the module can be imported without it.
    """
    import aiohttp
    print("Starting API Tests...")

    # Test URLs
    urls = [
        "example.com",
        "https://www.python.org",
        "https://news.ycombinator.com/news",
        "https://github.com/trending"
    ]

    async with aiohttp.ClientSession() as session:
        # Placeholder token used when JWT auth is disabled on the server.
        token = "test_token"
        # If jwt is enabled, authenticate first
        # Fetch token once and reuse it
        # token = await get_token(session)
        # if not token:
        #     print("Aborting tests due to token failure!")
        #     return

        print("\n=== Testing Crawl Endpoint ===")
        crawl_payload = {
            "urls": ["https://example.com"],
            "browser_config": {"headless": True},
            "crawler_config": {"stream": False}
        }
        async with session.post(
            "http://localhost:8000/crawl",
            json=crawl_payload,
            headers={"Authorization": f"Bearer {token}"}
        ) as response:
            status = response.status
            data = await response.json()
            print(f"\nCrawl Endpoint Status: {status}")
            print(f"Crawl Response: {json.dumps(data, indent=2)}")

        print("\n=== Testing Crawl Stream Endpoint ===")
        await test_stream_crawl(session, token)

        print("\n=== Testing Markdown Endpoint ===")
        # NOTE(review): this loop is currently disabled (iterates an empty
        # list); restore `urls` in place of `[]` to re-enable it.
        for url in []: #urls:
            for filter_type in ["raw", "fit", "bm25", "llm"]:
                params = {"f": filter_type}
                # bm25/llm filters additionally require a query string.
                if filter_type in ["bm25", "llm"]:
                    params["q"] = "extract main content"

                # Exercise both cache-bypass ("0") and cached ("1") paths.
                for cache in ["0", "1"]:
                    params["c"] = cache
                    await test_endpoint(session, "md", url, token, params)
                    await asyncio.sleep(1)  # Be nice to the server

        print("\n=== Testing LLM Endpoint ===")
        for url in urls:
            # Test basic extraction (direct response now)
            result = await test_endpoint(
                session,
                "llm",
                url,
                token,
                {"q": "Extract title and main content"}
            )

            # Test with schema (direct response)
            schema = {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "content": {"type": "string"},
                    "links": {"type": "array", "items": {"type": "string"}}
                }
            }
            result = await test_endpoint(
                session,
                "llm",
                url,
                token,
                {
                    "q": "Extract content with links",
                    "s": json.dumps(schema),
                    "c": "1"  # Test with cache
                }
            )
            await asyncio.sleep(2)  # Be nice to the server

        print("\n=== Testing Error Cases ===")
        # Test invalid URL
        await test_endpoint(
            session,
            "md",
            "not_a_real_url",
            token,
            expected_status=500
        )

        # Test invalid filter type
        await test_endpoint(
            session,
            "md",
            "example.com",
            token,
            {"f": "invalid"},
            expected_status=422
        )

        # Test LLM without query (should fail per your server logic)
        await test_endpoint(
            session,
            "llm",
            "example.com",
            token,
            expected_status=400
        )

        print("\nAll tests completed!")
||||
|
||||
# Script entry point: run the full API smoke-test suite against a server
# expected to be listening on http://localhost:8000.
if __name__ == "__main__":
    asyncio.run(run_tests())
|
||||
Reference in New Issue
Block a user