feat(docker): add JWT authentication and improve server architecture
Add JWT token-based authentication to the Docker server and client. Refactor server architecture for better code organization and error handling. Move Dockerfile to the root deploy directory and update configuration. Add comprehensive documentation and examples.

BREAKING CHANGE: The Docker server now requires authentication by default. Endpoints require JWT tokens when security.jwt_enabled is true in the config.
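For reference, the flow the new examples exercise is: request a token from /token with an email payload, then send it as a Bearer header on every endpoint call. Below is a minimal synchronous sketch of that flow, assuming the server is running locally on port 8000 and using the requests library instead of the aiohttp/SDK clients shown in the example files added by this commit; the payload shapes mirror the REST example.

import requests

BASE = "http://localhost:8000"

# 1. Exchange an email for a JWT (only needed when security.jwt_enabled is true).
token_resp = requests.post(f"{BASE}/token", json={"email": "test@example.com"})
token_resp.raise_for_status()
token = token_resp.json()["access_token"]

# 2. Call a protected endpoint with the token in the Authorization header.
headers = {"Authorization": f"Bearer {token}"}
crawl_resp = requests.post(
    f"{BASE}/crawl",
    json={"urls": ["https://example.com"], "crawler_config": {"stream": False}},
    headers=headers,
)
print(crawl_resp.status_code, crawl_resp.json())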
docs/examples/docker_python_rest_api_example.py (new file, 214 lines added)
@@ -0,0 +1,214 @@
import asyncio
import json
from typing import Optional
from urllib.parse import quote


async def get_token(session, email: str = "test@example.com") -> str:
    """Fetch a JWT token from the /token endpoint."""
    url = "http://localhost:8000/token"
    payload = {"email": email}
    print(f"\nFetching token from {url} with email: {email}")
    try:
        async with session.post(url, json=payload) as response:
            status = response.status
            data = await response.json()
            print(f"Token Response Status: {status}")
            print(f"Token Response: {json.dumps(data, indent=2)}")
            if status == 200:
                return data["access_token"]
            else:
                raise Exception(f"Failed to get token: {data.get('detail', 'Unknown error')}")
    except Exception as e:
        print(f"Error fetching token: {str(e)}")
        raise


async def test_endpoint(
    session,
    endpoint: str,
    url: str,
    token: str,
    params: Optional[dict] = None,
    expected_status: int = 200
) -> Optional[dict]:
    """Test an endpoint with token and print results."""
    params = params or {}
    param_str = "&".join(f"{k}={v}" for k, v in params.items())
    full_url = f"http://localhost:8000/{endpoint}/{quote(url)}"
    if param_str:
        full_url += f"?{param_str}"

    headers = {"Authorization": f"Bearer {token}"}
    print(f"\nTesting: {full_url}")

    try:
        async with session.get(full_url, headers=headers) as response:
            status = response.status
            try:
                data = await response.json()
            except Exception:
                data = await response.text()

            print(f"Status: {status} (Expected: {expected_status})")
            if isinstance(data, dict):
                print(f"Response: {json.dumps(data, indent=2)}")
            else:
                print(f"Response: {data[:500]}...")  # First 500 chars
            assert status == expected_status, f"Expected {expected_status}, got {status}"
            return data
    except Exception as e:
        print(f"Error: {str(e)}")
        return None


async def test_stream_crawl(session, token: str):
    """Test the /crawl/stream endpoint with multiple URLs."""
    url = "http://localhost:8000/crawl/stream"
    payload = {
        "urls": [
            "https://example.com",
            "https://example.com/page1",  # Replicated example.com with variation
            "https://example.com/page2",  # Replicated example.com with variation
            "https://example.com/page3",  # Replicated example.com with variation
            # "https://www.python.org",
            # "https://news.ycombinator.com/news"
        ],
        "browser_config": {"headless": True, "viewport": {"width": 1200}},
        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
    }
    headers = {"Authorization": f"Bearer {token}"}
    print(f"\nTesting Streaming Crawl: {url}")
    print(f"Payload: {json.dumps(payload, indent=2)}")

    try:
        async with session.post(url, json=payload, headers=headers) as response:
            status = response.status
            print(f"Status: {status} (Expected: 200)")
            assert status == 200, f"Expected 200, got {status}"

            # Read streaming response line-by-line (NDJSON)
            async for line in response.content:
                if line:
                    data = json.loads(line.decode('utf-8').strip())
                    print(f"Streamed Result: {json.dumps(data, indent=2)}")
    except Exception as e:
        print(f"Error in streaming crawl test: {str(e)}")


async def run_tests():
    import aiohttp
    print("Starting API Tests...")

    # Test URLs
    urls = [
        "example.com",
        "https://www.python.org",
        "https://news.ycombinator.com/news",
        "https://github.com/trending"
    ]

    async with aiohttp.ClientSession() as session:
        token = "test_token"
        # If JWT is enabled, authenticate first:
        # fetch the token once and reuse it.
        # token = await get_token(session)
        # if not token:
        #     print("Aborting tests due to token failure!")
        #     return

        print("\n=== Testing Crawl Endpoint ===")
        crawl_payload = {
            "urls": ["https://example.com"],
            "browser_config": {"headless": True},
            "crawler_config": {"stream": False}
        }
        async with session.post(
            "http://localhost:8000/crawl",
            json=crawl_payload,
            headers={"Authorization": f"Bearer {token}"}
        ) as response:
            status = response.status
            data = await response.json()
            print(f"\nCrawl Endpoint Status: {status}")
            print(f"Crawl Response: {json.dumps(data, indent=2)}")

        print("\n=== Testing Crawl Stream Endpoint ===")
        await test_stream_crawl(session, token)

        print("\n=== Testing Markdown Endpoint ===")
        for url in []:  # urls:
            for filter_type in ["raw", "fit", "bm25", "llm"]:
                params = {"f": filter_type}
                if filter_type in ["bm25", "llm"]:
                    params["q"] = "extract main content"

                for cache in ["0", "1"]:
                    params["c"] = cache
                    await test_endpoint(session, "md", url, token, params)
                    await asyncio.sleep(1)  # Be nice to the server

        print("\n=== Testing LLM Endpoint ===")
        for url in urls:
            # Test basic extraction (direct response now)
            result = await test_endpoint(
                session,
                "llm",
                url,
                token,
                {"q": "Extract title and main content"}
            )

            # Test with schema (direct response)
            schema = {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "content": {"type": "string"},
                    "links": {"type": "array", "items": {"type": "string"}}
                }
            }
            result = await test_endpoint(
                session,
                "llm",
                url,
                token,
                {
                    "q": "Extract content with links",
                    "s": json.dumps(schema),
                    "c": "1"  # Test with cache
                }
            )
            await asyncio.sleep(2)  # Be nice to the server

        print("\n=== Testing Error Cases ===")
        # Test invalid URL
        await test_endpoint(
            session,
            "md",
            "not_a_real_url",
            token,
            expected_status=500
        )

        # Test invalid filter type
        await test_endpoint(
            session,
            "md",
            "example.com",
            token,
            {"f": "invalid"},
            expected_status=422
        )

        # Test LLM without a query (should fail per the server logic)
        await test_endpoint(
            session,
            "llm",
            "example.com",
            token,
            expected_status=400
        )

        print("\nAll tests completed!")


if __name__ == "__main__":
    asyncio.run(run_tests())
docs/examples/docker_python_sdk_example.py (new file, 35 lines added)
@@ -0,0 +1,35 @@
import asyncio
from crawl4ai.docker_client import Crawl4aiDockerClient
from crawl4ai import (
    BrowserConfig,
    CrawlerRunConfig
)


async def main():
    async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client:
        # If JWT is enabled, authenticate first
        # await client.authenticate("test@example.com")

        # Non-streaming crawl
        results = await client.crawl(
            ["https://example.com", "https://python.org"],
            browser_config=BrowserConfig(headless=True),
            crawler_config=CrawlerRunConfig()
        )
        print(f"Non-streaming results: {results}")

        # Streaming crawl
        crawler_config = CrawlerRunConfig(stream=True)
        async for result in await client.crawl(
            ["https://example.com", "https://python.org"],
            browser_config=BrowserConfig(headless=True),
            crawler_config=crawler_config
        ):
            print(f"Streamed result: {result}")

        # Get schema
        schema = await client.get_schema()
        print(f"Schema: {schema}")


if __name__ == "__main__":
    asyncio.run(main())