refactor(config): enhance serialization and config handling
- Add ignore_default_value option to to_serializable_dict. - Add viewport dict support in BrowserConfig. - Replace FastFilterChain with FilterChain. - Add deprecation warnings for unwanted properties. - Clean up unused imports. - Rename example files for consistency. - Add comprehensive Docker configuration tutorial. BREAKING CHANGE: FastFilterChain has been replaced with FilterChain.
This commit is contained in:
214
docs/examples/docker_python_rest_api.py
Normal file
214
docs/examples/docker_python_rest_api.py
Normal file
@@ -0,0 +1,214 @@
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Optional
|
||||
from urllib.parse import quote
|
||||
|
||||
async def get_token(session, email: str = "test@example.com") -> str:
    """Request a JWT from the server's /token endpoint.

    Posts the given email to the token endpoint, logs the raw response,
    and returns the ``access_token`` string on HTTP 200. Any failure
    (network error or non-200 status) is printed and re-raised.
    """
    token_url = "http://localhost:8000/token"
    body = {"email": email}
    print(f"\nFetching token from {token_url} with email: {email}")
    try:
        async with session.post(token_url, json=body) as response:
            status = response.status
            data = await response.json()
            print(f"Token Response Status: {status}")
            print(f"Token Response: {json.dumps(data, indent=2)}")
            # Guard clause: anything other than 200 is a hard failure.
            if status != 200:
                raise Exception(f"Failed to get token: {data.get('detail', 'Unknown error')}")
            return data["access_token"]
    except Exception as e:
        print(f"Error fetching token: {str(e)}")
        raise
||||
|
||||
async def test_endpoint(
|
||||
session,
|
||||
endpoint: str,
|
||||
url: str,
|
||||
token: str,
|
||||
params: Optional[dict] = None,
|
||||
expected_status: int = 200
|
||||
) -> Optional[dict]:
|
||||
"""Test an endpoint with token and print results."""
|
||||
params = params or {}
|
||||
param_str = "&".join(f"{k}={v}" for k, v in params.items())
|
||||
full_url = f"http://localhost:8000/{endpoint}/{quote(url)}"
|
||||
if param_str:
|
||||
full_url += f"?{param_str}"
|
||||
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
print(f"\nTesting: {full_url}")
|
||||
|
||||
try:
|
||||
async with session.get(full_url, headers=headers) as response:
|
||||
status = response.status
|
||||
try:
|
||||
data = await response.json()
|
||||
except:
|
||||
data = await response.text()
|
||||
|
||||
print(f"Status: {status} (Expected: {expected_status})")
|
||||
if isinstance(data, dict):
|
||||
print(f"Response: {json.dumps(data, indent=2)}")
|
||||
else:
|
||||
print(f"Response: {data[:500]}...") # First 500 chars
|
||||
assert status == expected_status, f"Expected {expected_status}, got {status}"
|
||||
return data
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
||||
return None
|
||||
|
||||
|
||||
async def test_stream_crawl(session, token: str):
|
||||
"""Test the /crawl/stream endpoint with multiple URLs."""
|
||||
url = "http://localhost:8000/crawl/stream"
|
||||
payload = {
|
||||
"urls": [
|
||||
"https://example.com",
|
||||
"https://example.com/page1", # Replicated example.com with variation
|
||||
"https://example.com/page2", # Replicated example.com with variation
|
||||
"https://example.com/page3", # Replicated example.com with variation
|
||||
# "https://www.python.org",
|
||||
# "https://news.ycombinator.com/news"
|
||||
],
|
||||
"browser_config": {"headless": True, "viewport": {"width": 1200}},
|
||||
"crawler_config": {"stream": True, "cache_mode": "aggressive"}
|
||||
}
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
print(f"\nTesting Streaming Crawl: {url}")
|
||||
print(f"Payload: {json.dumps(payload, indent=2)}")
|
||||
|
||||
try:
|
||||
async with session.post(url, json=payload, headers=headers) as response:
|
||||
status = response.status
|
||||
print(f"Status: {status} (Expected: 200)")
|
||||
assert status == 200, f"Expected 200, got {status}"
|
||||
|
||||
# Read streaming response line-by-line (NDJSON)
|
||||
async for line in response.content:
|
||||
if line:
|
||||
data = json.loads(line.decode('utf-8').strip())
|
||||
print(f"Streamed Result: {json.dumps(data, indent=2)}")
|
||||
except Exception as e:
|
||||
print(f"Error in streaming crawl test: {str(e)}")
|
||||
|
||||
async def run_tests():
    """Drive the full REST API smoke test against http://localhost:8000.

    Exercises, in order: the /crawl endpoint, the /crawl/stream endpoint,
    the /md (markdown) endpoint (currently disabled — see NOTE below),
    the /llm endpoint with and without a schema, and three expected-error
    cases. Requires the third-party ``aiohttp`` package; it is imported
    lazily here so the module can be imported without it.
    """
    import aiohttp
    print("Starting API Tests...")

    # Test URLs
    urls = [
        "example.com",
        "https://www.python.org",
        "https://news.ycombinator.com/news",
        "https://github.com/trending"
    ]

    async with aiohttp.ClientSession() as session:
        # Placeholder token used when JWT auth is disabled on the server.
        token = "test_token"
        # If jwt is enabled, authenticate first
        # Fetch token once and reuse it
        # token = await get_token(session)
        # if not token:
        #     print("Aborting tests due to token failure!")
        #     return

        print("\n=== Testing Crawl Endpoint ===")
        crawl_payload = {
            "urls": ["https://example.com"],
            "browser_config": {"headless": True},
            "crawler_config": {"stream": False}
        }
        async with session.post(
            "http://localhost:8000/crawl",
            json=crawl_payload,
            headers={"Authorization": f"Bearer {token}"}
        ) as response:
            status = response.status
            data = await response.json()
            print(f"\nCrawl Endpoint Status: {status}")
            print(f"Crawl Response: {json.dumps(data, indent=2)}")

        print("\n=== Testing Crawl Stream Endpoint ===")
        await test_stream_crawl(session, token)

        print("\n=== Testing Markdown Endpoint ===")
        # NOTE(review): this loop is currently disabled (iterates an empty
        # list); restore `urls` in place of `[]` to re-enable it.
        for url in []: #urls:
            for filter_type in ["raw", "fit", "bm25", "llm"]:
                params = {"f": filter_type}
                # bm25/llm filters additionally require a query string.
                if filter_type in ["bm25", "llm"]:
                    params["q"] = "extract main content"

                # Exercise both cache-bypass ("0") and cached ("1") paths.
                for cache in ["0", "1"]:
                    params["c"] = cache
                    await test_endpoint(session, "md", url, token, params)
                    await asyncio.sleep(1)  # Be nice to the server

        print("\n=== Testing LLM Endpoint ===")
        for url in urls:
            # Test basic extraction (direct response now)
            result = await test_endpoint(
                session,
                "llm",
                url,
                token,
                {"q": "Extract title and main content"}
            )

            # Test with schema (direct response)
            schema = {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "content": {"type": "string"},
                    "links": {"type": "array", "items": {"type": "string"}}
                }
            }
            result = await test_endpoint(
                session,
                "llm",
                url,
                token,
                {
                    "q": "Extract content with links",
                    "s": json.dumps(schema),
                    "c": "1"  # Test with cache
                }
            )
            await asyncio.sleep(2)  # Be nice to the server

        print("\n=== Testing Error Cases ===")
        # Test invalid URL
        await test_endpoint(
            session,
            "md",
            "not_a_real_url",
            token,
            expected_status=500
        )

        # Test invalid filter type
        await test_endpoint(
            session,
            "md",
            "example.com",
            token,
            {"f": "invalid"},
            expected_status=422
        )

        # Test LLM without query (should fail per your server logic)
        await test_endpoint(
            session,
            "llm",
            "example.com",
            token,
            expected_status=400
        )

        print("\nAll tests completed!")
||||
|
||||
# Script entry point: run the full API smoke-test suite against a server
# expected to be listening on http://localhost:8000.
if __name__ == "__main__":
    asyncio.run(run_tests())
|
||||
Reference in New Issue
Block a user