feat: Add HTTP-only crawling endpoints and related models

- Introduced HTTPCrawlRequest and HTTPCrawlRequestWithHooks models for HTTP-only crawling.
- Implemented /crawl/http and /crawl/http/stream endpoints for fast, lightweight crawling without browser rendering.
- Enhanced server.py to handle HTTP crawl requests and streaming responses.
- Updated utils.py to disable memory wait timeout for testing.
- Expanded API documentation to include new HTTP crawling features.
- Added tests for HTTP crawling endpoints, including error handling and streaming responses.
2025-10-15 17:45:58 +08:00
parent aebf5a3694
commit 674d0741da
8 changed files with 1091 additions and 45 deletions
--- a/deploy/docker/schemas.py
+++ b/deploy/docker/schemas.py
@@ -123,6 +123,34 @@ class CrawlRequestWithHooks(CrawlRequest):
    )


+class HTTPCrawlRequest(BaseModel):
+    """Request body for the HTTP-only crawling endpoints: 1-100 URLs plus optional config dicts and a dispatcher choice."""
+    # Only ``urls`` is required; every other field falls back to its declared default.
+    urls: List[str] = Field(min_length=1, max_length=100, description="List of URLs to crawl")
+    http_config: Optional[Dict] = Field(  # omitted -> fresh empty dict per request
+        default_factory=dict, 
+        description="HTTP crawler configuration (method, headers, timeout, etc.)"
+    )
+    crawler_config: Optional[Dict] = Field(  # omitted -> fresh empty dict per request
+        default_factory=dict,
+        description="Crawler run configuration (extraction, filtering, etc.)"
+    )
+    
+    # Dispatcher selection (same as browser crawling); the field itself defaults to None.
+    dispatcher: Optional[DispatcherType] = Field(
+        None, 
+        description="Dispatcher type to use. Defaults to memory_adaptive if not specified."
+    )
+
+
+class HTTPCrawlRequestWithHooks(HTTPCrawlRequest):
+    """HTTPCrawlRequest extended with an optional ``hooks`` field (a HookConfig, default None)."""
+
+    hooks: Optional[HookConfig] = Field(
+        default=None, description="Optional user-provided hook functions"
+    )
+
+
 class MarkdownRequest(BaseModel):
    """Request body for the /md endpoint."""