diff --git a/deploy/docker/api.py b/deploy/docker/api.py index b54bae65..58d8c01f 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -65,7 +65,7 @@ async def handle_llm_qa( ) -> str: """Process QA using LLM with crawled content as context.""" try: - if not url.startswith(('http://', 'https://')): + if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")): url = 'https://' + url # Extract base URL by finding last '?q=' occurrence last_q_index = url.rfind('?q=') @@ -191,7 +191,7 @@ async def handle_markdown_request( detail=error_msg ) decoded_url = unquote(url) - if not decoded_url.startswith(('http://', 'https://')): + if not decoded_url.startswith(('http://', 'https://')) and not decoded_url.startswith(("raw:", "raw://")): decoded_url = 'https://' + decoded_url if filter_type == FilterType.RAW: @@ -328,7 +328,7 @@ async def create_new_task( ) -> JSONResponse: """Create and initialize a new task.""" decoded_url = unquote(input_path) - if not decoded_url.startswith(('http://', 'https://')): + if not decoded_url.startswith(('http://', 'https://')) and not decoded_url.startswith(("raw:", "raw://")): decoded_url = 'https://' + decoded_url from datetime import datetime @@ -428,7 +428,7 @@ async def handle_crawl_request( peak_mem_mb = start_mem_mb try: - urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls] + urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls] browser_config = BrowserConfig.load(browser_config) crawler_config = CrawlerRunConfig.load(crawler_config) diff --git a/deploy/docker/server.py b/deploy/docker/server.py index 12ebbb53..57fd3d6d 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -237,9 +237,9 @@ async def get_markdown( body: MarkdownRequest, _td: Dict = Depends(token_dep), ): - if not body.url.startswith(("http://", "https://")): + if not body.url.startswith(("http://", "https://")) and not body.url.startswith(("raw:", "raw://")): raise HTTPException( - 400, "URL must be absolute and start with http/https") + 400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)") markdown = await handle_markdown_request( body.url, body.f, body.q, body.c, config, body.provider ) @@ -401,7 +401,7 @@ async def llm_endpoint( ): if not q: raise HTTPException(400, "Query parameter 'q' is required") - if not url.startswith(("http://", "https://")): + if not url.startswith(("http://", "https://")) and not url.startswith(("raw:", "raw://")): url = "https://" + url answer = await handle_llm_qa(url, q, config) return JSONResponse({"answer": answer}) diff --git a/tests/docker/simple_api_test.py b/tests/docker/simple_api_test.py index 0a966d5e..10fb2320 100644 --- a/tests/docker/simple_api_test.py +++ b/tests/docker/simple_api_test.py @@ -168,7 +168,7 @@ class SimpleApiTester: print("\n=== CORE APIs ===") test_url = "https://example.com" - + test_raw_html_url = "raw://