diff --git a/deploy/docker/api.py b/deploy/docker/api.py index b54bae65..58d8c01f 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -65,7 +65,7 @@ async def handle_llm_qa( ) -> str: """Process QA using LLM with crawled content as context.""" try: - if not url.startswith(('http://', 'https://')): + if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")): url = 'https://' + url # Extract base URL by finding last '?q=' occurrence last_q_index = url.rfind('?q=') @@ -191,7 +191,7 @@ async def handle_markdown_request( detail=error_msg ) decoded_url = unquote(url) - if not decoded_url.startswith(('http://', 'https://')): + if not decoded_url.startswith(('http://', 'https://')) and not decoded_url.startswith(("raw:", "raw://")): decoded_url = 'https://' + decoded_url if filter_type == FilterType.RAW: @@ -328,7 +328,7 @@ async def create_new_task( ) -> JSONResponse: """Create and initialize a new task.""" decoded_url = unquote(input_path) - if not decoded_url.startswith(('http://', 'https://')): + if not decoded_url.startswith(('http://', 'https://')) and not decoded_url.startswith(("raw:", "raw://")): decoded_url = 'https://' + decoded_url from datetime import datetime @@ -428,7 +428,7 @@ async def handle_crawl_request( peak_mem_mb = start_mem_mb try: - urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls] + urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls] browser_config = BrowserConfig.load(browser_config) crawler_config = CrawlerRunConfig.load(crawler_config) diff --git a/deploy/docker/server.py b/deploy/docker/server.py index 12ebbb53..57fd3d6d 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -237,9 +237,9 @@ async def get_markdown( body: MarkdownRequest, _td: Dict = Depends(token_dep), ): - if not body.url.startswith(("http://", "https://")): + if not 
body.url.startswith(("http://", "https://")) and not body.url.startswith(("raw:", "raw://")): raise HTTPException( - 400, "URL must be absolute and start with http/https") + 400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)") markdown = await handle_markdown_request( body.url, body.f, body.q, body.c, config, body.provider ) @@ -401,7 +401,7 @@ async def llm_endpoint( ): if not q: raise HTTPException(400, "Query parameter 'q' is required") - if not url.startswith(("http://", "https://")): + if not url.startswith(("http://", "https://")) and not url.startswith(("raw:", "raw://")): url = "https://" + url answer = await handle_llm_qa(url, q, config) return JSONResponse({"answer": answer}) diff --git a/tests/docker/simple_api_test.py b/tests/docker/simple_api_test.py index 0a966d5e..10fb2320 100644 --- a/tests/docker/simple_api_test.py +++ b/tests/docker/simple_api_test.py @@ -168,7 +168,7 @@ class SimpleApiTester: print("\n=== CORE APIs ===") test_url = "https://example.com" - + test_raw_html_url = "raw://
<html><body><h1>Hello, World!</h1></body></html>
" # Test markdown endpoint md_payload = { "url": test_url, @@ -180,6 +180,17 @@ class SimpleApiTester: # print(result['data'].get('markdown', '')) self.print_result(result) + # Test markdown endpoint with raw HTML + raw_md_payload = { + "url": test_raw_html_url, + "f": "fit", + "q": "test query", + "c": "0" + } + result = self.test_post_endpoint("/md", raw_md_payload) + self.print_result(result) + + # Test HTML endpoint html_payload = {"url": test_url} result = self.test_post_endpoint("/html", html_payload) @@ -215,6 +226,15 @@ class SimpleApiTester: result = self.test_post_endpoint("/crawl", crawl_payload) self.print_result(result) + # Test crawl endpoint with raw HTML + crawl_payload = { + "urls": [test_raw_html_url], + "browser_config": {}, + "crawler_config": {} + } + result = self.test_post_endpoint("/crawl", crawl_payload) + self.print_result(result) + # Test config dump config_payload = {"code": "CrawlerRunConfig()"} result = self.test_post_endpoint("/config/dump", config_payload) diff --git a/tests/docker/test_docker.py b/tests/docker/test_docker.py index cf95671e..87723a70 100644 --- a/tests/docker/test_docker.py +++ b/tests/docker/test_docker.py @@ -74,7 +74,7 @@ async def test_direct_api(): # Make direct API call async with httpx.AsyncClient() as client: response = await client.post( - "http://localhost:8000/crawl", + "http://localhost:11235/crawl", json=request_data, timeout=300 ) @@ -100,13 +100,24 @@ async def test_direct_api(): async with httpx.AsyncClient() as client: response = await client.post( - "http://localhost:8000/crawl", + "http://localhost:11235/crawl", json=request_data ) assert response.status_code == 200 result = response.json() print("Structured extraction result:", result["success"]) + # Test 3: Raw HTML + request_data["urls"] = ["raw://
<html><body><h1>Hello, World!</h1><a href="https://example.com">Example</a></body></html>"] + async with httpx.AsyncClient() as client: + response = await client.post( + "http://localhost:11235/crawl", + json=request_data + ) + assert response.status_code == 200 + result = response.json() + print("Raw HTML result:", result["success"]) + # Test 3: Get schema # async with httpx.AsyncClient() as client: # response = await client.get("http://localhost:8000/schema") @@ -118,7 +129,7 @@ async def test_with_client(): """Test using the Crawl4AI Docker client SDK""" print("\n=== Testing Client SDK ===") - async with Crawl4aiDockerClient(verbose=True) as client: + async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client: # Test 1: Basic crawl browser_config = BrowserConfig(headless=True) crawler_config = CrawlerRunConfig(