fix: Check for raw: and raw:// URLs before auto-appending https:// prefix

- Add raw HTML URL validation alongside http/https checks
- Fix URL preprocessing logic to handle raw: and raw:// prefixes
- Update error message and add comprehensive test cases
2025-08-11 22:10:53 +05:30
parent f0ce7b2710
commit f30811b524
4 changed files with 42 additions and 11 deletions
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -65,7 +65,7 @@ async def handle_llm_qa(
 ) -> str:
    """Process QA using LLM with crawled content as context."""
    try:
-        if not url.startswith(('http://', 'https://')):
+        if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")):
            url = 'https://' + url
        # Extract base URL by finding last '?q=' occurrence
        last_q_index = url.rfind('?q=')
@@ -191,7 +191,7 @@ async def handle_markdown_request(
                    detail=error_msg
                )
        decoded_url = unquote(url)
-        if not decoded_url.startswith(('http://', 'https://')):
+        if not decoded_url.startswith(('http://', 'https://')) and not decoded_url.startswith(("raw:", "raw://")):
            decoded_url = 'https://' + decoded_url

        if filter_type == FilterType.RAW:
@@ -328,7 +328,7 @@ async def create_new_task(
 ) -> JSONResponse:
    """Create and initialize a new task."""
    decoded_url = unquote(input_path)
-    if not decoded_url.startswith(('http://', 'https://')):
+    if not decoded_url.startswith(('http://', 'https://')) and not decoded_url.startswith(("raw:", "raw://")):
        decoded_url = 'https://' + decoded_url

    from datetime import datetime
@@ -428,7 +428,7 @@ async def handle_crawl_request(
    peak_mem_mb = start_mem_mb
    
    try:
-        urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls]
+        urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
        browser_config = BrowserConfig.load(browser_config)
        crawler_config = CrawlerRunConfig.load(crawler_config)

--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -237,9 +237,9 @@ async def get_markdown(
    body: MarkdownRequest,
    _td: Dict = Depends(token_dep),
 ):
-    if not body.url.startswith(("http://", "https://")):
+    if not body.url.startswith(("http://", "https://")) and not body.url.startswith(("raw:", "raw://")):
        raise HTTPException(
-            400, "URL must be absolute and start with http/https")
+            400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
    markdown = await handle_markdown_request(
        body.url, body.f, body.q, body.c, config, body.provider
    )
@@ -401,7 +401,7 @@ async def llm_endpoint(
 ):
    if not q:
        raise HTTPException(400, "Query parameter 'q' is required")
-    if not url.startswith(("http://", "https://")):
+    if not url.startswith(("http://", "https://")) and not url.startswith(("raw:", "raw://")):
        url = "https://" + url
    answer = await handle_llm_qa(url, q, config)
    return JSONResponse({"answer": answer})
--- a/tests/docker/simple_api_test.py
+++ b/tests/docker/simple_api_test.py
@@ -168,7 +168,7 @@ class SimpleApiTester:
        print("\n=== CORE APIs ===")
        
        test_url = "https://example.com"
-        
+        test_raw_html_url = "raw://<html><body><h1>Hello, World!</h1></body></html>"
        # Test markdown endpoint
        md_payload = {
            "url": test_url,
@@ -180,6 +180,17 @@ class SimpleApiTester:
        # print(result['data'].get('markdown', ''))
        self.print_result(result)
        
+        # Test markdown endpoint with raw HTML
+        raw_md_payload = {
+            "url": test_raw_html_url,
+            "f": "fit",
+            "q": "test query",
+            "c": "0"
+        }
+        result = self.test_post_endpoint("/md", raw_md_payload)
+        self.print_result(result)
+        
+
        # Test HTML endpoint
        html_payload = {"url": test_url}
        result = self.test_post_endpoint("/html", html_payload)
@@ -215,6 +226,15 @@ class SimpleApiTester:
        result = self.test_post_endpoint("/crawl", crawl_payload)
        self.print_result(result)
        
+        # Test crawl endpoint with raw HTML
+        crawl_payload = {
+            "urls": [test_raw_html_url],
+            "browser_config": {},
+            "crawler_config": {}
+        }
+        result = self.test_post_endpoint("/crawl", crawl_payload)
+        self.print_result(result)
+
        # Test config dump
        config_payload = {"code": "CrawlerRunConfig()"}
        result = self.test_post_endpoint("/config/dump", config_payload)
--- a/tests/docker/test_docker.py
+++ b/tests/docker/test_docker.py
@@ -74,7 +74,7 @@ async def test_direct_api():
    # Make direct API call
    async with httpx.AsyncClient() as client:
        response = await client.post(
-            "http://localhost:8000/crawl",
+            "http://localhost:11235/crawl",
            json=request_data,
            timeout=300
        )
@@ -100,13 +100,24 @@ async def test_direct_api():

    async with httpx.AsyncClient() as client:
        response = await client.post(
-            "http://localhost:8000/crawl",
+            "http://localhost:11235/crawl",
            json=request_data
        )
        assert response.status_code == 200
        result = response.json()
        print("Structured extraction result:", result["success"])

+    # Test 3: Raw HTML
+    request_data["urls"] = ["raw://<html><body><h1>Hello, World!</h1><a href='https://example.com'>Example</a></body></html>"]
+    async with httpx.AsyncClient() as client:
+        response = await client.post(
+            "http://localhost:11235/crawl",
+            json=request_data
+        )
+        assert response.status_code == 200
+        result = response.json()
+        print("Raw HTML result:", result["success"])
+
    # Test 3: Get schema
    # async with httpx.AsyncClient() as client:
    #     response = await client.get("http://localhost:8000/schema")
@@ -118,7 +129,7 @@ async def test_with_client():
    """Test using the Crawl4AI Docker client SDK"""
    print("\n=== Testing Client SDK ===")
    
-    async with Crawl4aiDockerClient(verbose=True) as client:
+    async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
        # Test 1: Basic crawl
        browser_config = BrowserConfig(headless=True)
        crawler_config = CrawlerRunConfig(