From 27af4cc27bafa5bfa60849c5c124eb3ae47ec987 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Martins?=
 <11438285+jl-martins@users.noreply.github.com>
Date: Sat, 15 Feb 2025 15:34:59 +0000
Subject: [PATCH] Fix "raw://" URL parsing logic

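Previously the slice condition `url[:4] == "raw:"` was true for both
prefixes, so a "raw://" URL kept a stray leading "//" in the extracted
HTML; the fallback `url[7:]` was unreachable and off by one anyway
("raw://" is 6 characters long). Check for "raw://" first and strip
exactly 6 characters, otherwise strip the 4-character "raw:" prefix.

Closes https://github.com/unclecode/crawl4ai/issues/686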
---
 crawl4ai/async_crawler_strategy.py            |  4 +--
 tests/20241401/test_async_crawler_strategy.py | 25 +++++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 62ee4c65..5e3c2519 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1231,9 +1231,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 get_delayed_content=None,
             )
 
-        elif url.startswith("raw:") or url.startswith("raw://"):
+        elif url.startswith("raw:"):
             # Process raw HTML content
-            raw_html = url[4:] if url[:4] == "raw:" else url[7:]
+            raw_html = url[6:] if url.startswith("raw://") else url[4:]
             html = raw_html
             if config.screenshot:
                 screenshot_data = await self._generate_screenshot_from_html(html)
diff --git a/tests/20241401/test_async_crawler_strategy.py b/tests/20241401/test_async_crawler_strategy.py
index 68fe4a88..8426fe0a 100644
--- a/tests/20241401/test_async_crawler_strategy.py
+++ b/tests/20241401/test_async_crawler_strategy.py
@@ -15,6 +15,24 @@ CRAWL4AI_HOME_DIR = Path(os.path.expanduser("~")).joinpath(".crawl4ai")
 if not CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").exists():
     CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").mkdir(parents=True)
 
+@pytest.fixture
+def basic_html() -> str:
+    return """
+    <html lang="en">
+    <head>
+        <title>Basic HTML</title>
+    </head>
+    <body>
+        <h1>Main Heading</h1>
+        <main>
+            <div class="container">
+                <p>Basic HTML document for testing purposes.</p>
+            </div>
+        </main>
+    </body>
+    </html>
+    """
+
 # Test Config Files
 @pytest.fixture
 def basic_browser_config():
@@ -325,6 +343,13 @@ async def test_stealth_mode(crawler_strategy):
     )
     assert response.status_code == 200
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("prefix", ("raw:", "raw://"))
+async def test_raw_urls(crawler_strategy, basic_html, prefix):
+    url = f"{prefix}{basic_html}"
+    response = await crawler_strategy.crawl(url, CrawlerRunConfig())
+    assert response.html == basic_html
+
 # Error Handling Tests  
 @pytest.mark.asyncio
 async def test_invalid_url():