Merge pull request #752 from jl-martins/fix-raw-url-parsing

Fix `raw://` URL parsing logic (refs issue #1118).
2025-06-03 11:10:29 +02:00
parent 3b766e1aac 58c1e17170
commit 5ce3e682f3
2 changed files with 27 additions and 2 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -466,9 +466,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                console_messages=captured_console,
            )

-        elif url.startswith("raw:") or url.startswith("raw://"):
+        elif url.startswith("raw:"):
            # Process raw HTML content
-            raw_html = url[4:] if url[:4] == "raw:" else url[7:]
+            raw_html = url[6:] if url.startswith("raw://") else url[4:]
            html = raw_html
            if config.screenshot:
                screenshot_data = await self._generate_screenshot_from_html(html)
--- a/tests/general/test_async_crawler_strategy.py
+++ b/tests/general/test_async_crawler_strategy.py
@@ -15,6 +15,24 @@ CRAWL4AI_HOME_DIR = Path(os.path.expanduser("~")).joinpath(".crawl4ai")
 if not CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").exists():
    CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").mkdir(parents=True)

+@pytest.fixture
+def basic_html():
+    return """
+    <html lang="en">
+    <head>
+        <title>Basic HTML</title>
+    </head>
+    <body>
+        <h1>Main Heading</h1>
+        <main>
+            <div class="container">
+                <p>Basic HTML document for testing purposes.</p>
+            </div>
+        </main>
+    </body>
+    </html>
+    """
+
 # Test Config Files
 @pytest.fixture
 def basic_browser_config():
@@ -325,6 +343,13 @@ async def test_stealth_mode(crawler_strategy):
    )
    assert response.status_code == 200

+@pytest.mark.asyncio
+@pytest.mark.parametrize("prefix", ("raw:", "raw://"))
+async def test_raw_urls(crawler_strategy, basic_html, prefix):
+    url = f"{prefix}{basic_html}"
+    response = await crawler_strategy.crawl(url, CrawlerRunConfig())
+    assert response.html == basic_html
+
 # Error Handling Tests  
 @pytest.mark.asyncio
 async def test_invalid_url():