Fix raw URL parsing logic to correctly handle "raw://" and "raw:" prefixes. REF #1118

2025-06-03 11:19:08 +02:00
parent 5ce3e682f3
commit cc95d3abd4
1 changed files with 7 additions and 1 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -466,8 +466,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                console_messages=captured_console,
            )

-        elif url.startswith("raw:"):
+        ##### 
+        # Since both "raw:" and "raw://" start with "raw:", the first condition is always true for both, so "raw://" will be sliced as "//...", which is incorrect.
+        # Fix: Check for "raw://" first, then "raw:"
+        # Also, the prefix "raw://" is actually 6 characters long, not 7, so it should be sliced accordingly: url[6:]
+        #####
+        elif url.startswith("raw://") or url.startswith("raw:"):
            # Process raw HTML content
+            # raw_html = url[4:] if url[:4] == "raw:" else url[7:]
            raw_html = url[6:] if url.startswith("raw://") else url[4:]
            html = raw_html
            if config.screenshot: