From 624e34164d97cac740bdc03b817d4bed0b772db3 Mon Sep 17 00:00:00 2001 From: unclecode Date: Wed, 24 Dec 2025 04:31:57 +0000 Subject: [PATCH] Fix: HTTP strategy raw: URL parsing truncates at # character The AsyncHTTPCrawlerStrategy.crawl() method used urlparse() to extract content from raw: URLs. This caused HTML with CSS color codes like #eee to be truncated because # is treated as a URL fragment delimiter. Before: raw:body{background:#eee} -> parsed.path = 'body{background:' After: raw:body{background:#eee} -> raw_content = 'body{background:#eee}' Fix: Strip the raw: or raw:// prefix directly instead of using urlparse, matching how the browser strategy handles it. --- crawl4ai/async_crawler_strategy.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 4a85782f..fe13243b 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -2475,7 +2475,10 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): if scheme == 'file': return await self._handle_file(parsed.path) elif scheme == 'raw': - return await self._handle_raw(parsed.path) + # Don't use parsed.path - urlparse truncates at '#' which is common in CSS + # Strip prefix directly: "raw://" (6 chars) or "raw:" (4 chars) + raw_content = url[6:] if url.startswith("raw://") else url[4:] + return await self._handle_raw(raw_content) else: # http or https return await self._handle_http(url, config)