diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 121a3861..22e9c2ae 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -495,6 +495,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
pdf_data=None,
mhtml_data=None,
get_delayed_content=None,
+ # For raw:/file:// URLs, use base_url if provided; don't fall back to the raw content
+ redirected_url=config.base_url,
)
else:
raise ValueError(
@@ -683,7 +685,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
await page.set_content(html_content, wait_until=config.wait_until)
response = None
- redirected_url = config.base_url or url
+ # For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string
+ redirected_url = config.base_url
status_code = 200
response_headers = {}
else:
@@ -1040,10 +1043,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
captured_console.extend(final_messages)
###
- # This ensures we capture the current page URL at the time we return the response,
+ # This ensures we capture the current page URL at the time we return the response,
# which correctly reflects any JavaScript navigation that occurred.
+ # For raw:/file:// URLs, preserve the earlier redirected_url (config.base_url or None)
+ # instead of using page.url which would be "about:blank".
###
- redirected_url = page.url # Use current page URL to capture JS redirects
+ is_local_content = url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:")
+ if not is_local_content:
+ redirected_url = page.url # Use current page URL to capture JS redirects
# Return complete response
return AsyncCrawlResponse(
@@ -2372,11 +2379,13 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
status_code=200
)
- async def _handle_raw(self, content: str) -> AsyncCrawlResponse:
+ async def _handle_raw(self, content: str, base_url: str = None) -> AsyncCrawlResponse:
return AsyncCrawlResponse(
html=content,
response_headers={},
- status_code=200
+ status_code=200,
+ # For raw: URLs, use base_url if provided; don't fall back to the raw content
+ redirected_url=base_url
)
@@ -2501,7 +2510,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
# Don't use parsed.path - urlparse truncates at '#' which is common in CSS
# Strip prefix directly: "raw://" (6 chars) or "raw:" (4 chars)
raw_content = url[6:] if url.startswith("raw://") else url[4:]
- return await self._handle_raw(raw_content)
+ return await self._handle_raw(raw_content, base_url=config.base_url)
else: # http or https
return await self._handle_http(url, config)
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index ef03cb74..a3b63b2e 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -434,7 +434,9 @@ class AsyncWebCrawler:
)
crawl_result.status_code = async_response.status_code
- crawl_result.redirected_url = async_response.redirected_url or url
+ # For raw: URLs, don't fall back to the raw HTML string as redirected_url
+ is_raw_url = url.startswith("raw:") or url.startswith("raw://")
+ crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url)
crawl_result.response_headers = async_response.response_headers
crawl_result.downloaded_files = async_response.downloaded_files
crawl_result.js_execution_result = js_execution_result
@@ -479,7 +481,9 @@ class AsyncWebCrawler:
cached_result.success = bool(html)
cached_result.session_id = getattr(
config, "session_id", None)
- cached_result.redirected_url = cached_result.redirected_url or url
+ # For raw: URLs, don't fall back to the raw HTML string as redirected_url
+ is_raw_url = url.startswith("raw:") or url.startswith("raw://")
+ cached_result.redirected_url = cached_result.redirected_url or (None if is_raw_url else url)
return CrawlResultContainer(cached_result)
except Exception as e:
diff --git a/tests/test_raw_html_redirected_url.py b/tests/test_raw_html_redirected_url.py
new file mode 100644
index 00000000..c7dd0623
--- /dev/null
+++ b/tests/test_raw_html_redirected_url.py
@@ -0,0 +1,299 @@
+"""
+Tests for redirected_url handling with raw: URLs.
+
+This test file verifies the fix for the issue where redirected_url was incorrectly
+set to the entire raw HTML content (potentially 300KB+) instead of None or base_url.
+
+Issue: In raw: mode, async_crawler_strategy.py was setting redirected_url = config.base_url or url,
+which fell back to the raw HTML string when base_url wasn't provided.
+"""
+
+try:
+ import pytest
+ HAS_PYTEST = True
+except ImportError:
+ HAS_PYTEST = False
+ # Create a dummy decorator
+ class pytest:
+ class mark:
+ @staticmethod
+ def asyncio(fn):
+ return fn
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+
+# ============================================================================
+# Core fix tests: redirected_url should NOT be the raw HTML string
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_redirected_url_is_none_without_base_url():
+ """Test that redirected_url is None for raw: URLs when no base_url is provided."""
+    html = "<html><body><h1>Test Content</h1></body></html>"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig()
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ # Key assertion: redirected_url should be None, NOT the raw HTML string
+ assert result.redirected_url is None, (
+ f"redirected_url should be None for raw: URLs without base_url, "
+ f"but got: {result.redirected_url[:100] if result.redirected_url else None}..."
+ )
+
+
+@pytest.mark.asyncio
+async def test_raw_html_redirected_url_not_huge():
+ """Test that redirected_url is not a huge string (the raw HTML content)."""
+ # Create a large HTML (100KB+)
+    items = "".join([f'<div class="item">Item {i} with some content here</div>\n' for i in range(2000)])
+    large_html = f"<html><body>{items}</body></html>"
+ assert len(large_html) > 100000, "Test HTML should be >100KB"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig()
+ result = await crawler.arun(f"raw:{large_html}", config=config)
+
+ assert result.success
+ # Key assertion: redirected_url should NOT be the huge HTML string
+ if result.redirected_url is not None:
+ assert len(result.redirected_url) < 1000, (
+ f"redirected_url should not be the raw HTML! "
+ f"Got {len(result.redirected_url)} chars: {result.redirected_url[:100]}..."
+ )
+
+
+@pytest.mark.asyncio
+async def test_raw_html_with_base_url_sets_redirected_url():
+ """Test that redirected_url is set to base_url when provided."""
+    html = "<html><body><h1>Test Content</h1></body></html>"
+ base_url = "https://example.com/page"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(base_url=base_url)
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ # Key assertion: redirected_url should be the base_url
+ assert result.redirected_url == base_url, (
+ f"redirected_url should be '{base_url}', got: {result.redirected_url}"
+ )
+
+
+@pytest.mark.asyncio
+async def test_raw_double_slash_prefix_redirected_url():
+ """Test redirected_url handling with raw:// prefix."""
+ html = "Content"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig()
+ result = await crawler.arun(f"raw://{html}", config=config)
+
+ assert result.success
+ # Should be None, not the HTML
+ assert result.redirected_url is None
+
+
+# ============================================================================
+# Browser path tests (with js_code, screenshot, etc.)
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_browser_path_redirected_url_none():
+ """Test redirected_url is None for raw: URLs in browser path (with js_code)."""
+    html = "<div id='test'>Original</div>"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ js_code="document.getElementById('test').innerText = 'Modified'"
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "Modified" in result.html
+ # Key assertion: even with browser path, redirected_url should be None
+ assert result.redirected_url is None, (
+ f"redirected_url should be None, got: {result.redirected_url}"
+ )
+
+
+@pytest.mark.asyncio
+async def test_raw_html_browser_path_with_base_url():
+ """Test redirected_url is base_url for raw: URLs in browser path."""
+    html = "<div id='test'>Original</div>"
+ base_url = "https://mysite.com/processed"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ base_url=base_url,
+ js_code="document.getElementById('test').innerText = 'Modified'"
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "Modified" in result.html
+ assert result.redirected_url == base_url
+
+
+@pytest.mark.asyncio
+async def test_raw_html_screenshot_redirected_url():
+ """Test redirected_url with screenshot (browser path)."""
+    html = "<h1>Screenshot Test</h1>"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(screenshot=True)
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert result.screenshot is not None
+ # redirected_url should still be None
+ assert result.redirected_url is None
+
+
+# ============================================================================
+# Compatibility tests: HTTP URLs should still work correctly
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_http_url_redirected_url_still_works():
+ """Ensure HTTP URLs still set redirected_url correctly."""
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://example.com")
+
+ assert result.success
+ # For HTTP URLs, redirected_url should be the final URL (or original if no redirect)
+ assert result.redirected_url is not None
+ assert "example.com" in result.redirected_url
+
+
+@pytest.mark.asyncio
+async def test_http_url_with_redirect_preserves_redirected_url():
+ """Test that HTTP redirects still capture the final URL."""
+ # httpbin.org/redirect-to redirects to the specified URL
+ async with AsyncWebCrawler() as crawler:
+ # Use a URL that redirects
+ result = await crawler.arun("https://httpbin.org/redirect-to?url=https://example.com")
+
+ assert result.success
+ # Should capture the final redirected URL
+ assert result.redirected_url is not None
+ assert "example.com" in result.redirected_url
+
+
+# ============================================================================
+# Edge cases
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_with_url_like_content():
+ """Test raw HTML containing URLs doesn't confuse redirected_url."""
+    html = """
+    <html><body>
+    <a href="https://example.com/link">Link</a>
+    Visit https://google.com for more
+    raw:https://fake.com
+    </body></html>
+    """
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(f"raw:{html}")
+
+ assert result.success
+ # redirected_url should be None, not any URL from the content
+ assert result.redirected_url is None
+
+
+@pytest.mark.asyncio
+async def test_raw_html_empty_base_url():
+ """Test raw HTML with empty string base_url."""
+ html = "Content"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(base_url="")
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ # Empty string is falsy, so redirected_url should be None
+ assert result.redirected_url is None or result.redirected_url == ""
+
+
+@pytest.mark.asyncio
+async def test_raw_html_process_in_browser_redirected_url():
+ """Test redirected_url with process_in_browser=True."""
+ html = "Test"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(process_in_browser=True)
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert result.redirected_url is None
+
+
+# ============================================================================
+# Regression test: specific issue scenario
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_regression_321kb_html_redirected_url():
+ """
+ Regression test for the specific issue:
+ - raw:{321KB HTML} should NOT have redirected_url = "raw:{321KB HTML}"
+ - This was causing massive memory/logging issues
+ """
+ # Create ~321KB of HTML content
+ content = "X" * 300000 # ~300KB of content
+    html = f"<html><body><div>{content}</div></body></html>"
+ assert len(html) > 300000, "Should be >300KB"
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(f"raw:{html}")
+
+ assert result.success
+
+ # The bug was: redirected_url = "raw:{321KB HTML}"
+ # After fix: redirected_url = None
+ assert result.redirected_url is None, (
+ "REGRESSION: redirected_url contains the raw HTML! "
+ f"Length: {len(result.redirected_url) if result.redirected_url else 0}"
+ )
+
+
+if __name__ == "__main__":
+ async def run_tests():
+ tests = [
+ ("redirected_url None without base_url", test_raw_html_redirected_url_is_none_without_base_url),
+ ("redirected_url not huge", test_raw_html_redirected_url_not_huge),
+ ("redirected_url with base_url", test_raw_html_with_base_url_sets_redirected_url),
+ ("raw:// prefix", test_raw_double_slash_prefix_redirected_url),
+ ("browser path None", test_raw_html_browser_path_redirected_url_none),
+ ("browser path with base_url", test_raw_html_browser_path_with_base_url),
+ ("HTTP URL still works", test_http_url_redirected_url_still_works),
+ ("321KB regression", test_regression_321kb_html_redirected_url),
+ ]
+
+ passed = 0
+ failed = 0
+
+ for name, test_fn in tests:
+ print(f"\n=== {name} ===")
+ try:
+ await test_fn()
+ print(f"PASSED")
+ passed += 1
+ except Exception as e:
+ print(f"FAILED: {e}")
+ import traceback
+ traceback.print_exc()
+ failed += 1
+
+ print(f"\n{'='*50}")
+ print(f"Results: {passed} passed, {failed} failed")
+ return failed == 0
+
+ import sys
+ success = asyncio.run(run_tests())
+ sys.exit(0 if success else 1)