Fix redirected_url containing raw HTML content for raw: URLs

When using raw: URLs without a base_url, redirected_url was incorrectly
set to the entire raw HTML string (potentially 300KB+) instead of None.

Changes:
- async_crawler_strategy.py: Don't fall back to url for raw:/file:// URLs
  in fast path, browser path, and HTTP strategy
- async_crawler_strategy.py: Skip page.url assignment for local content
  (would return "about:blank")
- async_webcrawler.py: Don't fall back to url for raw: URLs in crawl
  result and cached result paths
- Add comprehensive test suite for redirected_url handling
This commit is contained in:
unclecode
2026-01-20 00:31:12 +00:00
parent 857b1ed23b
commit 418bfcfd3b
3 changed files with 320 additions and 8 deletions

View File

@@ -495,6 +495,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
pdf_data=None,
mhtml_data=None,
get_delayed_content=None,
# For raw:/file:// URLs, use base_url if provided; don't fall back to the raw content
redirected_url=config.base_url,
)
else:
raise ValueError(
@@ -683,7 +685,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
await page.set_content(html_content, wait_until=config.wait_until)
response = None
redirected_url = config.base_url or url
# For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string
redirected_url = config.base_url
status_code = 200
response_headers = {}
else:
@@ -1042,7 +1045,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
###
# This ensures we capture the current page URL at the time we return the response,
# which correctly reflects any JavaScript navigation that occurred.
# For raw:/file:// URLs, preserve the earlier redirected_url (config.base_url or None)
# instead of using page.url which would be "about:blank".
###
is_local_content = url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:")
if not is_local_content:
redirected_url = page.url # Use current page URL to capture JS redirects
# Return complete response
@@ -2372,11 +2379,13 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
status_code=200
)
async def _handle_raw(self, content: str) -> AsyncCrawlResponse:
async def _handle_raw(self, content: str, base_url: str = None) -> AsyncCrawlResponse:
return AsyncCrawlResponse(
html=content,
response_headers={},
status_code=200
status_code=200,
# For raw: URLs, use base_url if provided; don't fall back to the raw content
redirected_url=base_url
)
@@ -2501,7 +2510,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
# Don't use parsed.path - urlparse truncates at '#' which is common in CSS
# Strip prefix directly: "raw://" (6 chars) or "raw:" (4 chars)
raw_content = url[6:] if url.startswith("raw://") else url[4:]
return await self._handle_raw(raw_content)
return await self._handle_raw(raw_content, base_url=config.base_url)
else: # http or https
return await self._handle_http(url, config)

View File

@@ -434,7 +434,9 @@ class AsyncWebCrawler:
)
crawl_result.status_code = async_response.status_code
crawl_result.redirected_url = async_response.redirected_url or url
# For raw: URLs, don't fall back to the raw HTML string as redirected_url
is_raw_url = url.startswith("raw:") or url.startswith("raw://")
crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url)
crawl_result.response_headers = async_response.response_headers
crawl_result.downloaded_files = async_response.downloaded_files
crawl_result.js_execution_result = js_execution_result
@@ -479,7 +481,9 @@ class AsyncWebCrawler:
cached_result.success = bool(html)
cached_result.session_id = getattr(
config, "session_id", None)
cached_result.redirected_url = cached_result.redirected_url or url
# For raw: URLs, don't fall back to the raw HTML string as redirected_url
is_raw_url = url.startswith("raw:") or url.startswith("raw://")
cached_result.redirected_url = cached_result.redirected_url or (None if is_raw_url else url)
return CrawlResultContainer(cached_result)
except Exception as e:

View File

@@ -0,0 +1,299 @@
"""
Tests for redirected_url handling with raw: URLs.
This test file verifies the fix for the issue where redirected_url was incorrectly
set to the entire raw HTML content (potentially 300KB+) instead of None or base_url.
Issue: In raw: mode, async_crawler_strategy.py was setting redirected_url = config.base_url or url,
which fell back to the raw HTML string when base_url wasn't provided.
"""
try:
import pytest
HAS_PYTEST = True
except ImportError:
HAS_PYTEST = False
# Create a dummy decorator
class pytest:
class mark:
@staticmethod
def asyncio(fn):
return fn
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
# ============================================================================
# Core fix tests: redirected_url should NOT be the raw HTML string
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_redirected_url_is_none_without_base_url():
"""Test that redirected_url is None for raw: URLs when no base_url is provided."""
html = "<html><body><div id='test'>Test Content</div></body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig()
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
# Key assertion: redirected_url should be None, NOT the raw HTML string
assert result.redirected_url is None, (
f"redirected_url should be None for raw: URLs without base_url, "
f"but got: {result.redirected_url[:100] if result.redirected_url else None}..."
)
@pytest.mark.asyncio
async def test_raw_html_redirected_url_not_huge():
"""Test that redirected_url is not a huge string (the raw HTML content)."""
# Create a large HTML (100KB+)
items = "".join([f'<div class="item">Item {i} with some content</div>\n' for i in range(2000)])
large_html = f"<html><body>{items}</body></html>"
assert len(large_html) > 100000, "Test HTML should be >100KB"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig()
result = await crawler.arun(f"raw:{large_html}", config=config)
assert result.success
# Key assertion: redirected_url should NOT be the huge HTML string
if result.redirected_url is not None:
assert len(result.redirected_url) < 1000, (
f"redirected_url should not be the raw HTML! "
f"Got {len(result.redirected_url)} chars: {result.redirected_url[:100]}..."
)
@pytest.mark.asyncio
async def test_raw_html_with_base_url_sets_redirected_url():
"""Test that redirected_url is set to base_url when provided."""
html = "<html><body><div id='test'>Test Content</div></body></html>"
base_url = "https://example.com/page"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(base_url=base_url)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
# Key assertion: redirected_url should be the base_url
assert result.redirected_url == base_url, (
f"redirected_url should be '{base_url}', got: {result.redirected_url}"
)
@pytest.mark.asyncio
async def test_raw_double_slash_prefix_redirected_url():
"""Test redirected_url handling with raw:// prefix."""
html = "<html><body>Content</body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig()
result = await crawler.arun(f"raw://{html}", config=config)
assert result.success
# Should be None, not the HTML
assert result.redirected_url is None
# ============================================================================
# Browser path tests (with js_code, screenshot, etc.)
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_browser_path_redirected_url_none():
"""Test redirected_url is None for raw: URLs in browser path (with js_code)."""
html = "<html><body><div id='test'>Original</div></body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code="document.getElementById('test').innerText = 'Modified'"
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "Modified" in result.html
# Key assertion: even with browser path, redirected_url should be None
assert result.redirected_url is None, (
f"redirected_url should be None, got: {result.redirected_url}"
)
@pytest.mark.asyncio
async def test_raw_html_browser_path_with_base_url():
"""Test redirected_url is base_url for raw: URLs in browser path."""
html = "<html><body><div id='test'>Original</div></body></html>"
base_url = "https://mysite.com/processed"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
base_url=base_url,
js_code="document.getElementById('test').innerText = 'Modified'"
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "Modified" in result.html
assert result.redirected_url == base_url
@pytest.mark.asyncio
async def test_raw_html_screenshot_redirected_url():
"""Test redirected_url with screenshot (browser path)."""
html = "<html><body><h1>Screenshot Test</h1></body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(screenshot=True)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert result.screenshot is not None
# redirected_url should still be None
assert result.redirected_url is None
# ============================================================================
# Compatibility tests: HTTP URLs should still work correctly
# ============================================================================
@pytest.mark.asyncio
async def test_http_url_redirected_url_still_works():
"""Ensure HTTP URLs still set redirected_url correctly."""
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com")
assert result.success
# For HTTP URLs, redirected_url should be the final URL (or original if no redirect)
assert result.redirected_url is not None
assert "example.com" in result.redirected_url
@pytest.mark.asyncio
async def test_http_url_with_redirect_preserves_redirected_url():
"""Test that HTTP redirects still capture the final URL."""
# httpbin.org/redirect-to redirects to the specified URL
async with AsyncWebCrawler() as crawler:
# Use a URL that redirects
result = await crawler.arun("https://httpbin.org/redirect-to?url=https://example.com")
assert result.success
# Should capture the final redirected URL
assert result.redirected_url is not None
assert "example.com" in result.redirected_url
# ============================================================================
# Edge cases
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_url_like_content():
"""Test raw HTML containing URLs doesn't confuse redirected_url."""
html = """
<html><body>
<a href="https://example.com">Link</a>
<p>Visit https://google.com for more</p>
<div>raw:https://fake.com</div>
</body></html>
"""
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(f"raw:{html}")
assert result.success
# redirected_url should be None, not any URL from the content
assert result.redirected_url is None
@pytest.mark.asyncio
async def test_raw_html_empty_base_url():
"""Test raw HTML with empty string base_url."""
html = "<html><body>Content</body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(base_url="")
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
# Empty string is falsy, so redirected_url should be None
assert result.redirected_url is None or result.redirected_url == ""
@pytest.mark.asyncio
async def test_raw_html_process_in_browser_redirected_url():
"""Test redirected_url with process_in_browser=True."""
html = "<html><body>Test</body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(process_in_browser=True)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert result.redirected_url is None
# ============================================================================
# Regression test: specific issue scenario
# ============================================================================
@pytest.mark.asyncio
async def test_regression_321kb_html_redirected_url():
"""
Regression test for the specific issue:
- raw:{321KB HTML} should NOT have redirected_url = "raw:{321KB HTML}"
- This was causing massive memory/logging issues
"""
# Create ~321KB of HTML content
content = "X" * 300000 # ~300KB of content
html = f"<html><body><div>{content}</div></body></html>"
assert len(html) > 300000, "Should be >300KB"
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(f"raw:{html}")
assert result.success
# The bug was: redirected_url = "raw:{321KB HTML}"
# After fix: redirected_url = None
assert result.redirected_url is None, (
"REGRESSION: redirected_url contains the raw HTML! "
f"Length: {len(result.redirected_url) if result.redirected_url else 0}"
)
if __name__ == "__main__":
async def run_tests():
tests = [
("redirected_url None without base_url", test_raw_html_redirected_url_is_none_without_base_url),
("redirected_url not huge", test_raw_html_redirected_url_not_huge),
("redirected_url with base_url", test_raw_html_with_base_url_sets_redirected_url),
("raw:// prefix", test_raw_double_slash_prefix_redirected_url),
("browser path None", test_raw_html_browser_path_redirected_url_none),
("browser path with base_url", test_raw_html_browser_path_with_base_url),
("HTTP URL still works", test_http_url_redirected_url_still_works),
("321KB regression", test_regression_321kb_html_redirected_url),
]
passed = 0
failed = 0
for name, test_fn in tests:
print(f"\n=== {name} ===")
try:
await test_fn()
print(f"PASSED")
passed += 1
except Exception as e:
print(f"FAILED: {e}")
import traceback
traceback.print_exc()
failed += 1
print(f"\n{'='*50}")
print(f"Results: {passed} passed, {failed} failed")
return failed == 0
import sys
success = asyncio.run(run_tests())
sys.exit(0 if success else 1)