Fix redirected_url containing raw HTML content for raw: URLs
When using raw: URLs without a base_url, redirected_url was incorrectly set to the entire raw HTML string (potentially 300KB+) instead of None.

Changes:
- async_crawler_strategy.py: Don't fall back to url for raw:/file:// URLs in the fast path, browser path, and HTTP strategy
- async_crawler_strategy.py: Skip the page.url assignment for local content (it would return "about:blank")
- async_webcrawler.py: Don't fall back to url for raw: URLs in the crawl result and cached result paths
- Add a comprehensive test suite for redirected_url handling
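As a quick illustration of the intended behavior after this change, here is a minimal sketch mirroring the new tests below; it assumes only the public crawl4ai API already used in those tests (AsyncWebCrawler, CrawlerRunConfig, arun, and the raw: prefix):

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    html = "<html><body><div id='test'>Test Content</div></body></html>"
    async with AsyncWebCrawler() as crawler:
        # Without base_url: redirected_url is now None rather than the
        # (potentially 300KB+) raw HTML string.
        result = await crawler.arun(f"raw:{html}", config=CrawlerRunConfig())
        assert result.redirected_url is None

        # With base_url: redirected_url echoes the provided base_url.
        result = await crawler.arun(
            f"raw:{html}",
            config=CrawlerRunConfig(base_url="https://example.com/page"),
        )
        assert result.redirected_url == "https://example.com/page"

asyncio.run(main())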
async_crawler_strategy.py

@@ -495,6 +495,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                     pdf_data=None,
                     mhtml_data=None,
                     get_delayed_content=None,
+                    # For raw:/file:// URLs, use base_url if provided; don't fall back to the raw content
+                    redirected_url=config.base_url,
                 )
             else:
                 raise ValueError(
@@ -683,7 +685,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
 
                 await page.set_content(html_content, wait_until=config.wait_until)
                 response = None
-                redirected_url = config.base_url or url
+                # For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string
+                redirected_url = config.base_url
                 status_code = 200
                 response_headers = {}
             else:
@@ -1042,8 +1045,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         ###
         # This ensures we capture the current page URL at the time we return the response,
         # which correctly reflects any JavaScript navigation that occurred.
+        # For raw:/file:// URLs, preserve the earlier redirected_url (config.base_url or None)
+        # instead of using page.url which would be "about:blank".
         ###
-        redirected_url = page.url  # Use current page URL to capture JS redirects
+        is_local_content = url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:")
+        if not is_local_content:
+            redirected_url = page.url  # Use current page URL to capture JS redirects
 
         # Return complete response
         return AsyncCrawlResponse(
@@ -2372,11 +2379,13 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
             status_code=200
         )
 
-    async def _handle_raw(self, content: str) -> AsyncCrawlResponse:
+    async def _handle_raw(self, content: str, base_url: str = None) -> AsyncCrawlResponse:
         return AsyncCrawlResponse(
             html=content,
             response_headers={},
-            status_code=200
+            status_code=200,
+            # For raw: URLs, use base_url if provided; don't fall back to the raw content
+            redirected_url=base_url
         )
 
 
@@ -2501,7 +2510,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
             # Don't use parsed.path - urlparse truncates at '#' which is common in CSS
             # Strip prefix directly: "raw://" (6 chars) or "raw:" (4 chars)
             raw_content = url[6:] if url.startswith("raw://") else url[4:]
-            return await self._handle_raw(raw_content)
+            return await self._handle_raw(raw_content, base_url=config.base_url)
         else:  # http or https
             return await self._handle_http(url, config)
 
async_webcrawler.py

@@ -434,7 +434,9 @@ class AsyncWebCrawler:
             )
 
             crawl_result.status_code = async_response.status_code
-            crawl_result.redirected_url = async_response.redirected_url or url
+            # For raw: URLs, don't fall back to the raw HTML string as redirected_url
+            is_raw_url = url.startswith("raw:") or url.startswith("raw://")
+            crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url)
             crawl_result.response_headers = async_response.response_headers
             crawl_result.downloaded_files = async_response.downloaded_files
             crawl_result.js_execution_result = js_execution_result
@@ -479,7 +481,9 @@ class AsyncWebCrawler:
                 cached_result.success = bool(html)
                 cached_result.session_id = getattr(
                     config, "session_id", None)
-                cached_result.redirected_url = cached_result.redirected_url or url
+                # For raw: URLs, don't fall back to the raw HTML string as redirected_url
+                is_raw_url = url.startswith("raw:") or url.startswith("raw://")
+                cached_result.redirected_url = cached_result.redirected_url or (None if is_raw_url else url)
                 return CrawlResultContainer(cached_result)
 
         except Exception as e:

tests/test_raw_html_redirected_url.py (new file, 299 lines)

@@ -0,0 +1,299 @@
"""
|
||||||
|
Tests for redirected_url handling with raw: URLs.
|
||||||
|
|
||||||
|
This test file verifies the fix for the issue where redirected_url was incorrectly
|
||||||
|
set to the entire raw HTML content (potentially 300KB+) instead of None or base_url.
|
||||||
|
|
||||||
|
Issue: In raw: mode, async_crawler_strategy.py was setting redirected_url = config.base_url or url,
|
||||||
|
which fell back to the raw HTML string when base_url wasn't provided.
|
||||||
|
"""
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pytest
|
||||||
|
HAS_PYTEST = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_PYTEST = False
|
||||||
|
# Create a dummy decorator
|
||||||
|
class pytest:
|
||||||
|
class mark:
|
||||||
|
@staticmethod
|
||||||
|
def asyncio(fn):
|
||||||
|
return fn
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Core fix tests: redirected_url should NOT be the raw HTML string
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_raw_html_redirected_url_is_none_without_base_url():
|
||||||
|
"""Test that redirected_url is None for raw: URLs when no base_url is provided."""
|
||||||
|
html = "<html><body><div id='test'>Test Content</div></body></html>"
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
config = CrawlerRunConfig()
|
||||||
|
result = await crawler.arun(f"raw:{html}", config=config)
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
# Key assertion: redirected_url should be None, NOT the raw HTML string
|
||||||
|
assert result.redirected_url is None, (
|
||||||
|
f"redirected_url should be None for raw: URLs without base_url, "
|
||||||
|
f"but got: {result.redirected_url[:100] if result.redirected_url else None}..."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_raw_html_redirected_url_not_huge():
|
||||||
|
"""Test that redirected_url is not a huge string (the raw HTML content)."""
|
||||||
|
# Create a large HTML (100KB+)
|
||||||
|
items = "".join([f'<div class="item">Item {i} with some content</div>\n' for i in range(2000)])
|
||||||
|
large_html = f"<html><body>{items}</body></html>"
|
||||||
|
assert len(large_html) > 100000, "Test HTML should be >100KB"
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
config = CrawlerRunConfig()
|
||||||
|
result = await crawler.arun(f"raw:{large_html}", config=config)
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
# Key assertion: redirected_url should NOT be the huge HTML string
|
||||||
|
if result.redirected_url is not None:
|
||||||
|
assert len(result.redirected_url) < 1000, (
|
||||||
|
f"redirected_url should not be the raw HTML! "
|
||||||
|
f"Got {len(result.redirected_url)} chars: {result.redirected_url[:100]}..."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_raw_html_with_base_url_sets_redirected_url():
|
||||||
|
"""Test that redirected_url is set to base_url when provided."""
|
||||||
|
html = "<html><body><div id='test'>Test Content</div></body></html>"
|
||||||
|
base_url = "https://example.com/page"
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
config = CrawlerRunConfig(base_url=base_url)
|
||||||
|
result = await crawler.arun(f"raw:{html}", config=config)
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
# Key assertion: redirected_url should be the base_url
|
||||||
|
assert result.redirected_url == base_url, (
|
||||||
|
f"redirected_url should be '{base_url}', got: {result.redirected_url}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_raw_double_slash_prefix_redirected_url():
|
||||||
|
"""Test redirected_url handling with raw:// prefix."""
|
||||||
|
html = "<html><body>Content</body></html>"
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
config = CrawlerRunConfig()
|
||||||
|
result = await crawler.arun(f"raw://{html}", config=config)
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
# Should be None, not the HTML
|
||||||
|
assert result.redirected_url is None
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Browser path tests (with js_code, screenshot, etc.)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_raw_html_browser_path_redirected_url_none():
|
||||||
|
"""Test redirected_url is None for raw: URLs in browser path (with js_code)."""
|
||||||
|
html = "<html><body><div id='test'>Original</div></body></html>"
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
js_code="document.getElementById('test').innerText = 'Modified'"
|
||||||
|
)
|
||||||
|
result = await crawler.arun(f"raw:{html}", config=config)
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
assert "Modified" in result.html
|
||||||
|
# Key assertion: even with browser path, redirected_url should be None
|
||||||
|
assert result.redirected_url is None, (
|
||||||
|
f"redirected_url should be None, got: {result.redirected_url}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_raw_html_browser_path_with_base_url():
|
||||||
|
"""Test redirected_url is base_url for raw: URLs in browser path."""
|
||||||
|
html = "<html><body><div id='test'>Original</div></body></html>"
|
||||||
|
base_url = "https://mysite.com/processed"
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
base_url=base_url,
|
||||||
|
js_code="document.getElementById('test').innerText = 'Modified'"
|
||||||
|
)
|
||||||
|
result = await crawler.arun(f"raw:{html}", config=config)
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
assert "Modified" in result.html
|
||||||
|
assert result.redirected_url == base_url
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_raw_html_screenshot_redirected_url():
|
||||||
|
"""Test redirected_url with screenshot (browser path)."""
|
||||||
|
html = "<html><body><h1>Screenshot Test</h1></body></html>"
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
config = CrawlerRunConfig(screenshot=True)
|
||||||
|
result = await crawler.arun(f"raw:{html}", config=config)
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
assert result.screenshot is not None
|
||||||
|
# redirected_url should still be None
|
||||||
|
assert result.redirected_url is None
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Compatibility tests: HTTP URLs should still work correctly
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_http_url_redirected_url_still_works():
|
||||||
|
"""Ensure HTTP URLs still set redirected_url correctly."""
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun("https://example.com")
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
# For HTTP URLs, redirected_url should be the final URL (or original if no redirect)
|
||||||
|
assert result.redirected_url is not None
|
||||||
|
assert "example.com" in result.redirected_url
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_http_url_with_redirect_preserves_redirected_url():
|
||||||
|
"""Test that HTTP redirects still capture the final URL."""
|
||||||
|
# httpbin.org/redirect-to redirects to the specified URL
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
# Use a URL that redirects
|
||||||
|
result = await crawler.arun("https://httpbin.org/redirect-to?url=https://example.com")
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
# Should capture the final redirected URL
|
||||||
|
assert result.redirected_url is not None
|
||||||
|
assert "example.com" in result.redirected_url
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Edge cases
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_raw_html_with_url_like_content():
|
||||||
|
"""Test raw HTML containing URLs doesn't confuse redirected_url."""
|
||||||
|
html = """
|
||||||
|
<html><body>
|
||||||
|
<a href="https://example.com">Link</a>
|
||||||
|
<p>Visit https://google.com for more</p>
|
||||||
|
<div>raw:https://fake.com</div>
|
||||||
|
</body></html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(f"raw:{html}")
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
# redirected_url should be None, not any URL from the content
|
||||||
|
assert result.redirected_url is None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_raw_html_empty_base_url():
|
||||||
|
"""Test raw HTML with empty string base_url."""
|
||||||
|
html = "<html><body>Content</body></html>"
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
config = CrawlerRunConfig(base_url="")
|
||||||
|
result = await crawler.arun(f"raw:{html}", config=config)
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
# Empty string is falsy, so redirected_url should be None
|
||||||
|
assert result.redirected_url is None or result.redirected_url == ""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_raw_html_process_in_browser_redirected_url():
|
||||||
|
"""Test redirected_url with process_in_browser=True."""
|
||||||
|
html = "<html><body>Test</body></html>"
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
config = CrawlerRunConfig(process_in_browser=True)
|
||||||
|
result = await crawler.arun(f"raw:{html}", config=config)
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
assert result.redirected_url is None
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Regression test: specific issue scenario
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_regression_321kb_html_redirected_url():
|
||||||
|
"""
|
||||||
|
Regression test for the specific issue:
|
||||||
|
- raw:{321KB HTML} should NOT have redirected_url = "raw:{321KB HTML}"
|
||||||
|
- This was causing massive memory/logging issues
|
||||||
|
"""
|
||||||
|
# Create ~321KB of HTML content
|
||||||
|
content = "X" * 300000 # ~300KB of content
|
||||||
|
html = f"<html><body><div>{content}</div></body></html>"
|
||||||
|
assert len(html) > 300000, "Should be >300KB"
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(f"raw:{html}")
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
|
||||||
|
# The bug was: redirected_url = "raw:{321KB HTML}"
|
||||||
|
# After fix: redirected_url = None
|
||||||
|
assert result.redirected_url is None, (
|
||||||
|
"REGRESSION: redirected_url contains the raw HTML! "
|
||||||
|
f"Length: {len(result.redirected_url) if result.redirected_url else 0}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
async def run_tests():
|
||||||
|
tests = [
|
||||||
|
("redirected_url None without base_url", test_raw_html_redirected_url_is_none_without_base_url),
|
||||||
|
("redirected_url not huge", test_raw_html_redirected_url_not_huge),
|
||||||
|
("redirected_url with base_url", test_raw_html_with_base_url_sets_redirected_url),
|
||||||
|
("raw:// prefix", test_raw_double_slash_prefix_redirected_url),
|
||||||
|
("browser path None", test_raw_html_browser_path_redirected_url_none),
|
||||||
|
("browser path with base_url", test_raw_html_browser_path_with_base_url),
|
||||||
|
("HTTP URL still works", test_http_url_redirected_url_still_works),
|
||||||
|
("321KB regression", test_regression_321kb_html_redirected_url),
|
||||||
|
]
|
||||||
|
|
||||||
|
passed = 0
|
||||||
|
failed = 0
|
||||||
|
|
||||||
|
for name, test_fn in tests:
|
||||||
|
print(f"\n=== {name} ===")
|
||||||
|
try:
|
||||||
|
await test_fn()
|
||||||
|
print(f"PASSED")
|
||||||
|
passed += 1
|
||||||
|
except Exception as e:
|
||||||
|
print(f"FAILED: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
failed += 1
|
||||||
|
|
||||||
|
print(f"\n{'='*50}")
|
||||||
|
print(f"Results: {passed} passed, {failed} failed")
|
||||||
|
return failed == 0
|
||||||
|
|
||||||
|
import sys
|
||||||
|
success = asyncio.run(run_tests())
|
||||||
|
sys.exit(0 if success else 1)
|
||||||