diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 121a3861..22e9c2ae 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -495,6 +495,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): pdf_data=None, mhtml_data=None, get_delayed_content=None, + # For raw:/file:// URLs, use base_url if provided; don't fall back to the raw content + redirected_url=config.base_url, ) else: raise ValueError( @@ -683,7 +685,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await page.set_content(html_content, wait_until=config.wait_until) response = None - redirected_url = config.base_url or url + # For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string + redirected_url = config.base_url status_code = 200 response_headers = {} else: @@ -1040,10 +1043,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): captured_console.extend(final_messages) ### - # This ensures we capture the current page URL at the time we return the response, + # This ensures we capture the current page URL at the time we return the response, # which correctly reflects any JavaScript navigation that occurred. + # For raw:/file:// URLs, preserve the earlier redirected_url (config.base_url or None) + # instead of using page.url which would be "about:blank". 
### - redirected_url = page.url # Use current page URL to capture JS redirects + is_local_content = url.startswith(("file://", "raw:")) + if not is_local_content: + redirected_url = page.url # Use current page URL to capture JS redirects # Return complete response return AsyncCrawlResponse( @@ -2372,11 +2379,13 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): status_code=200 ) - async def _handle_raw(self, content: str) -> AsyncCrawlResponse: + async def _handle_raw(self, content: str, base_url: str = None) -> AsyncCrawlResponse: return AsyncCrawlResponse( html=content, response_headers={}, - status_code=200 + status_code=200, + # For raw: URLs, use base_url if provided; don't fall back to the raw content + redirected_url=base_url ) @@ -2501,7 +2510,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): # Don't use parsed.path - urlparse truncates at '#' which is common in CSS # Strip prefix directly: "raw://" (6 chars) or "raw:" (4 chars) raw_content = url[6:] if url.startswith("raw://") else url[4:] - return await self._handle_raw(raw_content) + return await self._handle_raw(raw_content, base_url=config.base_url) else: # http or https return await self._handle_http(url, config) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ef03cb74..a3b63b2e 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -434,7 +434,9 @@ class AsyncWebCrawler: ) crawl_result.status_code = async_response.status_code - crawl_result.redirected_url = async_response.redirected_url or url + # For raw: URLs, don't fall back to the raw HTML string as redirected_url + is_raw_url = url.startswith("raw:") + crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url) crawl_result.response_headers = async_response.response_headers crawl_result.downloaded_files = async_response.downloaded_files crawl_result.js_execution_result = 
js_execution_result @@ -479,7 +481,9 @@ class AsyncWebCrawler: cached_result.success = bool(html) cached_result.session_id = getattr( config, "session_id", None) - cached_result.redirected_url = cached_result.redirected_url or url + # For raw: URLs, don't fall back to the raw HTML string as redirected_url + is_raw_url = url.startswith("raw:") + cached_result.redirected_url = cached_result.redirected_url or (None if is_raw_url else url) return CrawlResultContainer(cached_result) except Exception as e:
diff --git a/tests/test_raw_html_redirected_url.py b/tests/test_raw_html_redirected_url.py
new file mode 100644
index 00000000..c7dd0623
--- /dev/null
+++ b/tests/test_raw_html_redirected_url.py
@@ -0,0 +1,311 @@
+"""
+Tests for redirected_url handling with raw: URLs.
+
+This test file verifies the fix for the issue where redirected_url was incorrectly
+set to the entire raw HTML content (potentially 300KB+) instead of None or base_url.
+
+Issue: In raw: mode, async_crawler_strategy.py was setting redirected_url = config.base_url or url,
+which fell back to the raw HTML string when base_url wasn't provided.
+"""
+
+try:
+    import pytest
+    HAS_PYTEST = True
+except ImportError:
+    HAS_PYTEST = False
+    # Minimal stand-in so the module still runs as a plain script without pytest
+    class pytest:
+        class mark:
+            @staticmethod
+            def asyncio(fn):
+                return fn
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+
+# ============================================================================
+# Core fix tests: redirected_url should NOT be the raw HTML string
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_redirected_url_is_none_without_base_url():
+    """Test that redirected_url is None for raw: URLs when no base_url is provided."""
+    html = "<html><body><div>Test Content</div></body></html>"
+
+    async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig()
+        result = await crawler.arun(f"raw:{html}", config=config)
+
+    assert result.success
+    # Key assertion: redirected_url should be None, NOT the raw HTML string
+    assert result.redirected_url is None, (
+        f"redirected_url should be None for raw: URLs without base_url, "
+        f"but got: {result.redirected_url[:100] if result.redirected_url else None}..."
+    )
+
+
+@pytest.mark.asyncio
+async def test_raw_html_redirected_url_not_huge():
+    """Test that redirected_url is not a huge string (the raw HTML content)."""
+    # Create a large HTML (100KB+)
+    items = "".join([f'<div class="item">Item {i} with some content</div>\n' for i in range(2000)])
+    large_html = f"<html><body>{items}</body></html>"
+    assert len(large_html) > 100000, "Test HTML should be >100KB"
+
+    async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig()
+        result = await crawler.arun(f"raw:{large_html}", config=config)
+
+    assert result.success
+    # Key assertion: redirected_url should NOT be the huge HTML string
+    if result.redirected_url is not None:
+        assert len(result.redirected_url) < 1000, (
+            f"redirected_url should not be the raw HTML! "
+            f"Got {len(result.redirected_url)} chars: {result.redirected_url[:100]}..."
+        )
+
+
+@pytest.mark.asyncio
+async def test_raw_html_with_base_url_sets_redirected_url():
+    """Test that redirected_url is set to base_url when provided."""
+    html = "<html><body><div>Test Content</div></body></html>"
+    base_url = "https://example.com/page"
+
+    async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig(base_url=base_url)
+        result = await crawler.arun(f"raw:{html}", config=config)
+
+    assert result.success
+    # Key assertion: redirected_url should be the base_url
+    assert result.redirected_url == base_url, (
+        f"redirected_url should be '{base_url}', got: {result.redirected_url}"
+    )
+
+
+@pytest.mark.asyncio
+async def test_raw_double_slash_prefix_redirected_url():
+    """Test redirected_url handling with raw:// prefix."""
+    html = "<html><body>Content</body></html>"
+
+    async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig()
+        result = await crawler.arun(f"raw://{html}", config=config)
+
+    assert result.success
+    # Should be None, not the HTML
+    assert result.redirected_url is None
+
+
+# ============================================================================
+# Browser path tests (with js_code, screenshot, etc.)
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_browser_path_redirected_url_none():
+    """Test redirected_url is None for raw: URLs in browser path (with js_code)."""
+    html = "<html><body><div id='test'>Original</div></body></html>"
+
+    async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig(
+            js_code="document.getElementById('test').innerText = 'Modified'"
+        )
+        result = await crawler.arun(f"raw:{html}", config=config)
+
+    assert result.success
+    assert "Modified" in result.html
+    # Key assertion: even with browser path, redirected_url should be None
+    assert result.redirected_url is None, (
+        f"redirected_url should be None, got: {result.redirected_url}"
+    )
+
+
+@pytest.mark.asyncio
+async def test_raw_html_browser_path_with_base_url():
+    """Test redirected_url is base_url for raw: URLs in browser path."""
+    html = "<html><body><div id='test'>Original</div></body></html>"
+    base_url = "https://mysite.com/processed"
+
+    async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig(
+            base_url=base_url,
+            js_code="document.getElementById('test').innerText = 'Modified'"
+        )
+        result = await crawler.arun(f"raw:{html}", config=config)
+
+    assert result.success
+    assert "Modified" in result.html
+    assert result.redirected_url == base_url
+
+
+@pytest.mark.asyncio
+async def test_raw_html_screenshot_redirected_url():
+    """Test redirected_url with screenshot (browser path)."""
+    html = "<html><body><h1>Screenshot Test</h1></body></html>"
+
+    async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig(screenshot=True)
+        result = await crawler.arun(f"raw:{html}", config=config)
+
+    assert result.success
+    assert result.screenshot is not None
+    # redirected_url should still be None
+    assert result.redirected_url is None
+
+
+# ============================================================================
+# Compatibility tests: HTTP URLs should still work correctly
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_http_url_redirected_url_still_works():
+    """Ensure HTTP URLs still set redirected_url correctly."""
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com")
+
+    assert result.success
+    # For HTTP URLs, redirected_url should be the final URL (or original if no redirect)
+    assert result.redirected_url is not None
+    assert "example.com" in result.redirected_url
+
+
+@pytest.mark.asyncio
+async def test_http_url_with_redirect_preserves_redirected_url():
+    """Test that HTTP redirects still capture the final URL."""
+    # httpbin.org/redirect-to redirects to the specified URL
+    async with AsyncWebCrawler() as crawler:
+        # Use a URL that redirects
+        result = await crawler.arun("https://httpbin.org/redirect-to?url=https://example.com")
+
+    assert result.success
+    # Should capture the final redirected URL
+    assert result.redirected_url is not None
+    assert "example.com" in result.redirected_url
+
+
+# ============================================================================
+# Edge cases
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_with_url_like_content():
+    """Test raw HTML containing URLs doesn't confuse redirected_url."""
+    html = """
+    <html><body>
+        <a href="https://example.com/page">Link</a>
+        <p>Visit https://google.com for more</p>
+        <div>raw:https://fake.com</div>
+    </body></html>
+    """
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(f"raw:{html}")
+
+    assert result.success
+    # redirected_url should be None, not any URL from the content
+    assert result.redirected_url is None
+
+
+@pytest.mark.asyncio
+async def test_raw_html_empty_base_url():
+    """Test raw HTML with empty string base_url."""
+    html = "<html><body>Content</body></html>"
+
+    async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig(base_url="")
+        result = await crawler.arun(f"raw:{html}", config=config)
+
+    assert result.success
+    # Empty string is falsy, so redirected_url should be None
+    assert result.redirected_url is None or result.redirected_url == ""
+
+
+@pytest.mark.asyncio
+async def test_raw_html_process_in_browser_redirected_url():
+    """Test redirected_url with process_in_browser=True."""
+    html = "<html><body>Test</body></html>"
+
+    async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig(process_in_browser=True)
+        result = await crawler.arun(f"raw:{html}", config=config)
+
+    assert result.success
+    assert result.redirected_url is None
+
+
+# ============================================================================
+# Regression test: specific issue scenario
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_regression_321kb_html_redirected_url():
+    """
+    Regression test for the specific issue:
+    - raw:{321KB HTML} should NOT have redirected_url = "raw:{321KB HTML}"
+    - This was causing massive memory/logging issues
+    """
+    # Create ~321KB of HTML content
+    content = "X" * 300000  # ~300KB of content
+    html = f"<html><body><div>{content}</div></body></html>"
+    assert len(html) > 300000, "Should be >300KB"
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(f"raw:{html}")
+
+    assert result.success
+
+    # The bug was: redirected_url = "raw:{321KB HTML}"
+    # After fix: redirected_url = None
+    assert result.redirected_url is None, (
+        "REGRESSION: redirected_url contains the raw HTML! "
+        f"Length: {len(result.redirected_url) if result.redirected_url else 0}"
+    )
+
+
+if __name__ == "__main__":
+    import sys
+    import traceback
+
+    async def run_tests():
+        """Run every test in this module sequentially and print a summary.
+
+        Returns True when all tests passed, False otherwise.
+        """
+        # Keep this list in sync with the test functions defined above.
+        tests = [
+            ("redirected_url None without base_url", test_raw_html_redirected_url_is_none_without_base_url),
+            ("redirected_url not huge", test_raw_html_redirected_url_not_huge),
+            ("redirected_url with base_url", test_raw_html_with_base_url_sets_redirected_url),
+            ("raw:// prefix", test_raw_double_slash_prefix_redirected_url),
+            ("browser path None", test_raw_html_browser_path_redirected_url_none),
+            ("browser path with base_url", test_raw_html_browser_path_with_base_url),
+            ("screenshot", test_raw_html_screenshot_redirected_url),
+            ("HTTP URL still works", test_http_url_redirected_url_still_works),
+            ("HTTP redirect", test_http_url_with_redirect_preserves_redirected_url),
+            ("URL-like content", test_raw_html_with_url_like_content),
+            ("empty base_url", test_raw_html_empty_base_url),
+            ("process_in_browser", test_raw_html_process_in_browser_redirected_url),
+            ("321KB regression", test_regression_321kb_html_redirected_url),
+        ]
+
+        passed = 0
+        failed = 0
+
+        for name, test_fn in tests:
+            print(f"\n=== {name} ===")
+            try:
+                await test_fn()
+                print("PASSED")
+                passed += 1
+            except Exception as e:
+                # Report the failure but keep running the remaining tests.
+                print(f"FAILED: {e}")
+                traceback.print_exc()
+                failed += 1
+
+        print(f"\n{'='*50}")
+        print(f"Results: {passed} passed, {failed} failed")
+        return failed == 0
+
+    success = asyncio.run(run_tests())
+    sys.exit(0 if success else 1)