"""
Tests for raw:/file:// URL browser pipeline support.
Tests the new feature that allows js_code, wait_for, and other browser operations
to work with raw: and file:// URLs by routing them through _crawl_web() with
set_content() instead of goto().
"""
import pytest
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
@pytest.mark.asyncio
async def test_raw_html_fast_path():
    """Test that raw: without browser params returns HTML directly (fast path).

    With no js_code/wait_for/screenshot in the config, the crawler should
    short-circuit and hand back the input HTML byte-for-byte.
    """
    # NOTE(review): markup was stripped from this literal by a sanitizer;
    # reconstructed so that result.html == html can hold on the fast path.
    html = "<html><body><h1>Original Content</h1></body></html>"

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig()  # No browser params -> fast path
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        assert "Original Content" in result.html
        # Fast path should not modify the HTML at all
        assert result.html == html
@pytest.mark.asyncio
async def test_js_code_on_raw_html():
    """Test that js_code executes on raw: HTML and modifies the DOM."""
    # Reconstructed markup: js_code targets #test, so the element must exist.
    html = "<html><body><div id='test'>Original</div></body></html>"

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.getElementById('test').innerText = 'Modified by JS'"
        )
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        assert "Modified by JS" in result.html
        # innerText assignment replaces the original text entirely
        assert "Original" not in result.html or "Modified by JS" in result.html
@pytest.mark.asyncio
async def test_js_code_adds_element_to_raw_html():
    """Test that js_code can add new elements to raw: HTML."""
    # Reconstructed: an empty container for the script to fill.
    html = "<html><body><div id='container'></div></body></html>"

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            # The injected markup carries class='injected' so the assertion
            # below can detect that the new element (not just its text) landed.
            js_code=(
                'document.getElementById("container").innerHTML = '
                '"<span class=\'injected\'>Custom Content</span>"'
            )
        )
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        assert "injected" in result.html
        assert "Custom Content" in result.html
@pytest.mark.asyncio
async def test_screenshot_on_raw_html():
    """Test that screenshots work on raw: HTML (forces the browser path)."""
    html = "<html><body><h1>Screenshot Test</h1></body></html>"

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(screenshot=True)
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        assert result.screenshot is not None
        assert len(result.screenshot) > 100  # Should have substantial screenshot data
@pytest.mark.asyncio
async def test_process_in_browser_flag():
    """Test that process_in_browser=True forces browser path even without other params."""
    html = "<html><body><p>Test</p></body></html>"

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(process_in_browser=True)
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        # Browser path normalizes HTML, so it may be slightly different;
        # only the text content is guaranteed to survive round-tripping.
        assert "Test" in result.html
@pytest.mark.asyncio
async def test_raw_prefix_variations():
    """Test both raw: and raw:// prefix formats route through the browser path."""
    html = "<div>Content</div>"

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code='document.body.innerHTML += "<p>Added</p>"'
        )

        # Test raw: prefix
        result1 = await crawler.arun(f"raw:{html}", config=config)
        assert result1.success
        assert "Added" in result1.html

        # Test raw:// prefix
        result2 = await crawler.arun(f"raw://{html}", config=config)
        assert result2.success
        assert "Added" in result2.html
@pytest.mark.asyncio
async def test_wait_for_on_raw_html():
    """Test that wait_for works with raw: HTML after js_code modifies DOM.

    The script inserts #delayed asynchronously; wait_for must block until
    the selector matches, proving the wait machinery runs on raw: content.
    """
    html = "<html><body><div id='container'></div></body></html>"

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code='''
                setTimeout(() => {
                    document.getElementById('container').innerHTML =
                        '<div id="delayed">Delayed Content</div>';
                }, 100);
            ''',
            wait_for="#delayed",
            wait_for_timeout=5000
        )
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        assert "Delayed Content" in result.html
@pytest.mark.asyncio
async def test_multiple_js_code_scripts():
    """Test that multiple js_code scripts execute in order."""
    # Reconstructed: the ">3<" assertion below requires the counter text to
    # sit directly between tags, e.g. <div id='counter'>3</div>.
    html = "<html><body><div id='counter'>0</div></body></html>"

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code=[
                "document.getElementById('counter').innerText = '1'",
                "document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1",
                "document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1",
            ]
        )
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        assert ">3<" in result.html  # Counter should be 3 after all scripts run
if __name__ == "__main__":
    # Run a quick manual smoke test without pytest.
    async def quick_test():
        html = "<html><body><div id='test'>Original</div></body></html>"

        async with AsyncWebCrawler(verbose=True) as crawler:
            # Test 1: Fast path (no browser params -> HTML returned untouched)
            print("\n=== Test 1: Fast path (no browser params) ===")
            result1 = await crawler.arun(f"raw:{html}")
            print(f"Success: {result1.success}")
            print(f"HTML contains 'Original': {'Original' in result1.html}")

            # Test 2: js_code modifies DOM (forces the browser path)
            print("\n=== Test 2: js_code modifies DOM ===")
            config = CrawlerRunConfig(
                js_code="document.getElementById('test').innerText = 'Modified by JS'"
            )
            result2 = await crawler.arun(f"raw:{html}", config=config)
            print(f"Success: {result2.success}")
            print(f"HTML contains 'Modified by JS': {'Modified by JS' in result2.html}")
            print(f"HTML snippet: {result2.html[:500]}...")

    asyncio.run(quick_test())