More content"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(js_code="document.body.innerHTML += '
Valid Added
'")
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "Valid Added" in result.html
# Browser should have fixed the malformed HTML
# ============================================================================
# EDGE CASE: Very large HTML
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_large():
    """Test large raw HTML (100KB+).

    Verifies that a `raw:` document well over 100KB survives the round trip
    through the browser and that js_code can still target an element deep
    inside it.
    """
    # Generate 100KB+ of HTML: 2000 individually addressable items so a
    # specific one can be modified and the last one verified afterwards.
    items = "".join(
        f"<div id='item-{i}'>Item {i} content here with some text</div>\n"
        for i in range(2000)
    )
    html = f"<html><body>{items}</body></html>"
    assert len(html) > 100000  # Verify it's actually large

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.getElementById('item-999').innerText = 'MODIFIED ITEM 999'"
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "MODIFIED ITEM 999" in result.html
        assert "item-1999" in result.html  # Last item should still exist
# ============================================================================
# EDGE CASE: JavaScript errors and timeouts
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_js_error_doesnt_crash():
    """Test that JavaScript errors in js_code don't crash the crawl.

    The first js_code snippet deliberately throws; the crawl should still
    complete successfully.
    """
    html = "<html><body><div id='test'>Original</div></body></html>"
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code=[
                "nonExistentFunction();",  # This will throw an error
                "document.getElementById('test').innerText = 'Still works'",  # This should still run
            ]
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        # Crawl should succeed even with JS errors
        assert result.success
@pytest.mark.asyncio
async def test_raw_html_wait_for_timeout():
    """Test wait_for with element that never appears times out gracefully.

    The selector `#never-exists` is absent from the document, so the
    wait must hit its timeout rather than hang.
    """
    html = "<html><body><div id='test'>Original</div></body></html>"
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            wait_for="#never-exists",
            wait_for_timeout=1000,  # 1 second timeout
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        # Should timeout but still return the HTML we have.
        # The behavior might be success=False or success=True with partial
        # content; either way, it shouldn't hang or crash.
        assert result is not None
# ============================================================================
# COMPATIBILITY: Normal HTTP URLs still work
# ============================================================================
@pytest.mark.asyncio
async def test_http_urls_still_work():
    """Ensure we didn't break normal HTTP URL crawling."""
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com")
        assert result.success
        assert "Example Domain" in result.html
@pytest.mark.asyncio
async def test_http_with_js_code_still_works():
    """Ensure HTTP URLs with js_code still work."""
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.body.innerHTML += '<div>Injected via JS</div>'"
        )
        result = await crawler.arun("https://example.com", config=config)
        assert result.success
        assert "Injected via JS" in result.html
# ============================================================================
# COMPATIBILITY: File URLs
# ============================================================================
@pytest.mark.asyncio
async def test_file_url_with_js_code():
    """Test file:// URLs with js_code execution."""
    # Create a temp file (delete=False so it survives the `with` and can be
    # read by the browser; removed explicitly in the finally block).
    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
        f.write("<html><body><div id='file-content'>File Content</div></body></html>")
        temp_path = f.name
    try:
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                js_code="document.getElementById('file-content').innerText = 'Modified File Content'"
            )
            result = await crawler.arun(f"file://{temp_path}", config=config)
            assert result.success
            assert "Modified File Content" in result.html
    finally:
        os.unlink(temp_path)
@pytest.mark.asyncio
async def test_file_url_fast_path():
    """Test file:// fast path (no browser params)."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
        f.write("<html><body>Fast path file content</body></html>")
        temp_path = f.name
    try:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(f"file://{temp_path}")
            assert result.success
            assert "Fast path file content" in result.html
    finally:
        os.unlink(temp_path)
# ============================================================================
# COMPATIBILITY: Extraction strategies with raw HTML
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_css_extraction():
    """Test CSS extraction on raw HTML after js_code modifies it.

    js_code appends a new `.product` entry; extraction must see it,
    proving the strategy runs on the post-JS DOM.
    """
    from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

    html = """
    <html><body>
    <div class="products">
        <div class="product"><span class="name">Original Product</span></div>
    </div>
    </body></html>
    """
    schema = {
        "name": "Products",
        "baseSelector": ".product",
        "fields": [
            {"name": "name", "selector": ".name", "type": "text"}
        ],
    }
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="""
                document.querySelector('.products').innerHTML +=
                    '<div class="product"><span class="name">JS Added Product</span></div>';
            """,
            extraction_strategy=JsonCssExtractionStrategy(schema),
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        # Check that extraction found the JS-added product
        import json
        extracted = json.loads(result.extracted_content)
        names = [p.get('name', '') for p in extracted]
        assert any("JS Added Product" in name for name in names)
# ============================================================================
# EDGE CASE: Concurrent raw: requests
# ============================================================================
@pytest.mark.asyncio
async def test_concurrent_raw_requests():
    """Test multiple concurrent raw: requests don't interfere.

    Each request carries a distinct marker and a distinct js_code mutation;
    afterwards every result must contain only its own pair.
    """
    htmls = [
        f"<html><body><div id='test'>Request {i}</div></body></html>"
        for i in range(5)
    ]
    async with AsyncWebCrawler() as crawler:
        configs = [
            CrawlerRunConfig(
                js_code=f"document.getElementById('test').innerText += ' Modified {i}'"
            )
            for i in range(5)
        ]
        # Run concurrently
        tasks = [
            crawler.arun(f"raw:{html}", config=config)
            for html, config in zip(htmls, configs)
        ]
        results = await asyncio.gather(*tasks)
        for i, result in enumerate(results):
            assert result.success
            assert f"Request {i}" in result.html
            assert f"Modified {i}" in result.html
# ============================================================================
# EDGE CASE: raw: with base_url for link resolution
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_base_url():
    """Test that base_url is used for link resolution in markdown."""
    html = """
    <html><body>
    <a href="/page1">Page 1</a>
    <a href="/page2">Page 2</a>
    <img src="/logo.png">
    </body></html>
    """
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            base_url="https://example.com",
            process_in_browser=True,  # Force browser to test base_url handling
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        # Check markdown has absolute URLs
        if result.markdown:
            # Links should be absolute
            md = result.markdown.raw_markdown if hasattr(result.markdown, 'raw_markdown') else str(result.markdown)
            assert "example.com" in md or "/page1" in md
# ============================================================================
# EDGE CASE: raw: with screenshot of complex page
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_screenshot_complex_page():
    """Test screenshot of complex raw HTML with CSS and JS modifications."""
    html = """
    <html>
    <head><style>.card { border: 1px solid #333; padding: 20px; }</style></head>
    <body>
    <h1 id="title">Original Title</h1>
    <div class="card">This is a test card with styling.</div>
    </body>
    </html>
    """
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.getElementById('title').innerText = 'Modified Title'",
            screenshot=True,
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert result.screenshot is not None
        assert len(result.screenshot) > 1000  # Should be substantial
        assert "Modified Title" in result.html
# ============================================================================
# EDGE CASE: JavaScript that tries to navigate away
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_js_navigation_blocked():
    """Test that JS trying to navigate doesn't break the crawl."""
    html = """
    <html><body>
    <div id="content">Original Content</div>
    </body></html>
    """
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            # Try to navigate via js_code
            js_code=[
                "document.getElementById('content').innerText = 'Before navigation attempt'",
                # Actual navigation attempt commented - would cause issues
                # "window.location.href = 'https://example.com'",
            ]
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "Before navigation attempt" in result.html
# ============================================================================
# EDGE CASE: Raw HTML with iframes
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_iframes():
    """Test raw HTML containing iframes."""
    html = """
    <html><body>
    <div id="main">Main content</div>
    <iframe src="about:blank"></iframe>
    </body></html>
    """
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.getElementById('main').innerText = 'Modified main'",
            process_iframes=True,
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "Modified main" in result.html
# ============================================================================
# TRICKY: Protocol inside raw content
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_urls_inside():
    """Test raw: with http:// URLs inside the content.

    URLs embedded in the document must not confuse the raw: prefix parsing.
    """
    html = """
    <html><body>
    <a href="http://example.com">Example</a>
    <a href="https://google.com">Google</a>
    <div id="test">Test content with URL: https://test.com</div>
    </body></html>
    """
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.getElementById('test').innerText += ' - Modified'"
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "Modified" in result.html
        assert "http://example.com" in result.html or "example.com" in result.html
# ============================================================================
# TRICKY: Double raw: prefix
# ============================================================================
@pytest.mark.asyncio
async def test_double_raw_prefix():
    """Test what happens with double raw: prefix (edge case)."""
    html = "<html><body>Content</body></html>"
    async with AsyncWebCrawler() as crawler:
        # raw:raw:... - the second raw: becomes part of content
        result = await crawler.arun(f"raw:raw:{html}")
        # Should either handle gracefully or return "raw:..." as content
        assert result is not None
if __name__ == "__main__":
    # Manual runner: executes a handful of key tests directly, without
    # pytest, printing pass/fail per test. (Unused `import sys` removed.)
    async def run_tests():
        # Run a few key tests manually
        tests = [
            ("Hash in CSS", test_raw_html_with_hash_in_css),
            ("Unicode", test_raw_html_with_unicode),
            ("Large HTML", test_raw_html_large),
            ("HTTP still works", test_http_urls_still_work),
            ("Concurrent requests", test_concurrent_raw_requests),
            ("Complex screenshot", test_raw_html_screenshot_complex_page),
        ]
        for name, test_fn in tests:
            print(f"\n=== Running: {name} ===")
            try:
                await test_fn()
                print(f"✅ {name} PASSED")
            except Exception as e:
                print(f"❌ {name} FAILED: {e}")
                import traceback
                traceback.print_exc()

    asyncio.run(run_tests())