Add browser pipeline support for raw:/file:// URLs

- Add process_in_browser parameter to CrawlerRunConfig
- Route raw:/file:// URLs through _crawl_web() when browser operations needed
- Use page.set_content() instead of goto() for local content
- Fix cookie handling for non-HTTP URLs in browser_manager
- Auto-detect browser requirements: js_code, wait_for, screenshot, etc.
- Maintain fast path for raw:/file:// without browser params

Fixes #310
This commit is contained in:
unclecode
2025-12-27 12:32:42 +00:00
parent a43256b27a
commit 2550f3d2d5
5 changed files with 870 additions and 118 deletions

View File

@@ -0,0 +1,172 @@
"""
Tests for raw:/file:// URL browser pipeline support.
Tests the new feature that allows js_code, wait_for, and other browser operations
to work with raw: and file:// URLs by routing them through _crawl_web() with
set_content() instead of goto().
"""
import pytest
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
@pytest.mark.asyncio
async def test_raw_html_fast_path():
"""Test that raw: without browser params returns HTML directly (fast path)."""
html = "<html><body><div id='test'>Original Content</div></body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig() # No browser params
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "Original Content" in result.html
# Fast path should not modify the HTML
assert result.html == html
@pytest.mark.asyncio
async def test_js_code_on_raw_html():
"""Test that js_code executes on raw: HTML and modifies the DOM."""
html = "<html><body><div id='test'>Original</div></body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code="document.getElementById('test').innerText = 'Modified by JS'"
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "Modified by JS" in result.html
assert "Original" not in result.html or "Modified by JS" in result.html
@pytest.mark.asyncio
async def test_js_code_adds_element_to_raw_html():
"""Test that js_code can add new elements to raw: HTML."""
html = "<html><body><div id='container'></div></body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code='document.getElementById("container").innerHTML = "<span id=\'injected\'>Custom Content</span>"'
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "injected" in result.html
assert "Custom Content" in result.html
@pytest.mark.asyncio
async def test_screenshot_on_raw_html():
"""Test that screenshots work on raw: HTML."""
html = "<html><body><h1 style='color:red;font-size:48px;'>Screenshot Test</h1></body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(screenshot=True)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert result.screenshot is not None
assert len(result.screenshot) > 100 # Should have substantial screenshot data
@pytest.mark.asyncio
async def test_process_in_browser_flag():
"""Test that process_in_browser=True forces browser path even without other params."""
html = "<html><body><div>Test</div></body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(process_in_browser=True)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
# Browser path normalizes HTML, so it may be slightly different
assert "Test" in result.html
@pytest.mark.asyncio
async def test_raw_prefix_variations():
"""Test both raw: and raw:// prefix formats."""
html = "<html><body>Content</body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code='document.body.innerHTML += "<div id=\'added\'>Added</div>"'
)
# Test raw: prefix
result1 = await crawler.arun(f"raw:{html}", config=config)
assert result1.success
assert "Added" in result1.html
# Test raw:// prefix
result2 = await crawler.arun(f"raw://{html}", config=config)
assert result2.success
assert "Added" in result2.html
@pytest.mark.asyncio
async def test_wait_for_on_raw_html():
"""Test that wait_for works with raw: HTML after js_code modifies DOM."""
html = "<html><body><div id='container'></div></body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code='''
setTimeout(() => {
document.getElementById('container').innerHTML = '<div id="delayed">Delayed Content</div>';
}, 100);
''',
wait_for="#delayed",
wait_for_timeout=5000
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "Delayed Content" in result.html
@pytest.mark.asyncio
async def test_multiple_js_code_scripts():
"""Test that multiple js_code scripts execute in order."""
html = "<html><body><div id='counter'>0</div></body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code=[
"document.getElementById('counter').innerText = '1'",
"document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1",
"document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1",
]
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert ">3<" in result.html # Counter should be 3 after all scripts run
if __name__ == "__main__":
# Run a quick manual test
async def quick_test():
html = "<html><body><div id='test'>Original</div></body></html>"
async with AsyncWebCrawler(verbose=True) as crawler:
# Test 1: Fast path
print("\n=== Test 1: Fast path (no browser params) ===")
result1 = await crawler.arun(f"raw:{html}")
print(f"Success: {result1.success}")
print(f"HTML contains 'Original': {'Original' in result1.html}")
# Test 2: js_code modifies DOM
print("\n=== Test 2: js_code modifies DOM ===")
config = CrawlerRunConfig(
js_code="document.getElementById('test').innerText = 'Modified by JS'"
)
result2 = await crawler.arun(f"raw:{html}", config=config)
print(f"Success: {result2.success}")
print(f"HTML contains 'Modified by JS': {'Modified by JS' in result2.html}")
print(f"HTML snippet: {result2.html[:500]}...")
asyncio.run(quick_test())

View File

@@ -0,0 +1,563 @@
"""
BRUTAL edge case tests for raw:/file:// URL browser pipeline.
These tests try to break the system with tricky inputs, edge cases,
and compatibility checks to ensure we didn't break existing functionality.
"""
import pytest
import asyncio
import tempfile
import os
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
# ============================================================================
# EDGE CASE: Hash characters in HTML (previously broke urlparse - Issue #283)
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_hash_in_css():
"""Test that # in CSS colors doesn't break HTML parsing (regression for #283)."""
html = """
<html>
<head>
<style>
body { background-color: #ff5733; color: #333333; }
.highlight { border: 1px solid #000; }
</style>
</head>
<body>
<div class="highlight" style="color: #ffffff;">Content with hash colors</div>
</body>
</html>
"""
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(js_code="document.body.innerHTML += '<div id=\"added\">Added</div>'")
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "#ff5733" in result.html or "ff5733" in result.html # Color should be preserved
assert "Added" in result.html # JS executed
assert "Content with hash colors" in result.html # Original content preserved
@pytest.mark.asyncio
async def test_raw_html_with_fragment_links():
"""Test HTML with # fragment links doesn't break."""
html = """
<html><body>
<a href="#section1">Go to section 1</a>
<a href="#section2">Go to section 2</a>
<div id="section1">Section 1</div>
<div id="section2">Section 2</div>
</body></html>
"""
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(js_code="document.getElementById('section1').innerText = 'Modified Section 1'")
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "Modified Section 1" in result.html
assert "#section2" in result.html # Fragment link preserved
# ============================================================================
# EDGE CASE: Special characters and unicode
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_unicode():
"""Test raw HTML with various unicode characters."""
html = """
<html><body>
<div id="unicode">日本語 中文 한국어 العربية 🎉 💻 🚀</div>
<div id="special">&amp; &lt; &gt; &quot; &apos;</div>
</body></html>
"""
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(js_code="document.getElementById('unicode').innerText += ' ✅ Modified'")
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "✅ Modified" in result.html or "Modified" in result.html
# Check unicode is preserved
assert "日本語" in result.html or "&#" in result.html # Either preserved or encoded
@pytest.mark.asyncio
async def test_raw_html_with_script_tags():
"""Test raw HTML with existing script tags doesn't interfere with js_code."""
html = """
<html><body>
<div id="counter">0</div>
<script>
// This script runs on page load
document.getElementById('counter').innerText = '10';
</script>
</body></html>
"""
async with AsyncWebCrawler() as crawler:
# Our js_code runs AFTER the page scripts
config = CrawlerRunConfig(
js_code="document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 5"
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
# The embedded script sets it to 10, then our js_code adds 5
assert ">15<" in result.html or "15" in result.html
# ============================================================================
# EDGE CASE: Empty and malformed HTML
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_empty():
"""Test empty raw HTML."""
html = ""
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(js_code="document.body.innerHTML = '<div>Added to empty</div>'")
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "Added to empty" in result.html
@pytest.mark.asyncio
async def test_raw_html_minimal():
"""Test minimal HTML (just text, no tags)."""
html = "Just plain text, no HTML tags"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(js_code="document.body.innerHTML += '<div id=\"injected\">Injected</div>'")
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
# Browser should wrap it in proper HTML
assert "Injected" in result.html
@pytest.mark.asyncio
async def test_raw_html_malformed():
"""Test malformed HTML with unclosed tags."""
html = "<html><body><div><span>Unclosed tags<div>More content"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(js_code="document.body.innerHTML += '<div id=\"valid\">Valid Added</div>'")
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "Valid Added" in result.html
# Browser should have fixed the malformed HTML
# ============================================================================
# EDGE CASE: Very large HTML
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_large():
"""Test large raw HTML (100KB+)."""
# Generate 100KB of HTML
items = "".join([f'<div class="item" id="item-{i}">Item {i} content here with some text</div>\n' for i in range(2000)])
html = f"<html><body>{items}</body></html>"
assert len(html) > 100000 # Verify it's actually large
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code="document.getElementById('item-999').innerText = 'MODIFIED ITEM 999'"
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "MODIFIED ITEM 999" in result.html
assert "item-1999" in result.html # Last item should still exist
# ============================================================================
# EDGE CASE: JavaScript errors and timeouts
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_js_error_doesnt_crash():
"""Test that JavaScript errors in js_code don't crash the crawl."""
html = "<html><body><div id='test'>Original</div></body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code=[
"nonExistentFunction();", # This will throw an error
"document.getElementById('test').innerText = 'Still works'" # This should still run
]
)
result = await crawler.arun(f"raw:{html}", config=config)
# Crawl should succeed even with JS errors
assert result.success
@pytest.mark.asyncio
async def test_raw_html_wait_for_timeout():
"""Test wait_for with element that never appears times out gracefully."""
html = "<html><body><div id='test'>Original</div></body></html>"
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
wait_for="#never-exists",
wait_for_timeout=1000 # 1 second timeout
)
result = await crawler.arun(f"raw:{html}", config=config)
# Should timeout but still return the HTML we have
# The behavior might be success=False or success=True with partial content
# Either way, it shouldn't hang or crash
assert result is not None
# ============================================================================
# COMPATIBILITY: Normal HTTP URLs still work
# ============================================================================
@pytest.mark.asyncio
async def test_http_urls_still_work():
"""Ensure we didn't break normal HTTP URL crawling."""
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com")
assert result.success
assert "Example Domain" in result.html
@pytest.mark.asyncio
async def test_http_with_js_code_still_works():
"""Ensure HTTP URLs with js_code still work."""
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code="document.body.innerHTML += '<div id=\"injected\">Injected via JS</div>'"
)
result = await crawler.arun("https://example.com", config=config)
assert result.success
assert "Injected via JS" in result.html
# ============================================================================
# COMPATIBILITY: File URLs
# ============================================================================
@pytest.mark.asyncio
async def test_file_url_with_js_code():
"""Test file:// URLs with js_code execution."""
# Create a temp file
with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
f.write("<html><body><div id='file-content'>File Content</div></body></html>")
temp_path = f.name
try:
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code="document.getElementById('file-content').innerText = 'Modified File Content'"
)
result = await crawler.arun(f"file://{temp_path}", config=config)
assert result.success
assert "Modified File Content" in result.html
finally:
os.unlink(temp_path)
@pytest.mark.asyncio
async def test_file_url_fast_path():
"""Test file:// fast path (no browser params)."""
with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
f.write("<html><body>Fast path file content</body></html>")
temp_path = f.name
try:
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(f"file://{temp_path}")
assert result.success
assert "Fast path file content" in result.html
finally:
os.unlink(temp_path)
# ============================================================================
# COMPATIBILITY: Extraction strategies with raw HTML
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_css_extraction():
"""Test CSS extraction on raw HTML after js_code modifies it."""
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
html = """
<html><body>
<div class="products">
<div class="product"><span class="name">Original Product</span></div>
</div>
</body></html>
"""
schema = {
"name": "Products",
"baseSelector": ".product",
"fields": [
{"name": "name", "selector": ".name", "type": "text"}
]
}
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code="""
document.querySelector('.products').innerHTML +=
'<div class="product"><span class="name">JS Added Product</span></div>';
""",
extraction_strategy=JsonCssExtractionStrategy(schema)
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
# Check that extraction found both products
import json
extracted = json.loads(result.extracted_content)
names = [p.get('name', '') for p in extracted]
assert any("JS Added Product" in name for name in names)
# ============================================================================
# EDGE CASE: Concurrent raw: requests
# ============================================================================
@pytest.mark.asyncio
async def test_concurrent_raw_requests():
"""Test multiple concurrent raw: requests don't interfere."""
htmls = [
f"<html><body><div id='test'>Request {i}</div></body></html>"
for i in range(5)
]
async with AsyncWebCrawler() as crawler:
configs = [
CrawlerRunConfig(
js_code=f"document.getElementById('test').innerText += ' Modified {i}'"
)
for i in range(5)
]
# Run concurrently
tasks = [
crawler.arun(f"raw:{html}", config=config)
for html, config in zip(htmls, configs)
]
results = await asyncio.gather(*tasks)
for i, result in enumerate(results):
assert result.success
assert f"Request {i}" in result.html
assert f"Modified {i}" in result.html
# ============================================================================
# EDGE CASE: raw: with base_url for link resolution
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_base_url():
"""Test that base_url is used for link resolution in markdown."""
html = """
<html><body>
<a href="/page1">Page 1</a>
<a href="/page2">Page 2</a>
<img src="/images/logo.png" alt="Logo">
</body></html>
"""
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
base_url="https://example.com",
process_in_browser=True # Force browser to test base_url handling
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
# Check markdown has absolute URLs
if result.markdown:
# Links should be absolute
md = result.markdown.raw_markdown if hasattr(result.markdown, 'raw_markdown') else str(result.markdown)
assert "example.com" in md or "/page1" in md
# ============================================================================
# EDGE CASE: raw: with screenshot of complex page
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_screenshot_complex_page():
"""Test screenshot of complex raw HTML with CSS and JS modifications."""
html = """
<html>
<head>
<style>
body { font-family: Arial; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 40px; }
.card { background: white; padding: 20px; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1); }
h1 { color: #333; }
</style>
</head>
<body>
<div class="card">
<h1 id="title">Original Title</h1>
<p>This is a test card with styling.</p>
</div>
</body>
</html>
"""
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code="document.getElementById('title').innerText = 'Modified Title'",
screenshot=True
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert result.screenshot is not None
assert len(result.screenshot) > 1000 # Should be substantial
assert "Modified Title" in result.html
# ============================================================================
# EDGE CASE: JavaScript that tries to navigate away
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_js_navigation_blocked():
"""Test that JS trying to navigate doesn't break the crawl."""
html = """
<html><body>
<div id="content">Original Content</div>
<script>
// Try to navigate away (should be blocked or handled)
// window.location.href = 'https://example.com';
</script>
</body></html>
"""
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
# Try to navigate via js_code
js_code=[
"document.getElementById('content').innerText = 'Before navigation attempt'",
# Actual navigation attempt commented - would cause issues
# "window.location.href = 'https://example.com'",
]
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "Before navigation attempt" in result.html
# ============================================================================
# EDGE CASE: Raw HTML with iframes
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_iframes():
"""Test raw HTML containing iframes."""
html = """
<html><body>
<div id="main">Main content</div>
<iframe id="frame1" srcdoc="<html><body><div id='iframe-content'>Iframe Content</div></body></html>"></iframe>
</body></html>
"""
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code="document.getElementById('main').innerText = 'Modified main'",
process_iframes=True
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "Modified main" in result.html
# ============================================================================
# TRICKY: Protocol inside raw content
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_urls_inside():
"""Test raw: with http:// URLs inside the content."""
html = """
<html><body>
<a href="http://example.com">Example</a>
<a href="https://google.com">Google</a>
<img src="https://placekitten.com/200/300" alt="Cat">
<div id="test">Test content with URL: https://test.com</div>
</body></html>
"""
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
js_code="document.getElementById('test').innerText += ' - Modified'"
)
result = await crawler.arun(f"raw:{html}", config=config)
assert result.success
assert "Modified" in result.html
assert "http://example.com" in result.html or "example.com" in result.html
# ============================================================================
# TRICKY: Double raw: prefix
# ============================================================================
@pytest.mark.asyncio
async def test_double_raw_prefix():
"""Test what happens with double raw: prefix (edge case)."""
html = "<html><body>Content</body></html>"
async with AsyncWebCrawler() as crawler:
# raw:raw:<html>... - the second raw: becomes part of content
result = await crawler.arun(f"raw:raw:{html}")
# Should either handle gracefully or return "raw:<html>..." as content
assert result is not None
if __name__ == "__main__":
import sys
async def run_tests():
# Run a few key tests manually
tests = [
("Hash in CSS", test_raw_html_with_hash_in_css),
("Unicode", test_raw_html_with_unicode),
("Large HTML", test_raw_html_large),
("HTTP still works", test_http_urls_still_work),
("Concurrent requests", test_concurrent_raw_requests),
("Complex screenshot", test_raw_html_screenshot_complex_page),
]
for name, test_fn in tests:
print(f"\n=== Running: {name} ===")
try:
await test_fn()
print(f"{name} PASSED")
except Exception as e:
print(f"{name} FAILED: {e}")
import traceback
traceback.print_exc()
asyncio.run(run_tests())