Add browser pipeline support for raw:/file:// URLs

- Add process_in_browser parameter to CrawlerRunConfig
- Route raw:/file:// URLs through _crawl_web() when browser operations needed
- Use page.set_content() instead of goto() for local content
- Fix cookie handling for non-HTTP URLs in browser_manager
- Auto-detect browser requirements: js_code, wait_for, screenshot, etc.
- Maintain fast path for raw:/file:// without browser params

Fixes #310
This commit is contained in:
unclecode
2025-12-27 12:32:42 +00:00
parent a43256b27a
commit 2550f3d2d5
5 changed files with 870 additions and 118 deletions

View File

@@ -1188,6 +1188,12 @@ class CrawlerRunConfig():
# Connection Parameters
stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many.
Default: False.
process_in_browser (bool): If True, forces raw:/file:// URLs to be processed through the browser
pipeline (enabling js_code, wait_for, scrolling, etc.). When False (default),
raw:/file:// URLs use a fast path that returns HTML directly without browser
interaction. This is automatically enabled when browser-requiring parameters
are detected (js_code, wait_for, screenshot, pdf, etc.).
Default: False.
check_robots_txt (bool): Whether to check robots.txt rules before crawling.
Default: False.
@@ -1308,6 +1314,7 @@ class CrawlerRunConfig():
method: str = "GET",
stream: bool = False,
prefetch: bool = False, # When True, return only HTML + links (skip heavy processing)
process_in_browser: bool = False, # Force browser processing for raw:/file:// URLs
url: str = None,
base_url: str = None, # Base URL for markdown link resolution (used with raw: HTML)
check_robots_txt: bool = False,
@@ -1445,6 +1452,7 @@ class CrawlerRunConfig():
# Connection Parameters
self.stream = stream
self.prefetch = prefetch # Prefetch mode: return only HTML + links
self.process_in_browser = process_in_browser # Force browser processing for raw:/file:// URLs
self.method = method
# Robots.txt Handling Parameters
@@ -1722,6 +1730,7 @@ class CrawlerRunConfig():
method=kwargs.get("method", "GET"),
stream=kwargs.get("stream", False),
prefetch=kwargs.get("prefetch", False),
process_in_browser=kwargs.get("process_in_browser", False),
check_robots_txt=kwargs.get("check_robots_txt", False),
user_agent=kwargs.get("user_agent"),
user_agent_mode=kwargs.get("user_agent_mode"),
@@ -1831,6 +1840,7 @@ class CrawlerRunConfig():
"method": self.method,
"stream": self.stream,
"prefetch": self.prefetch,
"process_in_browser": self.process_in_browser,
"check_robots_txt": self.check_robots_txt,
"user_agent": self.user_agent,
"user_agent_mode": self.user_agent_mode,

View File

@@ -452,66 +452,48 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if url.startswith(("http://", "https://", "view-source:")):
return await self._crawl_web(url, config)
elif url.startswith("file://"):
# initialize empty lists for console messages
captured_console = []
pdf_data = None
mhtml_data = None
# Process local file
local_file_path = url[7:] # Remove 'file://' prefix
if not os.path.exists(local_file_path):
raise FileNotFoundError(f"Local file not found: {local_file_path}")
with open(local_file_path, "r", encoding="utf-8") as f:
html = f.read()
# Handle media generation - all require loading HTML into browser
if config.screenshot or config.pdf or config.capture_mhtml:
screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
html, config
)
if config.capture_console_messages:
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
captured_console = await self._capture_console_messages(page, url)
return AsyncCrawlResponse(
html=html,
response_headers=response_headers,
status_code=status_code,
screenshot=screenshot_data,
pdf_data=pdf_data,
mhtml_data=mhtml_data,
get_delayed_content=None,
console_messages=captured_console,
elif url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:"):
# Check if browser processing is required for file:// or raw: URLs
needs_browser = (
config.process_in_browser or
config.screenshot or
config.pdf or
config.capture_mhtml or
config.js_code or
config.wait_for or
config.scan_full_page or
config.remove_overlay_elements or
config.simulate_user or
config.magic or
config.process_iframes or
config.capture_console_messages or
config.capture_network_requests
)
#####
# Since both "raw:" and "raw://" start with "raw:", the first condition is always true for both, so "raw://" will be sliced as "//...", which is incorrect.
# Fix: Check for "raw://" first, then "raw:"
# Also, the prefix "raw://" is actually 6 characters long, not 7, so it should be sliced accordingly: url[6:]
#####
elif url.startswith("raw://") or url.startswith("raw:"):
# Process raw HTML content
# raw_html = url[4:] if url[:4] == "raw:" else url[7:]
raw_html = url[6:] if url.startswith("raw://") else url[4:]
html = raw_html
pdf_data = None
mhtml_data = None
if needs_browser:
# Route through _crawl_web() for full browser pipeline
# _crawl_web() will detect file:// and raw: URLs and use set_content()
return await self._crawl_web(url, config)
# Handle media generation - all require loading HTML into browser
if config.screenshot or config.pdf or config.capture_mhtml:
screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
html, config
)
# Fast path: return HTML directly without browser interaction
if url.startswith("file://"):
# Process local file
local_file_path = url[7:] # Remove 'file://' prefix
if not os.path.exists(local_file_path):
raise FileNotFoundError(f"Local file not found: {local_file_path}")
with open(local_file_path, "r", encoding="utf-8") as f:
html = f.read()
else:
# Process raw HTML content (raw:// or raw:)
html = url[6:] if url.startswith("raw://") else url[4:]
return AsyncCrawlResponse(
html=html,
response_headers=response_headers,
status_code=status_code,
screenshot=screenshot_data,
pdf_data=pdf_data,
mhtml_data=mhtml_data,
screenshot=None,
pdf_data=None,
mhtml_data=None,
get_delayed_content=None,
)
else:
@@ -684,67 +666,83 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if not config.js_only:
await self.execute_hook("before_goto", page, context=context, url=url, config=config)
try:
# Generate a unique nonce for this request
if config.experimental.get("use_csp_nonce", False):
nonce = hashlib.sha256(os.urandom(32)).hexdigest()
# Check if this is a file:// or raw: URL that needs set_content() instead of goto()
is_local_content = url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:")
# Add CSP headers to the request
await page.set_extra_http_headers(
{
"Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
}
)
response = await page.goto(
url, wait_until=config.wait_until, timeout=config.page_timeout
)
redirected_url = page.url
except Error as e:
# Allow navigation to be aborted when downloading files
# This is expected behavior for downloads in some browser engines
if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
self.logger.info(
message=f"Navigation aborted, likely due to file download: {url}",
tag="GOTO",
params={"url": url},
)
response = None
if is_local_content:
# Load local content using set_content() instead of network navigation
if url.startswith("file://"):
local_file_path = url[7:] # Remove 'file://' prefix
if not os.path.exists(local_file_path):
raise FileNotFoundError(f"Local file not found: {local_file_path}")
with open(local_file_path, "r", encoding="utf-8") as f:
html_content = f.read()
else:
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
# raw:// or raw:
html_content = url[6:] if url.startswith("raw://") else url[4:]
await page.set_content(html_content, wait_until=config.wait_until)
response = None
redirected_url = config.base_url or url
status_code = 200
response_headers = {}
else:
# Standard web navigation with goto()
try:
# Generate a unique nonce for this request
if config.experimental.get("use_csp_nonce", False):
nonce = hashlib.sha256(os.urandom(32)).hexdigest()
# Add CSP headers to the request
await page.set_extra_http_headers(
{
"Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
}
)
response = await page.goto(
url, wait_until=config.wait_until, timeout=config.page_timeout
)
redirected_url = page.url
except Error as e:
# Allow navigation to be aborted when downloading files
# This is expected behavior for downloads in some browser engines
if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
self.logger.info(
message=f"Navigation aborted, likely due to file download: {url}",
tag="GOTO",
params={"url": url},
)
response = None
else:
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
# ──────────────────────────────────────────────────────────────
# Walk the redirect chain. Playwright returns only the last
# hop, so we trace the `request.redirected_from` links until the
# first response that differs from the final one and surface its
# status-code.
# ──────────────────────────────────────────────────────────────
if response is None:
status_code = 200
response_headers = {}
else:
first_resp = response
req = response.request
while req and req.redirected_from:
prev_req = req.redirected_from
prev_resp = await prev_req.response()
if prev_resp: # keep earliest
first_resp = prev_resp
req = prev_req
status_code = first_resp.status
response_headers = first_resp.headers
await self.execute_hook(
"after_goto", page, context=context, url=url, response=response, config=config
)
# ──────────────────────────────────────────────────────────────
# Walk the redirect chain. Playwright returns only the last
# hop, so we trace the `request.redirected_from` links until the
# first response that differs from the final one and surface its
# status-code.
# ──────────────────────────────────────────────────────────────
if response is None:
status_code = 200
response_headers = {}
else:
first_resp = response
req = response.request
while req and req.redirected_from:
prev_req = req.redirected_from
prev_resp = await prev_req.response()
if prev_resp: # keep earliest
first_resp = prev_resp
req = prev_req
status_code = first_resp.status
response_headers = first_resp.headers
# if response is None:
# status_code = 200
# response_headers = {}
# else:
# status_code = response.status
# response_headers = response.headers
else:
status_code = 200
response_headers = {}

View File

@@ -895,18 +895,27 @@ class BrowserManager:
combined_headers.update(self.config.headers)
await context.set_extra_http_headers(combined_headers)
# Add default cookie
await context.add_cookies(
[
{
"name": "cookiesEnabled",
"value": "true",
"url": crawlerRunConfig.url
if crawlerRunConfig and crawlerRunConfig.url
else "https://crawl4ai.com/",
}
]
)
# Add default cookie (skip for raw:/file:// URLs which are not valid cookie URLs)
cookie_url = None
if crawlerRunConfig and crawlerRunConfig.url:
url = crawlerRunConfig.url
# Only set cookie for http/https URLs
if url.startswith(("http://", "https://")):
cookie_url = url
elif crawlerRunConfig.base_url and crawlerRunConfig.base_url.startswith(("http://", "https://")):
# Use base_url as fallback for raw:/file:// URLs
cookie_url = crawlerRunConfig.base_url
if cookie_url:
await context.add_cookies(
[
{
"name": "cookiesEnabled",
"value": "true",
"url": cookie_url,
}
]
)
# Handle navigator overrides
if crawlerRunConfig:

View File

@@ -0,0 +1,172 @@
"""
Tests for raw:/file:// URL browser pipeline support.
Tests the new feature that allows js_code, wait_for, and other browser operations
to work with raw: and file:// URLs by routing them through _crawl_web() with
set_content() instead of goto().
"""
import pytest
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
@pytest.mark.asyncio
async def test_raw_html_fast_path():
    """raw: input with a default config must take the fast path and come back verbatim."""
    source = "<html><body><div id='test'>Original Content</div></body></html>"
    async with AsyncWebCrawler() as crawler:
        # A bare config requests no browser features, so no browser round-trip happens.
        result = await crawler.arun(f"raw:{source}", config=CrawlerRunConfig())
    assert result.success
    # The fast path hands the HTML back untouched, byte for byte.
    assert result.html == source
    assert "Original Content" in result.html
@pytest.mark.asyncio
async def test_js_code_on_raw_html():
    """Test that js_code executes on raw: HTML and modifies the DOM.

    Fix note: the original final assertion
    (``"Original" not in html or "Modified by JS" in html``) was a tautology
    once the previous assert passed; it is replaced with a real check that
    the old text was actually overwritten.
    """
    html = "<html><body><div id='test'>Original</div></body></html>"
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.getElementById('test').innerText = 'Modified by JS'"
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "Modified by JS" in result.html
        # innerText assignment replaces the node's text, so the original
        # element content (">Original<") must be gone from the captured HTML.
        assert ">Original<" not in result.html
@pytest.mark.asyncio
async def test_js_code_adds_element_to_raw_html():
    """Test that js_code can add new elements to raw: HTML."""
    html = "<html><body><div id='container'></div></body></html>"
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code='document.getElementById("container").innerHTML = "<span id=\'injected\'>Custom Content</span>"'
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        # The injected span only exists if the JS actually ran against a live
        # DOM, i.e. the raw: URL was routed through the browser pipeline.
        assert "injected" in result.html
        assert "Custom Content" in result.html
@pytest.mark.asyncio
async def test_screenshot_on_raw_html():
    """Test that screenshots work on raw: HTML."""
    html = "<html><body><h1 style='color:red;font-size:48px;'>Screenshot Test</h1></body></html>"
    async with AsyncWebCrawler() as crawler:
        # screenshot=True is one of the parameters that auto-enables the
        # browser pipeline for raw: URLs.
        config = CrawlerRunConfig(screenshot=True)
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert result.screenshot is not None
        assert len(result.screenshot) > 100  # Should have substantial screenshot data
@pytest.mark.asyncio
async def test_process_in_browser_flag():
    """Test that process_in_browser=True forces browser path even without other params."""
    html = "<html><body><div>Test</div></body></html>"
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(process_in_browser=True)
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        # Browser path normalizes HTML, so it may be slightly different from
        # the input; only check that the content survived the round-trip.
        assert "Test" in result.html
@pytest.mark.asyncio
async def test_raw_prefix_variations():
    """Test both raw: and raw:// prefix formats."""
    html = "<html><body>Content</body></html>"
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code='document.body.innerHTML += "<div id=\'added\'>Added</div>"'
        )
        # Test raw: prefix (content starts right after the 4-char prefix)
        result1 = await crawler.arun(f"raw:{html}", config=config)
        assert result1.success
        assert "Added" in result1.html
        # Test raw:// prefix (6-char prefix; must not be mis-sliced as raw: + "//")
        result2 = await crawler.arun(f"raw://{html}", config=config)
        assert result2.success
        assert "Added" in result2.html
@pytest.mark.asyncio
async def test_wait_for_on_raw_html():
    """Test that wait_for works with raw: HTML after js_code modifies DOM."""
    html = "<html><body><div id='container'></div></body></html>"
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            # The element appears only after a 100ms timer fires, so wait_for
            # must poll the live DOM rather than the initial raw HTML.
            js_code='''
            setTimeout(() => {
                document.getElementById('container').innerHTML = '<div id="delayed">Delayed Content</div>';
            }, 100);
            ''',
            wait_for="#delayed",
            wait_for_timeout=5000
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "Delayed Content" in result.html
@pytest.mark.asyncio
async def test_multiple_js_code_scripts():
    """Test that multiple js_code scripts execute in order."""
    html = "<html><body><div id='counter'>0</div></body></html>"
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            # Script 1 sets the counter to 1; scripts 2 and 3 each add 1.
            # A final value of 3 proves sequential, ordered execution.
            js_code=[
                "document.getElementById('counter').innerText = '1'",
                "document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1",
                "document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1",
            ]
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert ">3<" in result.html  # Counter should be 3 after all scripts run
if __name__ == "__main__":
    # Manual smoke-test entry point; the pytest suite above is the real coverage.
    # Run a quick manual test
    async def quick_test():
        html = "<html><body><div id='test'>Original</div></body></html>"
        async with AsyncWebCrawler(verbose=True) as crawler:
            # Test 1: Fast path
            print("\n=== Test 1: Fast path (no browser params) ===")
            result1 = await crawler.arun(f"raw:{html}")
            print(f"Success: {result1.success}")
            print(f"HTML contains 'Original': {'Original' in result1.html}")
            # Test 2: js_code modifies DOM
            print("\n=== Test 2: js_code modifies DOM ===")
            config = CrawlerRunConfig(
                js_code="document.getElementById('test').innerText = 'Modified by JS'"
            )
            result2 = await crawler.arun(f"raw:{html}", config=config)
            print(f"Success: {result2.success}")
            print(f"HTML contains 'Modified by JS': {'Modified by JS' in result2.html}")
            print(f"HTML snippet: {result2.html[:500]}...")
    asyncio.run(quick_test())

View File

@@ -0,0 +1,563 @@
"""
BRUTAL edge case tests for raw:/file:// URL browser pipeline.
These tests try to break the system with tricky inputs, edge cases,
and compatibility checks to ensure we didn't break existing functionality.
"""
import pytest
import asyncio
import tempfile
import os
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
# ============================================================================
# EDGE CASE: Hash characters in HTML (previously broke urlparse - Issue #283)
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_hash_in_css():
    """Test that # in CSS colors doesn't break HTML parsing (regression for #283).

    Fix note: the original assertion ``"#ff5733" in html or "ff5733" in html``
    was redundant — any string containing "#ff5733" also contains "ff5733",
    so the disjunction reduced to its second half. The single equivalent
    assertion is used here.
    """
    html = """
    <html>
    <head>
        <style>
            body { background-color: #ff5733; color: #333333; }
            .highlight { border: 1px solid #000; }
        </style>
    </head>
    <body>
        <div class="highlight" style="color: #ffffff;">Content with hash colors</div>
    </body>
    </html>
    """
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(js_code="document.body.innerHTML += '<div id=\"added\">Added</div>'")
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "ff5733" in result.html  # Color token preserved (with or without leading '#')
        assert "Added" in result.html  # JS executed
        assert "Content with hash colors" in result.html  # Original content preserved
@pytest.mark.asyncio
async def test_raw_html_with_fragment_links():
    """Test HTML with # fragment links doesn't break."""
    html = """
    <html><body>
        <a href="#section1">Go to section 1</a>
        <a href="#section2">Go to section 2</a>
        <div id="section1">Section 1</div>
        <div id="section2">Section 2</div>
    </body></html>
    """
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(js_code="document.getElementById('section1').innerText = 'Modified Section 1'")
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        # JS ran against the DOM despite the '#' characters in the hrefs.
        assert "Modified Section 1" in result.html
        assert "#section2" in result.html  # Fragment link preserved
# ============================================================================
# EDGE CASE: Special characters and unicode
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_unicode():
    """Test raw HTML with various unicode characters.

    Fix note: the original ``"✅ Modified" in html or "Modified" in html`` was
    redundant — the first substring contains the second, so the disjunction
    reduced to checking for "Modified" alone. That single equivalent check is
    kept (the emoji itself may be entity-encoded by the browser round-trip).
    """
    html = """
    <html><body>
        <div id="unicode">日本語 中文 한국어 العربية 🎉 💻 🚀</div>
        <div id="special">&amp; &lt; &gt; &quot; &apos;</div>
    </body></html>
    """
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(js_code="document.getElementById('unicode').innerText += ' ✅ Modified'")
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "Modified" in result.html
        # Check unicode is preserved
        assert "日本語" in result.html or "&#" in result.html  # Either preserved or encoded
@pytest.mark.asyncio
async def test_raw_html_with_script_tags():
    """Test raw HTML with existing script tags doesn't interfere with js_code.

    Fix note: the original ``">15<" in html or "15" in html`` was redundant —
    a match for ">15<" implies a match for "15" — and the weak half could be
    satisfied by unrelated digits elsewhere in the page. Only the precise
    element-content form is asserted now.
    """
    html = """
    <html><body>
        <div id="counter">0</div>
        <script>
            // This script runs on page load
            document.getElementById('counter').innerText = '10';
        </script>
    </body></html>
    """
    async with AsyncWebCrawler() as crawler:
        # Our js_code runs AFTER the page scripts
        config = CrawlerRunConfig(
            js_code="document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 5"
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        # The embedded script sets it to 10, then our js_code adds 5
        assert ">15<" in result.html
# ============================================================================
# EDGE CASE: Empty and malformed HTML
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_empty():
    """Test empty raw HTML."""
    html = ""
    async with AsyncWebCrawler() as crawler:
        # The browser must tolerate an empty document and let JS build the body.
        config = CrawlerRunConfig(js_code="document.body.innerHTML = '<div>Added to empty</div>'")
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "Added to empty" in result.html
@pytest.mark.asyncio
async def test_raw_html_minimal():
    """Test minimal HTML (just text, no tags)."""
    html = "Just plain text, no HTML tags"
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(js_code="document.body.innerHTML += '<div id=\"injected\">Injected</div>'")
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        # Browser should wrap it in proper HTML
        # (the HTML parser produces a full document even for bare text).
        assert "Injected" in result.html
@pytest.mark.asyncio
async def test_raw_html_malformed():
    """Test malformed HTML with unclosed tags."""
    html = "<html><body><div><span>Unclosed tags<div>More content"
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(js_code="document.body.innerHTML += '<div id=\"valid\">Valid Added</div>'")
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "Valid Added" in result.html
        # Browser should have fixed the malformed HTML
        # (the HTML parser auto-closes the dangling tags).
# ============================================================================
# EDGE CASE: Very large HTML
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_large():
    """Test large raw HTML (100KB+)."""
    # Generate 100KB of HTML
    items = "".join([f'<div class="item" id="item-{i}">Item {i} content here with some text</div>\n' for i in range(2000)])
    html = f"<html><body>{items}</body></html>"
    assert len(html) > 100000  # Verify it's actually large
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.getElementById('item-999').innerText = 'MODIFIED ITEM 999'"
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        # A mid-document element was modified and the tail survived, i.e. the
        # payload was not truncated on its way through the browser pipeline.
        assert "MODIFIED ITEM 999" in result.html
        assert "item-1999" in result.html  # Last item should still exist
# ============================================================================
# EDGE CASE: JavaScript errors and timeouts
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_js_error_doesnt_crash():
    """Test that JavaScript errors in js_code don't crash the crawl."""
    html = "<html><body><div id='test'>Original</div></body></html>"
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code=[
                "nonExistentFunction();",  # This will throw an error
                "document.getElementById('test').innerText = 'Still works'"  # This should still run
            ]
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        # Crawl should succeed even with JS errors
        # (script failures must be captured, not raised as crawl failures).
        assert result.success
@pytest.mark.asyncio
async def test_raw_html_wait_for_timeout():
    """Test wait_for with element that never appears times out gracefully."""
    html = "<html><body><div id='test'>Original</div></body></html>"
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            wait_for="#never-exists",
            wait_for_timeout=1000  # 1 second timeout
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        # Should timeout but still return the HTML we have
        # The behavior might be success=False or success=True with partial content
        # Either way, it shouldn't hang or crash
        # NOTE(review): this deliberately asserts almost nothing — the test's
        # only contract is "returns a result object within the timeout".
        assert result is not None
# ============================================================================
# COMPATIBILITY: Normal HTTP URLs still work
# ============================================================================
@pytest.mark.asyncio
async def test_http_urls_still_work():
    """Ensure we didn't break normal HTTP URL crawling."""
    # NOTE(review): requires live network access to example.com — will fail offline.
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com")
        assert result.success
        assert "Example Domain" in result.html
@pytest.mark.asyncio
async def test_http_with_js_code_still_works():
    """Ensure HTTP URLs with js_code still work."""
    # NOTE(review): requires live network access to example.com — will fail offline.
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.body.innerHTML += '<div id=\"injected\">Injected via JS</div>'"
        )
        result = await crawler.arun("https://example.com", config=config)
        assert result.success
        assert "Injected via JS" in result.html
# ============================================================================
# COMPATIBILITY: File URLs
# ============================================================================
@pytest.mark.asyncio
async def test_file_url_with_js_code():
    """Test file:// URLs with js_code execution."""
    # Create a temp file (delete=False so the browser can open it by path)
    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
        f.write("<html><body><div id='file-content'>File Content</div></body></html>")
        temp_path = f.name
    try:
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                js_code="document.getElementById('file-content').innerText = 'Modified File Content'"
            )
            result = await crawler.arun(f"file://{temp_path}", config=config)
            assert result.success
            assert "Modified File Content" in result.html
    finally:
        # Always remove the temp file, even when assertions fail.
        os.unlink(temp_path)
@pytest.mark.asyncio
async def test_file_url_fast_path():
    """Test file:// fast path (no browser params)."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
        f.write("<html><body>Fast path file content</body></html>")
        temp_path = f.name
    try:
        async with AsyncWebCrawler() as crawler:
            # No browser-requiring params: file is read directly from disk.
            result = await crawler.arun(f"file://{temp_path}")
            assert result.success
            assert "Fast path file content" in result.html
    finally:
        os.unlink(temp_path)
# ============================================================================
# COMPATIBILITY: Extraction strategies with raw HTML
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_css_extraction():
    """Test CSS extraction on raw HTML after js_code modifies it."""
    from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
    html = """
    <html><body>
        <div class="products">
            <div class="product"><span class="name">Original Product</span></div>
        </div>
    </body></html>
    """
    schema = {
        "name": "Products",
        "baseSelector": ".product",
        "fields": [
            {"name": "name", "selector": ".name", "type": "text"}
        ]
    }
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="""
            document.querySelector('.products').innerHTML +=
                '<div class="product"><span class="name">JS Added Product</span></div>';
            """,
            extraction_strategy=JsonCssExtractionStrategy(schema)
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        # Check that extraction found both products
        # (extraction must run on the post-JS DOM, not the raw input).
        import json
        extracted = json.loads(result.extracted_content)
        names = [p.get('name', '') for p in extracted]
        assert any("JS Added Product" in name for name in names)
# ============================================================================
# EDGE CASE: Concurrent raw: requests
# ============================================================================
@pytest.mark.asyncio
async def test_concurrent_raw_requests():
    """Test multiple concurrent raw: requests don't interfere."""
    htmls = [
        f"<html><body><div id='test'>Request {i}</div></body></html>"
        for i in range(5)
    ]
    async with AsyncWebCrawler() as crawler:
        configs = [
            CrawlerRunConfig(
                js_code=f"document.getElementById('test').innerText += ' Modified {i}'"
            )
            for i in range(5)
        ]
        # Run concurrently
        tasks = [
            crawler.arun(f"raw:{html}", config=config)
            for html, config in zip(htmls, configs)
        ]
        results = await asyncio.gather(*tasks)
        # Each result must carry its own request's content and its own JS edit —
        # proof that per-request pages did not bleed into one another.
        for i, result in enumerate(results):
            assert result.success
            assert f"Request {i}" in result.html
            assert f"Modified {i}" in result.html
# ============================================================================
# EDGE CASE: raw: with base_url for link resolution
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_base_url():
    """Test that base_url is used for link resolution in markdown."""
    html = """
    <html><body>
        <a href="/page1">Page 1</a>
        <a href="/page2">Page 2</a>
        <img src="/images/logo.png" alt="Logo">
    </body></html>
    """
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            base_url="https://example.com",
            process_in_browser=True  # Force browser to test base_url handling
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        # Check markdown has absolute URLs
        if result.markdown:
            # Links should be absolute
            # (markdown may be a plain str or an object exposing raw_markdown).
            md = result.markdown.raw_markdown if hasattr(result.markdown, 'raw_markdown') else str(result.markdown)
            assert "example.com" in md or "/page1" in md
# ============================================================================
# EDGE CASE: raw: with screenshot of complex page
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_screenshot_complex_page():
    """Test screenshot of complex raw HTML with CSS and JS modifications."""
    html = """
    <html>
    <head>
        <style>
            body { font-family: Arial; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 40px; }
            .card { background: white; padding: 20px; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1); }
            h1 { color: #333; }
        </style>
    </head>
    <body>
        <div class="card">
            <h1 id="title">Original Title</h1>
            <p>This is a test card with styling.</p>
        </div>
    </body>
    </html>
    """
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.getElementById('title').innerText = 'Modified Title'",
            screenshot=True
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert result.screenshot is not None
        # A styled, rendered page should produce a non-trivial image payload.
        assert len(result.screenshot) > 1000  # Should be substantial
        assert "Modified Title" in result.html
# ============================================================================
# EDGE CASE: JavaScript that tries to navigate away
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_js_navigation_blocked():
    """Test that JS trying to navigate doesn't break the crawl."""
    # NOTE(review): the actual navigation calls are intentionally commented
    # out below, so this currently only exercises the benign path.
    html = """
    <html><body>
        <div id="content">Original Content</div>
        <script>
            // Try to navigate away (should be blocked or handled)
            // window.location.href = 'https://example.com';
        </script>
    </body></html>
    """
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            # Try to navigate via js_code
            js_code=[
                "document.getElementById('content').innerText = 'Before navigation attempt'",
                # Actual navigation attempt commented - would cause issues
                # "window.location.href = 'https://example.com'",
            ]
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "Before navigation attempt" in result.html
# ============================================================================
# EDGE CASE: Raw HTML with iframes
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_iframes():
    """Test raw HTML containing iframes."""
    html = """
    <html><body>
        <div id="main">Main content</div>
        <iframe id="frame1" srcdoc="<html><body><div id='iframe-content'>Iframe Content</div></body></html>"></iframe>
    </body></html>
    """
    async with AsyncWebCrawler() as crawler:
        # process_iframes is one of the flags that forces the browser pipeline.
        config = CrawlerRunConfig(
            js_code="document.getElementById('main').innerText = 'Modified main'",
            process_iframes=True
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "Modified main" in result.html
# ============================================================================
# TRICKY: Protocol inside raw content
# ============================================================================
@pytest.mark.asyncio
async def test_raw_html_with_urls_inside():
    """Test raw: with http:// URLs inside the content.

    Fix note: the original ``"http://example.com" in html or "example.com"
    in html`` was redundant (the first implies the second); the single
    equivalent check on the host name is kept.
    """
    html = """
    <html><body>
        <a href="http://example.com">Example</a>
        <a href="https://google.com">Google</a>
        <img src="https://placekitten.com/200/300" alt="Cat">
        <div id="test">Test content with URL: https://test.com</div>
    </body></html>
    """
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.getElementById('test').innerText += ' - Modified'"
        )
        result = await crawler.arun(f"raw:{html}", config=config)
        assert result.success
        assert "Modified" in result.html
        # URLs inside the content must survive and not be mistaken for the crawl target.
        assert "example.com" in result.html
# ============================================================================
# TRICKY: Double raw: prefix
# ============================================================================
@pytest.mark.asyncio
async def test_double_raw_prefix():
    """Test what happens with double raw: prefix (edge case)."""
    html = "<html><body>Content</body></html>"
    async with AsyncWebCrawler() as crawler:
        # raw:raw:<html>... - the second raw: becomes part of content
        result = await crawler.arun(f"raw:raw:{html}")
        # Should either handle gracefully or return "raw:<html>..." as content
        # (no strict expectation; this only guards against crashes/hangs).
        assert result is not None
if __name__ == "__main__":
    # Manual runner for a representative subset of the suite; the pytest
    # collection above is the real coverage.
    # Fix note: an unused `import sys` was removed.
    async def run_tests():
        # Run a few key tests manually
        tests = [
            ("Hash in CSS", test_raw_html_with_hash_in_css),
            ("Unicode", test_raw_html_with_unicode),
            ("Large HTML", test_raw_html_large),
            ("HTTP still works", test_http_urls_still_work),
            ("Concurrent requests", test_concurrent_raw_requests),
            ("Complex screenshot", test_raw_html_screenshot_complex_page),
        ]
        for name, test_fn in tests:
            print(f"\n=== Running: {name} ===")
            try:
                await test_fn()
                print(f"{name} PASSED")
            except Exception as e:
                print(f"{name} FAILED: {e}")
                import traceback
                traceback.print_exc()
    asyncio.run(run_tests())