Add browser pipeline support for raw:/file:// URLs
- Add `process_in_browser` parameter to `CrawlerRunConfig`.
- Route raw:/file:// URLs through `_crawl_web()` when browser operations are needed.
- Use `page.set_content()` instead of `goto()` for local content.
- Fix cookie handling for non-HTTP URLs in `browser_manager`.
- Auto-detect browser requirements: `js_code`, `wait_for`, `screenshot`, etc.
- Maintain the fast path for raw:/file:// URLs without browser params.

Fixes #310
This commit is contained in:
@@ -1188,6 +1188,12 @@ class CrawlerRunConfig():
|
||||
# Connection Parameters
|
||||
stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many.
|
||||
Default: False.
|
||||
process_in_browser (bool): If True, forces raw:/file:// URLs to be processed through the browser
|
||||
pipeline (enabling js_code, wait_for, scrolling, etc.). When False (default),
|
||||
raw:/file:// URLs use a fast path that returns HTML directly without browser
|
||||
interaction. This is automatically enabled when browser-requiring parameters
|
||||
are detected (js_code, wait_for, screenshot, pdf, etc.).
|
||||
Default: False.
|
||||
|
||||
check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
|
||||
Default: False.
|
||||
@@ -1308,6 +1314,7 @@ class CrawlerRunConfig():
|
||||
method: str = "GET",
|
||||
stream: bool = False,
|
||||
prefetch: bool = False, # When True, return only HTML + links (skip heavy processing)
|
||||
process_in_browser: bool = False, # Force browser processing for raw:/file:// URLs
|
||||
url: str = None,
|
||||
base_url: str = None, # Base URL for markdown link resolution (used with raw: HTML)
|
||||
check_robots_txt: bool = False,
|
||||
@@ -1445,6 +1452,7 @@ class CrawlerRunConfig():
|
||||
# Connection Parameters
|
||||
self.stream = stream
|
||||
self.prefetch = prefetch # Prefetch mode: return only HTML + links
|
||||
self.process_in_browser = process_in_browser # Force browser processing for raw:/file:// URLs
|
||||
self.method = method
|
||||
|
||||
# Robots.txt Handling Parameters
|
||||
@@ -1722,6 +1730,7 @@ class CrawlerRunConfig():
|
||||
method=kwargs.get("method", "GET"),
|
||||
stream=kwargs.get("stream", False),
|
||||
prefetch=kwargs.get("prefetch", False),
|
||||
process_in_browser=kwargs.get("process_in_browser", False),
|
||||
check_robots_txt=kwargs.get("check_robots_txt", False),
|
||||
user_agent=kwargs.get("user_agent"),
|
||||
user_agent_mode=kwargs.get("user_agent_mode"),
|
||||
@@ -1831,6 +1840,7 @@ class CrawlerRunConfig():
|
||||
"method": self.method,
|
||||
"stream": self.stream,
|
||||
"prefetch": self.prefetch,
|
||||
"process_in_browser": self.process_in_browser,
|
||||
"check_robots_txt": self.check_robots_txt,
|
||||
"user_agent": self.user_agent,
|
||||
"user_agent_mode": self.user_agent_mode,
|
||||
|
||||
@@ -452,66 +452,48 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if url.startswith(("http://", "https://", "view-source:")):
|
||||
return await self._crawl_web(url, config)
|
||||
|
||||
elif url.startswith("file://"):
|
||||
# initialize empty lists for console messages
|
||||
captured_console = []
|
||||
pdf_data = None
|
||||
mhtml_data = None
|
||||
|
||||
# Process local file
|
||||
local_file_path = url[7:] # Remove 'file://' prefix
|
||||
if not os.path.exists(local_file_path):
|
||||
raise FileNotFoundError(f"Local file not found: {local_file_path}")
|
||||
with open(local_file_path, "r", encoding="utf-8") as f:
|
||||
html = f.read()
|
||||
|
||||
# Handle media generation - all require loading HTML into browser
|
||||
if config.screenshot or config.pdf or config.capture_mhtml:
|
||||
screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
|
||||
html, config
|
||||
)
|
||||
|
||||
if config.capture_console_messages:
|
||||
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
|
||||
captured_console = await self._capture_console_messages(page, url)
|
||||
|
||||
return AsyncCrawlResponse(
|
||||
html=html,
|
||||
response_headers=response_headers,
|
||||
status_code=status_code,
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
mhtml_data=mhtml_data,
|
||||
get_delayed_content=None,
|
||||
console_messages=captured_console,
|
||||
elif url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:"):
|
||||
# Check if browser processing is required for file:// or raw: URLs
|
||||
needs_browser = (
|
||||
config.process_in_browser or
|
||||
config.screenshot or
|
||||
config.pdf or
|
||||
config.capture_mhtml or
|
||||
config.js_code or
|
||||
config.wait_for or
|
||||
config.scan_full_page or
|
||||
config.remove_overlay_elements or
|
||||
config.simulate_user or
|
||||
config.magic or
|
||||
config.process_iframes or
|
||||
config.capture_console_messages or
|
||||
config.capture_network_requests
|
||||
)
|
||||
|
||||
#####
|
||||
# Since both "raw:" and "raw://" start with "raw:", the first condition is always true for both, so "raw://" will be sliced as "//...", which is incorrect.
|
||||
# Fix: Check for "raw://" first, then "raw:"
|
||||
# Also, the prefix "raw://" is actually 6 characters long, not 7, so it should be sliced accordingly: url[6:]
|
||||
#####
|
||||
elif url.startswith("raw://") or url.startswith("raw:"):
|
||||
# Process raw HTML content
|
||||
# raw_html = url[4:] if url[:4] == "raw:" else url[7:]
|
||||
raw_html = url[6:] if url.startswith("raw://") else url[4:]
|
||||
html = raw_html
|
||||
pdf_data = None
|
||||
mhtml_data = None
|
||||
if needs_browser:
|
||||
# Route through _crawl_web() for full browser pipeline
|
||||
# _crawl_web() will detect file:// and raw: URLs and use set_content()
|
||||
return await self._crawl_web(url, config)
|
||||
|
||||
# Handle media generation - all require loading HTML into browser
|
||||
if config.screenshot or config.pdf or config.capture_mhtml:
|
||||
screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
|
||||
html, config
|
||||
)
|
||||
# Fast path: return HTML directly without browser interaction
|
||||
if url.startswith("file://"):
|
||||
# Process local file
|
||||
local_file_path = url[7:] # Remove 'file://' prefix
|
||||
if not os.path.exists(local_file_path):
|
||||
raise FileNotFoundError(f"Local file not found: {local_file_path}")
|
||||
with open(local_file_path, "r", encoding="utf-8") as f:
|
||||
html = f.read()
|
||||
else:
|
||||
# Process raw HTML content (raw:// or raw:)
|
||||
html = url[6:] if url.startswith("raw://") else url[4:]
|
||||
|
||||
return AsyncCrawlResponse(
|
||||
html=html,
|
||||
response_headers=response_headers,
|
||||
status_code=status_code,
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
mhtml_data=mhtml_data,
|
||||
screenshot=None,
|
||||
pdf_data=None,
|
||||
mhtml_data=None,
|
||||
get_delayed_content=None,
|
||||
)
|
||||
else:
|
||||
@@ -684,67 +666,83 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if not config.js_only:
|
||||
await self.execute_hook("before_goto", page, context=context, url=url, config=config)
|
||||
|
||||
try:
|
||||
# Generate a unique nonce for this request
|
||||
if config.experimental.get("use_csp_nonce", False):
|
||||
nonce = hashlib.sha256(os.urandom(32)).hexdigest()
|
||||
# Check if this is a file:// or raw: URL that needs set_content() instead of goto()
|
||||
is_local_content = url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:")
|
||||
|
||||
# Add CSP headers to the request
|
||||
await page.set_extra_http_headers(
|
||||
{
|
||||
"Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
|
||||
}
|
||||
)
|
||||
|
||||
response = await page.goto(
|
||||
url, wait_until=config.wait_until, timeout=config.page_timeout
|
||||
)
|
||||
redirected_url = page.url
|
||||
except Error as e:
|
||||
# Allow navigation to be aborted when downloading files
|
||||
# This is expected behavior for downloads in some browser engines
|
||||
if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
|
||||
self.logger.info(
|
||||
message=f"Navigation aborted, likely due to file download: {url}",
|
||||
tag="GOTO",
|
||||
params={"url": url},
|
||||
)
|
||||
response = None
|
||||
if is_local_content:
|
||||
# Load local content using set_content() instead of network navigation
|
||||
if url.startswith("file://"):
|
||||
local_file_path = url[7:] # Remove 'file://' prefix
|
||||
if not os.path.exists(local_file_path):
|
||||
raise FileNotFoundError(f"Local file not found: {local_file_path}")
|
||||
with open(local_file_path, "r", encoding="utf-8") as f:
|
||||
html_content = f.read()
|
||||
else:
|
||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||
# raw:// or raw:
|
||||
html_content = url[6:] if url.startswith("raw://") else url[4:]
|
||||
|
||||
await page.set_content(html_content, wait_until=config.wait_until)
|
||||
response = None
|
||||
redirected_url = config.base_url or url
|
||||
status_code = 200
|
||||
response_headers = {}
|
||||
else:
|
||||
# Standard web navigation with goto()
|
||||
try:
|
||||
# Generate a unique nonce for this request
|
||||
if config.experimental.get("use_csp_nonce", False):
|
||||
nonce = hashlib.sha256(os.urandom(32)).hexdigest()
|
||||
|
||||
# Add CSP headers to the request
|
||||
await page.set_extra_http_headers(
|
||||
{
|
||||
"Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
|
||||
}
|
||||
)
|
||||
|
||||
response = await page.goto(
|
||||
url, wait_until=config.wait_until, timeout=config.page_timeout
|
||||
)
|
||||
redirected_url = page.url
|
||||
except Error as e:
|
||||
# Allow navigation to be aborted when downloading files
|
||||
# This is expected behavior for downloads in some browser engines
|
||||
if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
|
||||
self.logger.info(
|
||||
message=f"Navigation aborted, likely due to file download: {url}",
|
||||
tag="GOTO",
|
||||
params={"url": url},
|
||||
)
|
||||
response = None
|
||||
else:
|
||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# Walk the redirect chain. Playwright returns only the last
|
||||
# hop, so we trace the `request.redirected_from` links until the
|
||||
# first response that differs from the final one and surface its
|
||||
# status-code.
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
if response is None:
|
||||
status_code = 200
|
||||
response_headers = {}
|
||||
else:
|
||||
first_resp = response
|
||||
req = response.request
|
||||
while req and req.redirected_from:
|
||||
prev_req = req.redirected_from
|
||||
prev_resp = await prev_req.response()
|
||||
if prev_resp: # keep earliest
|
||||
first_resp = prev_resp
|
||||
req = prev_req
|
||||
|
||||
status_code = first_resp.status
|
||||
response_headers = first_resp.headers
|
||||
|
||||
await self.execute_hook(
|
||||
"after_goto", page, context=context, url=url, response=response, config=config
|
||||
)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# Walk the redirect chain. Playwright returns only the last
|
||||
# hop, so we trace the `request.redirected_from` links until the
|
||||
# first response that differs from the final one and surface its
|
||||
# status-code.
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
if response is None:
|
||||
status_code = 200
|
||||
response_headers = {}
|
||||
else:
|
||||
first_resp = response
|
||||
req = response.request
|
||||
while req and req.redirected_from:
|
||||
prev_req = req.redirected_from
|
||||
prev_resp = await prev_req.response()
|
||||
if prev_resp: # keep earliest
|
||||
first_resp = prev_resp
|
||||
req = prev_req
|
||||
|
||||
status_code = first_resp.status
|
||||
response_headers = first_resp.headers
|
||||
# if response is None:
|
||||
# status_code = 200
|
||||
# response_headers = {}
|
||||
# else:
|
||||
# status_code = response.status
|
||||
# response_headers = response.headers
|
||||
|
||||
else:
|
||||
status_code = 200
|
||||
response_headers = {}
|
||||
|
||||
@@ -895,18 +895,27 @@ class BrowserManager:
|
||||
combined_headers.update(self.config.headers)
|
||||
await context.set_extra_http_headers(combined_headers)
|
||||
|
||||
# Add default cookie
|
||||
await context.add_cookies(
|
||||
[
|
||||
{
|
||||
"name": "cookiesEnabled",
|
||||
"value": "true",
|
||||
"url": crawlerRunConfig.url
|
||||
if crawlerRunConfig and crawlerRunConfig.url
|
||||
else "https://crawl4ai.com/",
|
||||
}
|
||||
]
|
||||
)
|
||||
# Add default cookie (skip for raw:/file:// URLs which are not valid cookie URLs)
|
||||
cookie_url = None
|
||||
if crawlerRunConfig and crawlerRunConfig.url:
|
||||
url = crawlerRunConfig.url
|
||||
# Only set cookie for http/https URLs
|
||||
if url.startswith(("http://", "https://")):
|
||||
cookie_url = url
|
||||
elif crawlerRunConfig.base_url and crawlerRunConfig.base_url.startswith(("http://", "https://")):
|
||||
# Use base_url as fallback for raw:/file:// URLs
|
||||
cookie_url = crawlerRunConfig.base_url
|
||||
|
||||
if cookie_url:
|
||||
await context.add_cookies(
|
||||
[
|
||||
{
|
||||
"name": "cookiesEnabled",
|
||||
"value": "true",
|
||||
"url": cookie_url,
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
# Handle navigator overrides
|
||||
if crawlerRunConfig:
|
||||
|
||||
172
tests/test_raw_html_browser.py
Normal file
172
tests/test_raw_html_browser.py
Normal file
@@ -0,0 +1,172 @@
|
||||
"""
|
||||
Tests for raw:/file:// URL browser pipeline support.
|
||||
|
||||
Tests the new feature that allows js_code, wait_for, and other browser operations
|
||||
to work with raw: and file:// URLs by routing them through _crawl_web() with
|
||||
set_content() instead of goto().
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_fast_path():
    """A raw: URL with no browser options must come back verbatim (fast path)."""
    source = "<html><body><div id='test'>Original Content</div></body></html>"

    async with AsyncWebCrawler() as crawler:
        outcome = await crawler.arun(f"raw:{source}", config=CrawlerRunConfig())

        assert outcome.success
        assert "Original Content" in outcome.html
        # No browser round-trip happened, so the payload must be untouched.
        assert outcome.html == source
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_js_code_on_raw_html():
    """Test that js_code executes on raw: HTML and modifies the DOM.

    The script replaces the #test div's innerText, so the rendered HTML
    must contain the new text and must no longer contain the old text.
    """
    html = "<html><body><div id='test'>Original</div></body></html>"

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.getElementById('test').innerText = 'Modified by JS'"
        )
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        assert "Modified by JS" in result.html
        # BUG FIX: the previous assertion
        #   assert "Original" not in result.html or "Modified by JS" in result.html
        # was a tautology — its second disjunct was already asserted on the
        # line above, so the check could never fail. innerText assignment
        # replaces the old text node, so assert its absence directly.
        assert "Original" not in result.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_js_code_adds_element_to_raw_html():
    """js_code must be able to inject brand-new elements into raw: HTML."""
    source = "<html><body><div id='container'></div></body></html>"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code='document.getElementById("container").innerHTML = "<span id=\'injected\'>Custom Content</span>"'
        )
        outcome = await crawler.arun(f"raw:{source}", config=cfg)

        assert outcome.success
        assert "injected" in outcome.html
        assert "Custom Content" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_screenshot_on_raw_html():
    """Requesting a screenshot must route raw: HTML through the browser."""
    page_html = "<html><body><h1 style='color:red;font-size:48px;'>Screenshot Test</h1></body></html>"

    async with AsyncWebCrawler() as crawler:
        outcome = await crawler.arun(
            f"raw:{page_html}", config=CrawlerRunConfig(screenshot=True)
        )

        assert outcome.success
        assert outcome.screenshot is not None
        # A real capture is far larger than an empty/stub payload.
        assert len(outcome.screenshot) > 100
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_process_in_browser_flag():
    """process_in_browser=True alone must be enough to take the browser path."""
    source = "<html><body><div>Test</div></body></html>"

    async with AsyncWebCrawler() as crawler:
        outcome = await crawler.arun(
            f"raw:{source}", config=CrawlerRunConfig(process_in_browser=True)
        )

        assert outcome.success
        # The browser serializes the DOM back, so whitespace/markup may
        # differ from the input; only the text content is guaranteed.
        assert "Test" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_prefix_variations():
    """Both the raw: and raw:// prefixes must reach the browser pipeline."""
    source = "<html><body>Content</body></html>"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code='document.body.innerHTML += "<div id=\'added\'>Added</div>"'
        )

        # Short prefix form: raw:
        first = await crawler.arun(f"raw:{source}", config=cfg)
        assert first.success
        assert "Added" in first.html

        # Long prefix form: raw://
        second = await crawler.arun(f"raw://{source}", config=cfg)
        assert second.success
        assert "Added" in second.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_wait_for_on_raw_html():
    """wait_for must block until js_code's delayed DOM update appears."""
    source = "<html><body><div id='container'></div></body></html>"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code='''
            setTimeout(() => {
                document.getElementById('container').innerHTML = '<div id="delayed">Delayed Content</div>';
            }, 100);
            ''',
            wait_for="#delayed",
            wait_for_timeout=5000,
        )
        outcome = await crawler.arun(f"raw:{source}", config=cfg)

        assert outcome.success
        assert "Delayed Content" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multiple_js_code_scripts():
    """A list of js_code snippets must run sequentially, in order."""
    source = "<html><body><div id='counter'>0</div></body></html>"

    scripts = [
        "document.getElementById('counter').innerText = '1'",
        "document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1",
        "document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1",
    ]

    async with AsyncWebCrawler() as crawler:
        outcome = await crawler.arun(
            f"raw:{source}", config=CrawlerRunConfig(js_code=scripts)
        )

        assert outcome.success
        # Set-to-1 followed by two increments leaves the counter at 3.
        assert ">3<" in outcome.html
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Quick interactive smoke test: exercises the fast path and the
    # js_code browser path without going through pytest.
    async def _manual_smoke_test():
        sample = "<html><body><div id='test'>Original</div></body></html>"

        async with AsyncWebCrawler(verbose=True) as crawler:
            # Fast path: no browser-requiring parameters.
            print("\n=== Test 1: Fast path (no browser params) ===")
            fast = await crawler.arun(f"raw:{sample}")
            print(f"Success: {fast.success}")
            print(f"HTML contains 'Original': {'Original' in fast.html}")

            # Browser path: js_code forces the full pipeline.
            print("\n=== Test 2: js_code modifies DOM ===")
            cfg = CrawlerRunConfig(
                js_code="document.getElementById('test').innerText = 'Modified by JS'"
            )
            modified = await crawler.arun(f"raw:{sample}", config=cfg)
            print(f"Success: {modified.success}")
            print(f"HTML contains 'Modified by JS': {'Modified by JS' in modified.html}")
            print(f"HTML snippet: {modified.html[:500]}...")

    asyncio.run(_manual_smoke_test())
|
||||
563
tests/test_raw_html_edge_cases.py
Normal file
563
tests/test_raw_html_edge_cases.py
Normal file
@@ -0,0 +1,563 @@
|
||||
"""
|
||||
BRUTAL edge case tests for raw:/file:// URL browser pipeline.
|
||||
|
||||
These tests try to break the system with tricky inputs, edge cases,
|
||||
and compatibility checks to ensure we didn't break existing functionality.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import asyncio
|
||||
import tempfile
|
||||
import os
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# EDGE CASE: Hash characters in HTML (previously broke urlparse - Issue #283)
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_with_hash_in_css():
    """Test that # in CSS colors doesn't break HTML parsing (regression for #283)."""
    doc = """
    <html>
    <head>
        <style>
            body { background-color: #ff5733; color: #333333; }
            .highlight { border: 1px solid #000; }
        </style>
    </head>
    <body>
        <div class="highlight" style="color: #ffffff;">Content with hash colors</div>
    </body>
    </html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.body.innerHTML += '<div id=\"added\">Added</div>'"
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        assert outcome.success
        # The hash-colored style must survive the round-trip ...
        assert "#ff5733" in outcome.html or "ff5733" in outcome.html
        # ... the injected element proves js_code ran ...
        assert "Added" in outcome.html
        # ... and the original body content is intact.
        assert "Content with hash colors" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_with_fragment_links():
    """Anchor hrefs containing # fragments must survive the browser round-trip."""
    doc = """
    <html><body>
        <a href="#section1">Go to section 1</a>
        <a href="#section2">Go to section 2</a>
        <div id="section1">Section 1</div>
        <div id="section2">Section 2</div>
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('section1').innerText = 'Modified Section 1'"
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        assert outcome.success
        assert "Modified Section 1" in outcome.html
        assert "#section2" in outcome.html  # fragment link preserved
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# EDGE CASE: Special characters and unicode
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_with_unicode():
    """Unicode text (CJK, RTL, emoji) must survive the raw: browser pipeline."""
    doc = """
    <html><body>
        <div id="unicode">日本語 中文 한국어 العربية 🎉 💻 🚀</div>
        <div id="special">& < > " '</div>
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('unicode').innerText += ' ✅ Modified'"
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        assert outcome.success
        assert "✅ Modified" in outcome.html or "Modified" in outcome.html
        # Unicode may come back literal or entity-encoded depending on how
        # the browser serializes the DOM — accept either.
        assert "日本語" in outcome.html or "&#" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_with_script_tags():
    """Inline <script> runs at load time; injected js_code runs afterwards."""
    doc = """
    <html><body>
        <div id="counter">0</div>
        <script>
            // This script runs on page load
            document.getElementById('counter').innerText = '10';
        </script>
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        # The injected js_code executes AFTER the page's own scripts.
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 5"
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        assert outcome.success
        # Page script sets the counter to 10, then our js_code adds 5.
        assert ">15<" in outcome.html or "15" in outcome.html
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# EDGE CASE: Empty and malformed HTML
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_empty():
    """js_code must still run when the raw: payload is an empty string."""
    payload = ""

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.body.innerHTML = '<div>Added to empty</div>'"
        )
        outcome = await crawler.arun(f"raw:{payload}", config=cfg)

        assert outcome.success
        assert "Added to empty" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_minimal():
    """Bare text with no tags should still be loadable and scriptable."""
    payload = "Just plain text, no HTML tags"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.body.innerHTML += '<div id=\"injected\">Injected</div>'"
        )
        outcome = await crawler.arun(f"raw:{payload}", config=cfg)

        assert outcome.success
        # The browser wraps bare text in a proper document, so the injected
        # element must be present in the serialized result.
        assert "Injected" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_malformed():
    """Unclosed tags must not break loading or script execution."""
    payload = "<html><body><div><span>Unclosed tags<div>More content"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.body.innerHTML += '<div id=\"valid\">Valid Added</div>'"
        )
        outcome = await crawler.arun(f"raw:{payload}", config=cfg)

        assert outcome.success
        assert "Valid Added" in outcome.html
        # The browser's parser repairs the malformed markup on load.
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# EDGE CASE: Very large HTML
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_large():
    """A 100KB+ raw: document must round-trip through the browser intact."""
    rows = "".join(
        f'<div class="item" id="item-{i}">Item {i} content here with some text</div>\n'
        for i in range(2000)
    )
    doc = f"<html><body>{rows}</body></html>"

    assert len(doc) > 100000  # sanity: the fixture really is large

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('item-999').innerText = 'MODIFIED ITEM 999'"
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        assert outcome.success
        assert "MODIFIED ITEM 999" in outcome.html
        assert "item-1999" in outcome.html  # tail of the document still present
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# EDGE CASE: JavaScript errors and timeouts
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_js_error_doesnt_crash():
    """A throwing js_code snippet must not fail the whole crawl."""
    doc = "<html><body><div id='test'>Original</div></body></html>"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code=[
                "nonExistentFunction();",  # deliberately throws
                "document.getElementById('test').innerText = 'Still works'",  # should still run
            ]
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        # The crawl must survive JS errors in injected code.
        assert outcome.success
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_wait_for_timeout():
    """wait_for on a selector that never appears must time out, not hang."""
    doc = "<html><body><div id='test'>Original</div></body></html>"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            wait_for="#never-exists",
            wait_for_timeout=1000,  # 1 second timeout
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        # Implementations may report success=False or partial content here;
        # the contract under test is only "no hang, no crash".
        assert outcome is not None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# COMPATIBILITY: Normal HTTP URLs still work
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_http_urls_still_work():
    """Regression guard: plain HTTP(S) crawling must be unaffected."""
    async with AsyncWebCrawler() as crawler:
        outcome = await crawler.arun("https://example.com")

        assert outcome.success
        assert "Example Domain" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_http_with_js_code_still_works():
    """Regression guard: js_code on HTTP(S) URLs must be unaffected."""
    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.body.innerHTML += '<div id=\"injected\">Injected via JS</div>'"
        )
        outcome = await crawler.arun("https://example.com", config=cfg)

        assert outcome.success
        assert "Injected via JS" in outcome.html
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# COMPATIBILITY: File URLs
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_file_url_with_js_code():
    """js_code must execute against content loaded from a file:// URL."""
    # Write a throwaway HTML file to serve via file://.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as fh:
        fh.write("<html><body><div id='file-content'>File Content</div></body></html>")
        local_path = fh.name

    try:
        async with AsyncWebCrawler() as crawler:
            cfg = CrawlerRunConfig(
                js_code="document.getElementById('file-content').innerText = 'Modified File Content'"
            )
            outcome = await crawler.arun(f"file://{local_path}", config=cfg)

            assert outcome.success
            assert "Modified File Content" in outcome.html
    finally:
        os.unlink(local_path)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_file_url_fast_path():
    """file:// without browser params should read the file directly."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as fh:
        fh.write("<html><body>Fast path file content</body></html>")
        local_path = fh.name

    try:
        async with AsyncWebCrawler() as crawler:
            outcome = await crawler.arun(f"file://{local_path}")

            assert outcome.success
            assert "Fast path file content" in outcome.html
    finally:
        os.unlink(local_path)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# COMPATIBILITY: Extraction strategies with raw HTML
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_with_css_extraction():
    """Test CSS extraction on raw HTML after js_code modifies it.

    Fix: the comment promised that "both products" are found, but only the
    JS-added one was asserted. The original product is now verified as well,
    so a regression that drops pre-existing DOM content during browser
    processing is actually caught.
    """
    import json
    from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

    html = """
    <html><body>
        <div class="products">
            <div class="product"><span class="name">Original Product</span></div>
        </div>
    </body></html>
    """

    schema = {
        "name": "Products",
        "baseSelector": ".product",
        "fields": [
            {"name": "name", "selector": ".name", "type": "text"}
        ]
    }

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="""
                document.querySelector('.products').innerHTML +=
                    '<div class="product"><span class="name">JS Added Product</span></div>';
            """,
            extraction_strategy=JsonCssExtractionStrategy(schema)
        )
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        # Check that extraction found both products: the one present in the
        # raw input and the one appended by js_code.
        extracted = json.loads(result.extracted_content)
        names = [p.get('name', '') for p in extracted]
        assert any("Original Product" in name for name in names)
        assert any("JS Added Product" in name for name in names)
|
||||
# ============================================================================
# EDGE CASE: Concurrent raw: requests
# ============================================================================
||||
@pytest.mark.asyncio
async def test_concurrent_raw_requests():
    """Test multiple concurrent raw: requests don't interfere."""
    pages = [
        f"<html><body><div id='test'>Request {i}</div></body></html>"
        for i in range(5)
    ]

    async with AsyncWebCrawler() as crawler:
        run_configs = [
            CrawlerRunConfig(
                js_code=f"document.getElementById('test').innerText += ' Modified {i}'"
            )
            for i in range(5)
        ]

        # Fire every request at once; each result must reflect only its own
        # input HTML and its own js_code mutation.
        pending = [
            crawler.arun(f"raw:{page}", config=cfg)
            for page, cfg in zip(pages, run_configs)
        ]
        outcomes = await asyncio.gather(*pending)

        for idx, outcome in enumerate(outcomes):
            assert outcome.success
            assert f"Request {idx}" in outcome.html
            assert f"Modified {idx}" in outcome.html
|
||||
# ============================================================================
# EDGE CASE: raw: with base_url for link resolution
# ============================================================================
||||
@pytest.mark.asyncio
async def test_raw_html_with_base_url():
    """Test that base_url is used for link resolution in markdown.

    Fix: the previous assertion accepted the relative form "/page1" as a
    pass, which made the test vacuous — relative hrefs appear in the markdown
    even when base_url resolution does nothing. Require the absolute host so
    the test actually exercises link resolution.
    """
    html = """
    <html><body>
        <a href="/page1">Page 1</a>
        <a href="/page2">Page 2</a>
        <img src="/images/logo.png" alt="Logo">
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            base_url="https://example.com",
            process_in_browser=True  # Force browser to test base_url handling
        )
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        # Check markdown has absolute URLs
        if result.markdown:
            md = result.markdown.raw_markdown if hasattr(result.markdown, 'raw_markdown') else str(result.markdown)
            # Relative links must have been resolved against base_url.
            assert "https://example.com" in md
|
||||
# ============================================================================
# EDGE CASE: raw: with screenshot of complex page
# ============================================================================
||||
@pytest.mark.asyncio
async def test_raw_html_screenshot_complex_page():
    """Test screenshot of complex raw HTML with CSS and JS modifications."""
    markup = """
    <html>
    <head>
        <style>
            body { font-family: Arial; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 40px; }
            .card { background: white; padding: 20px; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1); }
            h1 { color: #333; }
        </style>
    </head>
    <body>
        <div class="card">
            <h1 id="title">Original Title</h1>
            <p>This is a test card with styling.</p>
        </div>
    </body>
    </html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('title').innerText = 'Modified Title'",
            screenshot=True
        )
        outcome = await crawler.arun(f"raw:{markup}", config=cfg)

        assert outcome.success
        assert outcome.screenshot is not None
        # A rendered, styled page should yield a non-trivial image payload.
        assert len(outcome.screenshot) > 1000
        assert "Modified Title" in outcome.html
|
||||
# ============================================================================
# EDGE CASE: JavaScript that tries to navigate away
# ============================================================================
||||
@pytest.mark.asyncio
async def test_raw_html_js_navigation_blocked():
    """Test that JS trying to navigate doesn't break the crawl."""
    markup = """
    <html><body>
        <div id="content">Original Content</div>
        <script>
            // Try to navigate away (should be blocked or handled)
            // window.location.href = 'https://example.com';
        </script>
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            # The real navigation attempt stays commented out — it would
            # genuinely redirect the page and invalidate the crawl.
            js_code=[
                "document.getElementById('content').innerText = 'Before navigation attempt'",
                # "window.location.href = 'https://example.com'",
            ]
        )
        outcome = await crawler.arun(f"raw:{markup}", config=cfg)

        assert outcome.success
        assert "Before navigation attempt" in outcome.html
|
||||
# ============================================================================
# EDGE CASE: Raw HTML with iframes
# ============================================================================
||||
@pytest.mark.asyncio
async def test_raw_html_with_iframes():
    """Test raw HTML containing iframes."""
    markup = """
    <html><body>
        <div id="main">Main content</div>
        <iframe id="frame1" srcdoc="<html><body><div id='iframe-content'>Iframe Content</div></body></html>"></iframe>
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('main').innerText = 'Modified main'",
            process_iframes=True
        )
        outcome = await crawler.arun(f"raw:{markup}", config=cfg)

        # The main-document mutation must land even when iframe processing
        # is enabled alongside it.
        assert outcome.success
        assert "Modified main" in outcome.html
|
||||
# ============================================================================
# TRICKY: Protocol inside raw content
# ============================================================================
||||
@pytest.mark.asyncio
async def test_raw_html_with_urls_inside():
    """Test raw: with http:// URLs inside the content."""
    markup = """
    <html><body>
        <a href="http://example.com">Example</a>
        <a href="https://google.com">Google</a>
        <img src="https://placekitten.com/200/300" alt="Cat">
        <div id="test">Test content with URL: https://test.com</div>
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('test').innerText += ' - Modified'"
        )
        outcome = await crawler.arun(f"raw:{markup}", config=cfg)

        assert outcome.success
        assert "Modified" in outcome.html
        # URL-like text inside the content must not confuse raw: parsing.
        assert "http://example.com" in outcome.html or "example.com" in outcome.html
|
||||
# ============================================================================
# TRICKY: Double raw: prefix
# ============================================================================
||||
@pytest.mark.asyncio
async def test_double_raw_prefix():
    """Test what happens with double raw: prefix (edge case)."""
    body = "<html><body>Content</body></html>"

    async with AsyncWebCrawler() as crawler:
        # Only the first "raw:" is a scheme; the second should be treated as
        # literal content. Graceful handling or passthrough are both fine.
        outcome = await crawler.arun(f"raw:raw:{body}")

        assert outcome is not None
||||
|
||||
if __name__ == "__main__":
    # Fix: removed the unused `import sys` — nothing in this block uses it.

    async def run_tests():
        """Run a representative subset of the tests directly (no pytest)."""
        tests = [
            ("Hash in CSS", test_raw_html_with_hash_in_css),
            ("Unicode", test_raw_html_with_unicode),
            ("Large HTML", test_raw_html_large),
            ("HTTP still works", test_http_urls_still_work),
            ("Concurrent requests", test_concurrent_raw_requests),
            ("Complex screenshot", test_raw_html_screenshot_complex_page),
        ]

        for name, test_fn in tests:
            print(f"\n=== Running: {name} ===")
            try:
                await test_fn()
                print(f"✅ {name} PASSED")
            except Exception as e:
                # Report and continue so one failure doesn't hide the rest.
                print(f"❌ {name} FAILED: {e}")
                import traceback
                traceback.print_exc()

    asyncio.run(run_tests())
|
||||
Reference in New Issue
Block a user