diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 664ad16b..b287b544 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1188,6 +1188,12 @@ class CrawlerRunConfig():
# Connection Parameters
stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many.
Default: False.
+ process_in_browser (bool): If True, forces raw:/file:// URLs to be processed through the browser
+ pipeline (enabling js_code, wait_for, scrolling, etc.). When False (default),
+ raw:/file:// URLs use a fast path that returns HTML directly without browser
+ interaction. This is automatically enabled when browser-requiring parameters
+ are detected (js_code, wait_for, screenshot, pdf, etc.).
+ Default: False.
check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
Default: False.
@@ -1308,6 +1314,7 @@ class CrawlerRunConfig():
method: str = "GET",
stream: bool = False,
prefetch: bool = False, # When True, return only HTML + links (skip heavy processing)
+ process_in_browser: bool = False, # Force browser processing for raw:/file:// URLs
url: str = None,
base_url: str = None, # Base URL for markdown link resolution (used with raw: HTML)
check_robots_txt: bool = False,
@@ -1445,6 +1452,7 @@ class CrawlerRunConfig():
# Connection Parameters
self.stream = stream
self.prefetch = prefetch # Prefetch mode: return only HTML + links
+ self.process_in_browser = process_in_browser # Force browser processing for raw:/file:// URLs
self.method = method
# Robots.txt Handling Parameters
@@ -1722,6 +1730,7 @@ class CrawlerRunConfig():
method=kwargs.get("method", "GET"),
stream=kwargs.get("stream", False),
prefetch=kwargs.get("prefetch", False),
+ process_in_browser=kwargs.get("process_in_browser", False),
check_robots_txt=kwargs.get("check_robots_txt", False),
user_agent=kwargs.get("user_agent"),
user_agent_mode=kwargs.get("user_agent_mode"),
@@ -1831,6 +1840,7 @@ class CrawlerRunConfig():
"method": self.method,
"stream": self.stream,
"prefetch": self.prefetch,
+ "process_in_browser": self.process_in_browser,
"check_robots_txt": self.check_robots_txt,
"user_agent": self.user_agent,
"user_agent_mode": self.user_agent_mode,
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 3506d9c4..121a3861 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -452,66 +452,48 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if url.startswith(("http://", "https://", "view-source:")):
return await self._crawl_web(url, config)
- elif url.startswith("file://"):
- # initialize empty lists for console messages
- captured_console = []
- pdf_data = None
- mhtml_data = None
-
- # Process local file
- local_file_path = url[7:] # Remove 'file://' prefix
- if not os.path.exists(local_file_path):
- raise FileNotFoundError(f"Local file not found: {local_file_path}")
- with open(local_file_path, "r", encoding="utf-8") as f:
- html = f.read()
-
- # Handle media generation - all require loading HTML into browser
- if config.screenshot or config.pdf or config.capture_mhtml:
- screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
- html, config
- )
-
- if config.capture_console_messages:
- page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
- captured_console = await self._capture_console_messages(page, url)
-
- return AsyncCrawlResponse(
- html=html,
- response_headers=response_headers,
- status_code=status_code,
- screenshot=screenshot_data,
- pdf_data=pdf_data,
- mhtml_data=mhtml_data,
- get_delayed_content=None,
- console_messages=captured_console,
+ elif url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:"):
+ # Check if browser processing is required for file:// or raw: URLs
+ needs_browser = (
+ config.process_in_browser or
+ config.screenshot or
+ config.pdf or
+ config.capture_mhtml or
+ config.js_code or
+ config.wait_for or
+ config.scan_full_page or
+ config.remove_overlay_elements or
+ config.simulate_user or
+ config.magic or
+ config.process_iframes or
+ config.capture_console_messages or
+ config.capture_network_requests
)
- #####
- # Since both "raw:" and "raw://" start with "raw:", the first condition is always true for both, so "raw://" will be sliced as "//...", which is incorrect.
- # Fix: Check for "raw://" first, then "raw:"
- # Also, the prefix "raw://" is actually 6 characters long, not 7, so it should be sliced accordingly: url[6:]
- #####
- elif url.startswith("raw://") or url.startswith("raw:"):
- # Process raw HTML content
- # raw_html = url[4:] if url[:4] == "raw:" else url[7:]
- raw_html = url[6:] if url.startswith("raw://") else url[4:]
- html = raw_html
- pdf_data = None
- mhtml_data = None
+ if needs_browser:
+ # Route through _crawl_web() for full browser pipeline
+ # _crawl_web() will detect file:// and raw: URLs and use set_content()
+ return await self._crawl_web(url, config)
- # Handle media generation - all require loading HTML into browser
- if config.screenshot or config.pdf or config.capture_mhtml:
- screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
- html, config
- )
+ # Fast path: return HTML directly without browser interaction
+ if url.startswith("file://"):
+ # Process local file
+ local_file_path = url[7:] # Remove 'file://' prefix
+ if not os.path.exists(local_file_path):
+ raise FileNotFoundError(f"Local file not found: {local_file_path}")
+ with open(local_file_path, "r", encoding="utf-8") as f:
+ html = f.read()
+ else:
+ # Process raw HTML content (raw:// or raw:)
+ html = url[6:] if url.startswith("raw://") else url[4:]
return AsyncCrawlResponse(
html=html,
response_headers=response_headers,
status_code=status_code,
- screenshot=screenshot_data,
- pdf_data=pdf_data,
- mhtml_data=mhtml_data,
+ screenshot=None,
+ pdf_data=None,
+ mhtml_data=None,
get_delayed_content=None,
)
else:
@@ -684,67 +666,83 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if not config.js_only:
await self.execute_hook("before_goto", page, context=context, url=url, config=config)
- try:
- # Generate a unique nonce for this request
- if config.experimental.get("use_csp_nonce", False):
- nonce = hashlib.sha256(os.urandom(32)).hexdigest()
+ # Check if this is a file:// or raw: URL that needs set_content() instead of goto()
+ is_local_content = url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:")
- # Add CSP headers to the request
- await page.set_extra_http_headers(
- {
- "Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
- }
- )
-
- response = await page.goto(
- url, wait_until=config.wait_until, timeout=config.page_timeout
- )
- redirected_url = page.url
- except Error as e:
- # Allow navigation to be aborted when downloading files
- # This is expected behavior for downloads in some browser engines
- if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
- self.logger.info(
- message=f"Navigation aborted, likely due to file download: {url}",
- tag="GOTO",
- params={"url": url},
- )
- response = None
+ if is_local_content:
+ # Load local content using set_content() instead of network navigation
+ if url.startswith("file://"):
+ local_file_path = url[7:] # Remove 'file://' prefix
+ if not os.path.exists(local_file_path):
+ raise FileNotFoundError(f"Local file not found: {local_file_path}")
+ with open(local_file_path, "r", encoding="utf-8") as f:
+ html_content = f.read()
else:
- raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
+ # raw:// or raw:
+ html_content = url[6:] if url.startswith("raw://") else url[4:]
+
+ await page.set_content(html_content, wait_until=config.wait_until)
+ response = None
+ redirected_url = config.base_url or url
+ status_code = 200
+ response_headers = {}
+ else:
+ # Standard web navigation with goto()
+ try:
+ # Generate a unique nonce for this request
+ if config.experimental.get("use_csp_nonce", False):
+ nonce = hashlib.sha256(os.urandom(32)).hexdigest()
+
+ # Add CSP headers to the request
+ await page.set_extra_http_headers(
+ {
+ "Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
+ }
+ )
+
+ response = await page.goto(
+ url, wait_until=config.wait_until, timeout=config.page_timeout
+ )
+ redirected_url = page.url
+ except Error as e:
+ # Allow navigation to be aborted when downloading files
+ # This is expected behavior for downloads in some browser engines
+ if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
+ self.logger.info(
+ message=f"Navigation aborted, likely due to file download: {url}",
+ tag="GOTO",
+ params={"url": url},
+ )
+ response = None
+ else:
+ raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
+
+ # ──────────────────────────────────────────────────────────────
+ # Walk the redirect chain. Playwright returns only the last
+ # hop, so we trace the `request.redirected_from` links until the
+ # first response that differs from the final one and surface its
+ # status-code.
+ # ──────────────────────────────────────────────────────────────
+ if response is None:
+ status_code = 200
+ response_headers = {}
+ else:
+ first_resp = response
+ req = response.request
+ while req and req.redirected_from:
+ prev_req = req.redirected_from
+ prev_resp = await prev_req.response()
+ if prev_resp: # keep earliest
+ first_resp = prev_resp
+ req = prev_req
+
+ status_code = first_resp.status
+ response_headers = first_resp.headers
await self.execute_hook(
"after_goto", page, context=context, url=url, response=response, config=config
)
- # ──────────────────────────────────────────────────────────────
- # Walk the redirect chain. Playwright returns only the last
- # hop, so we trace the `request.redirected_from` links until the
- # first response that differs from the final one and surface its
- # status-code.
- # ──────────────────────────────────────────────────────────────
- if response is None:
- status_code = 200
- response_headers = {}
- else:
- first_resp = response
- req = response.request
- while req and req.redirected_from:
- prev_req = req.redirected_from
- prev_resp = await prev_req.response()
- if prev_resp: # keep earliest
- first_resp = prev_resp
- req = prev_req
-
- status_code = first_resp.status
- response_headers = first_resp.headers
- # if response is None:
- # status_code = 200
- # response_headers = {}
- # else:
- # status_code = response.status
- # response_headers = response.headers
-
else:
status_code = 200
response_headers = {}
diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py
index 309f6b0f..fedc974f 100644
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -895,18 +895,27 @@ class BrowserManager:
combined_headers.update(self.config.headers)
await context.set_extra_http_headers(combined_headers)
- # Add default cookie
- await context.add_cookies(
- [
- {
- "name": "cookiesEnabled",
- "value": "true",
- "url": crawlerRunConfig.url
- if crawlerRunConfig and crawlerRunConfig.url
- else "https://crawl4ai.com/",
- }
- ]
- )
+ # Add default cookie (skip for raw:/file:// URLs which are not valid cookie URLs)
+ cookie_url = None
+ if crawlerRunConfig and crawlerRunConfig.url:
+ url = crawlerRunConfig.url
+ # Only set cookie for http/https URLs
+ if url.startswith(("http://", "https://")):
+ cookie_url = url
+ elif crawlerRunConfig.base_url and crawlerRunConfig.base_url.startswith(("http://", "https://")):
+ # Use base_url as fallback for raw:/file:// URLs
+ cookie_url = crawlerRunConfig.base_url
+
+ if cookie_url:
+ await context.add_cookies(
+ [
+ {
+ "name": "cookiesEnabled",
+ "value": "true",
+ "url": cookie_url,
+ }
+ ]
+ )
# Handle navigator overrides
if crawlerRunConfig:
diff --git a/tests/test_raw_html_browser.py b/tests/test_raw_html_browser.py
new file mode 100644
index 00000000..0f8648ad
--- /dev/null
+++ b/tests/test_raw_html_browser.py
@@ -0,0 +1,172 @@
+"""
+Tests for raw:/file:// URL browser pipeline support.
+
+Tests the new feature that allows js_code, wait_for, and other browser operations
+to work with raw: and file:// URLs by routing them through _crawl_web() with
+set_content() instead of goto().
+"""
+
+import pytest
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+
+@pytest.mark.asyncio
+async def test_raw_html_fast_path():
+ """Test that raw: without browser params returns HTML directly (fast path)."""
+ html = "
Original Content
"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig() # No browser params
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "Original Content" in result.html
+ # Fast path should not modify the HTML
+ assert result.html == html
+
+
+@pytest.mark.asyncio
+async def test_js_code_on_raw_html():
+ """Test that js_code executes on raw: HTML and modifies the DOM."""
+ html = "Original
"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ js_code="document.getElementById('test').innerText = 'Modified by JS'"
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "Modified by JS" in result.html
+ assert "Original" not in result.html or "Modified by JS" in result.html
+
+
+@pytest.mark.asyncio
+async def test_js_code_adds_element_to_raw_html():
+ """Test that js_code can add new elements to raw: HTML."""
+ html = ""
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+            js_code='document.getElementById("container").innerHTML = "<div class=injected>Custom Content</div>"'
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "injected" in result.html
+ assert "Custom Content" in result.html
+
+
+@pytest.mark.asyncio
+async def test_screenshot_on_raw_html():
+ """Test that screenshots work on raw: HTML."""
+ html = "Screenshot Test
"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(screenshot=True)
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert result.screenshot is not None
+ assert len(result.screenshot) > 100 # Should have substantial screenshot data
+
+
+@pytest.mark.asyncio
+async def test_process_in_browser_flag():
+ """Test that process_in_browser=True forces browser path even without other params."""
+ html = "Test
"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(process_in_browser=True)
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ # Browser path normalizes HTML, so it may be slightly different
+ assert "Test" in result.html
+
+
+@pytest.mark.asyncio
+async def test_raw_prefix_variations():
+ """Test both raw: and raw:// prefix formats."""
+ html = "Content"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+            js_code='document.body.innerHTML += "<div>Added</div>"'
+ )
+
+ # Test raw: prefix
+ result1 = await crawler.arun(f"raw:{html}", config=config)
+ assert result1.success
+ assert "Added" in result1.html
+
+ # Test raw:// prefix
+ result2 = await crawler.arun(f"raw://{html}", config=config)
+ assert result2.success
+ assert "Added" in result2.html
+
+
+@pytest.mark.asyncio
+async def test_wait_for_on_raw_html():
+ """Test that wait_for works with raw: HTML after js_code modifies DOM."""
+ html = ""
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ js_code='''
+ setTimeout(() => {
+                document.getElementById('container').innerHTML = '<div id="delayed">Delayed Content</div>';
+ }, 100);
+ ''',
+ wait_for="#delayed",
+ wait_for_timeout=5000
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "Delayed Content" in result.html
+
+
+@pytest.mark.asyncio
+async def test_multiple_js_code_scripts():
+ """Test that multiple js_code scripts execute in order."""
+ html = "0
"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ js_code=[
+ "document.getElementById('counter').innerText = '1'",
+ "document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1",
+ "document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1",
+ ]
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert ">3<" in result.html # Counter should be 3 after all scripts run
+
+
+if __name__ == "__main__":
+ # Run a quick manual test
+ async def quick_test():
+ html = "Original
"
+
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ # Test 1: Fast path
+ print("\n=== Test 1: Fast path (no browser params) ===")
+ result1 = await crawler.arun(f"raw:{html}")
+ print(f"Success: {result1.success}")
+ print(f"HTML contains 'Original': {'Original' in result1.html}")
+
+ # Test 2: js_code modifies DOM
+ print("\n=== Test 2: js_code modifies DOM ===")
+ config = CrawlerRunConfig(
+ js_code="document.getElementById('test').innerText = 'Modified by JS'"
+ )
+ result2 = await crawler.arun(f"raw:{html}", config=config)
+ print(f"Success: {result2.success}")
+ print(f"HTML contains 'Modified by JS': {'Modified by JS' in result2.html}")
+ print(f"HTML snippet: {result2.html[:500]}...")
+
+ asyncio.run(quick_test())
diff --git a/tests/test_raw_html_edge_cases.py b/tests/test_raw_html_edge_cases.py
new file mode 100644
index 00000000..8517d19e
--- /dev/null
+++ b/tests/test_raw_html_edge_cases.py
@@ -0,0 +1,563 @@
+"""
+BRUTAL edge case tests for raw:/file:// URL browser pipeline.
+
+These tests try to break the system with tricky inputs, edge cases,
+and compatibility checks to ensure we didn't break existing functionality.
+"""
+
+import pytest
+import asyncio
+import tempfile
+import os
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+
+# ============================================================================
+# EDGE CASE: Hash characters in HTML (previously broke urlparse - Issue #283)
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_with_hash_in_css():
+ """Test that # in CSS colors doesn't break HTML parsing (regression for #283)."""
+ html = """
+    <html><head>
+    <style>
+        .colored { color: #ff5733; }
+        .bg { background-color: #33ff57; }
+    </style>
+    </head><body>
+    <div class="colored bg">Content with hash colors</div>
+    </body></html>
+ """
+
+ async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig(js_code="document.body.innerHTML += '<div>Added</div>'")
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "#ff5733" in result.html or "ff5733" in result.html # Color should be preserved
+ assert "Added" in result.html # JS executed
+ assert "Content with hash colors" in result.html # Original content preserved
+
+
+@pytest.mark.asyncio
+async def test_raw_html_with_fragment_links():
+ """Test HTML with # fragment links doesn't break."""
+ html = """
+    <body>
+    <a href="#section1">Go to section 1</a>
+    <a href="#section2">Go to section 2</a>
+    <h2 id="section1">Section 1</h2>
+    <h2 id="section2">Section 2</h2>
+    </body>
+ """
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(js_code="document.getElementById('section1').innerText = 'Modified Section 1'")
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "Modified Section 1" in result.html
+ assert "#section2" in result.html # Fragment link preserved
+
+
+# ============================================================================
+# EDGE CASE: Special characters and unicode
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_with_unicode():
+ """Test raw HTML with various unicode characters."""
+ html = """
+
+ 日本語 中文 한국어 العربية 🎉 💻 🚀
+ & < > " '
+
+ """
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(js_code="document.getElementById('unicode').innerText += ' ✅ Modified'")
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "✅ Modified" in result.html or "Modified" in result.html
+ # Check unicode is preserved
+ assert "日本語" in result.html or "" in result.html # Either preserved or encoded
+
+
+@pytest.mark.asyncio
+async def test_raw_html_with_script_tags():
+ """Test raw HTML with existing script tags doesn't interfere with js_code."""
+ html = """
+    <div id="counter">0</div>
+    <script>
+        document.getElementById('counter').innerText = '10';
+    </script>
+ """
+
+ async with AsyncWebCrawler() as crawler:
+ # Our js_code runs AFTER the page scripts
+ config = CrawlerRunConfig(
+ js_code="document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 5"
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ # The embedded script sets it to 10, then our js_code adds 5
+ assert ">15<" in result.html or "15" in result.html
+
+
+# ============================================================================
+# EDGE CASE: Empty and malformed HTML
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_empty():
+ """Test empty raw HTML."""
+ html = ""
+
+ async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig(js_code="document.body.innerHTML = '<div>Added to empty</div>'")
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "Added to empty" in result.html
+
+
+@pytest.mark.asyncio
+async def test_raw_html_minimal():
+ """Test minimal HTML (just text, no tags)."""
+ html = "Just plain text, no HTML tags"
+
+ async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig(js_code="document.body.innerHTML += '<div>Injected</div>'")
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ # Browser should wrap it in proper HTML
+ assert "Injected" in result.html
+
+
+@pytest.mark.asyncio
+async def test_raw_html_malformed():
+ """Test malformed HTML with unclosed tags."""
+ html = "Unclosed tagsMore content"
+
+ async with AsyncWebCrawler() as crawler:
+        config = CrawlerRunConfig(js_code="document.body.innerHTML += '<div>Valid Added</div>'")
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "Valid Added" in result.html
+ # Browser should have fixed the malformed HTML
+
+
+# ============================================================================
+# EDGE CASE: Very large HTML
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_large():
+ """Test large raw HTML (100KB+)."""
+ # Generate 100KB of HTML
+ items = "".join([f'
Item {i} content here with some text
\n' for i in range(2000)])
+ html = f"{items}"
+
+ assert len(html) > 100000 # Verify it's actually large
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ js_code="document.getElementById('item-999').innerText = 'MODIFIED ITEM 999'"
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "MODIFIED ITEM 999" in result.html
+ assert "item-1999" in result.html # Last item should still exist
+
+
+# ============================================================================
+# EDGE CASE: JavaScript errors and timeouts
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_js_error_doesnt_crash():
+ """Test that JavaScript errors in js_code don't crash the crawl."""
+ html = "
Original
"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ js_code=[
+ "nonExistentFunction();", # This will throw an error
+ "document.getElementById('test').innerText = 'Still works'" # This should still run
+ ]
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ # Crawl should succeed even with JS errors
+ assert result.success
+
+
+@pytest.mark.asyncio
+async def test_raw_html_wait_for_timeout():
+ """Test wait_for with element that never appears times out gracefully."""
+ html = "
Original
"
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ wait_for="#never-exists",
+ wait_for_timeout=1000 # 1 second timeout
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ # Should timeout but still return the HTML we have
+ # The behavior might be success=False or success=True with partial content
+ # Either way, it shouldn't hang or crash
+ assert result is not None
+
+
+# ============================================================================
+# COMPATIBILITY: Normal HTTP URLs still work
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_http_urls_still_work():
+ """Ensure we didn't break normal HTTP URL crawling."""
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://example.com")
+
+ assert result.success
+ assert "Example Domain" in result.html
+
+
+@pytest.mark.asyncio
+async def test_http_with_js_code_still_works():
+ """Ensure HTTP URLs with js_code still work."""
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ js_code="document.body.innerHTML += '
Injected via JS
'"
+ )
+ result = await crawler.arun("https://example.com", config=config)
+
+ assert result.success
+ assert "Injected via JS" in result.html
+
+
+# ============================================================================
+# COMPATIBILITY: File URLs
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_file_url_with_js_code():
+ """Test file:// URLs with js_code execution."""
+ # Create a temp file
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
+ f.write("
File Content
")
+ temp_path = f.name
+
+ try:
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ js_code="document.getElementById('file-content').innerText = 'Modified File Content'"
+ )
+ result = await crawler.arun(f"file://{temp_path}", config=config)
+
+ assert result.success
+ assert "Modified File Content" in result.html
+ finally:
+ os.unlink(temp_path)
+
+
+@pytest.mark.asyncio
+async def test_file_url_fast_path():
+ """Test file:// fast path (no browser params)."""
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
+ f.write("Fast path file content")
+ temp_path = f.name
+
+ try:
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(f"file://{temp_path}")
+
+ assert result.success
+ assert "Fast path file content" in result.html
+ finally:
+ os.unlink(temp_path)
+
+
+# ============================================================================
+# COMPATIBILITY: Extraction strategies with raw HTML
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_with_css_extraction():
+ """Test CSS extraction on raw HTML after js_code modifies it."""
+ from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+ html = """
+    <div class="products">
+        <div class="product"><span class="name">Existing Product</span></div>
+    </div>
+ """
+
+ schema = {
+ "name": "Products",
+ "baseSelector": ".product",
+ "fields": [
+ {"name": "name", "selector": ".name", "type": "text"}
+ ]
+ }
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ js_code="""
+ document.querySelector('.products').innerHTML +=
+                '<div class="product"><span class="name">JS Added Product</span></div>';
+ """,
+ extraction_strategy=JsonCssExtractionStrategy(schema)
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ # Check that extraction found both products
+ import json
+ extracted = json.loads(result.extracted_content)
+ names = [p.get('name', '') for p in extracted]
+ assert any("JS Added Product" in name for name in names)
+
+
+# ============================================================================
+# EDGE CASE: Concurrent raw: requests
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_concurrent_raw_requests():
+ """Test multiple concurrent raw: requests don't interfere."""
+ htmls = [
+ f"
Request {i}
"
+ for i in range(5)
+ ]
+
+ async with AsyncWebCrawler() as crawler:
+ configs = [
+ CrawlerRunConfig(
+ js_code=f"document.getElementById('test').innerText += ' Modified {i}'"
+ )
+ for i in range(5)
+ ]
+
+ # Run concurrently
+ tasks = [
+ crawler.arun(f"raw:{html}", config=config)
+ for html, config in zip(htmls, configs)
+ ]
+ results = await asyncio.gather(*tasks)
+
+ for i, result in enumerate(results):
+ assert result.success
+ assert f"Request {i}" in result.html
+ assert f"Modified {i}" in result.html
+
+
+# ============================================================================
+# EDGE CASE: raw: with base_url for link resolution
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_with_base_url():
+ """Test that base_url is used for link resolution in markdown."""
+ html = """
+
+
Page 1
+
Page 2
+

+
+ """
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ base_url="https://example.com",
+ process_in_browser=True # Force browser to test base_url handling
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ # Check markdown has absolute URLs
+ if result.markdown:
+ # Links should be absolute
+ md = result.markdown.raw_markdown if hasattr(result.markdown, 'raw_markdown') else str(result.markdown)
+ assert "example.com" in md or "/page1" in md
+
+
+# ============================================================================
+# EDGE CASE: raw: with screenshot of complex page
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_screenshot_complex_page():
+ """Test screenshot of complex raw HTML with CSS and JS modifications."""
+ html = """
+
+
+
+
+
+
+
Original Title
+
This is a test card with styling.
+
+
+
+ """
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ js_code="document.getElementById('title').innerText = 'Modified Title'",
+ screenshot=True
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert result.screenshot is not None
+ assert len(result.screenshot) > 1000 # Should be substantial
+ assert "Modified Title" in result.html
+
+
+# ============================================================================
+# EDGE CASE: JavaScript that tries to navigate away
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_js_navigation_blocked():
+ """Test that JS trying to navigate doesn't break the crawl."""
+ html = """
+
+
Original Content
+
+
+ """
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ # Try to navigate via js_code
+ js_code=[
+ "document.getElementById('content').innerText = 'Before navigation attempt'",
+ # Actual navigation attempt commented - would cause issues
+ # "window.location.href = 'https://example.com'",
+ ]
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "Before navigation attempt" in result.html
+
+
+# ============================================================================
+# EDGE CASE: Raw HTML with iframes
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_with_iframes():
+ """Test raw HTML containing iframes."""
+ html = """
+
+
Main content
+
+
+ """
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ js_code="document.getElementById('main').innerText = 'Modified main'",
+ process_iframes=True
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "Modified main" in result.html
+
+
+# ============================================================================
+# TRICKY: Protocol inside raw content
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_raw_html_with_urls_inside():
+ """Test raw: with http:// URLs inside the content."""
+ html = """
+
+
Example
+
Google
+

+
Test content with URL: https://test.com
+
+ """
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ js_code="document.getElementById('test').innerText += ' - Modified'"
+ )
+ result = await crawler.arun(f"raw:{html}", config=config)
+
+ assert result.success
+ assert "Modified" in result.html
+ assert "http://example.com" in result.html or "example.com" in result.html
+
+
+# ============================================================================
+# TRICKY: Double raw: prefix
+# ============================================================================
+
+@pytest.mark.asyncio
+async def test_double_raw_prefix():
+ """Test what happens with double raw: prefix (edge case)."""
+ html = "Content"
+
+ async with AsyncWebCrawler() as crawler:
+ # raw:raw:... - the second raw: becomes part of content
+ result = await crawler.arun(f"raw:raw:{html}")
+
+ # Should either handle gracefully or return "raw:..." as content
+ assert result is not None
+
+
+if __name__ == "__main__":
+ import sys
+
+ async def run_tests():
+ # Run a few key tests manually
+ tests = [
+ ("Hash in CSS", test_raw_html_with_hash_in_css),
+ ("Unicode", test_raw_html_with_unicode),
+ ("Large HTML", test_raw_html_large),
+ ("HTTP still works", test_http_urls_still_work),
+ ("Concurrent requests", test_concurrent_raw_requests),
+ ("Complex screenshot", test_raw_html_screenshot_complex_page),
+ ]
+
+ for name, test_fn in tests:
+ print(f"\n=== Running: {name} ===")
+ try:
+ await test_fn()
+ print(f"✅ {name} PASSED")
+ except Exception as e:
+ print(f"❌ {name} FAILED: {e}")
+ import traceback
+ traceback.print_exc()
+
+ asyncio.run(run_tests())