Add browser pipeline support for raw:/file:// URLs
- Add `process_in_browser` parameter to `CrawlerRunConfig`.
- Route raw:/file:// URLs through `_crawl_web()` when browser operations are needed.
- Use `page.set_content()` instead of `goto()` for local content.
- Fix cookie handling for non-HTTP URLs in `browser_manager`.
- Auto-detect browser requirements: `js_code`, `wait_for`, `screenshot`, etc.
- Maintain the fast path for raw:/file:// URLs without browser params.

Fixes #310
This commit is contained in:
@@ -1188,6 +1188,12 @@ class CrawlerRunConfig():
|
||||
# Connection Parameters
|
||||
stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many.
|
||||
Default: False.
|
||||
process_in_browser (bool): If True, forces raw:/file:// URLs to be processed through the browser
|
||||
pipeline (enabling js_code, wait_for, scrolling, etc.). When False (default),
|
||||
raw:/file:// URLs use a fast path that returns HTML directly without browser
|
||||
interaction. This is automatically enabled when browser-requiring parameters
|
||||
are detected (js_code, wait_for, screenshot, pdf, etc.).
|
||||
Default: False.
|
||||
|
||||
check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
|
||||
Default: False.
|
||||
@@ -1308,6 +1314,7 @@ class CrawlerRunConfig():
|
||||
method: str = "GET",
|
||||
stream: bool = False,
|
||||
prefetch: bool = False, # When True, return only HTML + links (skip heavy processing)
|
||||
process_in_browser: bool = False, # Force browser processing for raw:/file:// URLs
|
||||
url: str = None,
|
||||
base_url: str = None, # Base URL for markdown link resolution (used with raw: HTML)
|
||||
check_robots_txt: bool = False,
|
||||
@@ -1445,6 +1452,7 @@ class CrawlerRunConfig():
|
||||
# Connection Parameters
|
||||
self.stream = stream
|
||||
self.prefetch = prefetch # Prefetch mode: return only HTML + links
|
||||
self.process_in_browser = process_in_browser # Force browser processing for raw:/file:// URLs
|
||||
self.method = method
|
||||
|
||||
# Robots.txt Handling Parameters
|
||||
@@ -1722,6 +1730,7 @@ class CrawlerRunConfig():
|
||||
method=kwargs.get("method", "GET"),
|
||||
stream=kwargs.get("stream", False),
|
||||
prefetch=kwargs.get("prefetch", False),
|
||||
process_in_browser=kwargs.get("process_in_browser", False),
|
||||
check_robots_txt=kwargs.get("check_robots_txt", False),
|
||||
user_agent=kwargs.get("user_agent"),
|
||||
user_agent_mode=kwargs.get("user_agent_mode"),
|
||||
@@ -1831,6 +1840,7 @@ class CrawlerRunConfig():
|
||||
"method": self.method,
|
||||
"stream": self.stream,
|
||||
"prefetch": self.prefetch,
|
||||
"process_in_browser": self.process_in_browser,
|
||||
"check_robots_txt": self.check_robots_txt,
|
||||
"user_agent": self.user_agent,
|
||||
"user_agent_mode": self.user_agent_mode,
|
||||
|
||||
@@ -452,66 +452,48 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if url.startswith(("http://", "https://", "view-source:")):
|
||||
return await self._crawl_web(url, config)
|
||||
|
||||
elif url.startswith("file://"):
|
||||
# initialize empty lists for console messages
|
||||
captured_console = []
|
||||
pdf_data = None
|
||||
mhtml_data = None
|
||||
|
||||
# Process local file
|
||||
local_file_path = url[7:] # Remove 'file://' prefix
|
||||
if not os.path.exists(local_file_path):
|
||||
raise FileNotFoundError(f"Local file not found: {local_file_path}")
|
||||
with open(local_file_path, "r", encoding="utf-8") as f:
|
||||
html = f.read()
|
||||
|
||||
# Handle media generation - all require loading HTML into browser
|
||||
if config.screenshot or config.pdf or config.capture_mhtml:
|
||||
screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
|
||||
html, config
|
||||
)
|
||||
|
||||
if config.capture_console_messages:
|
||||
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
|
||||
captured_console = await self._capture_console_messages(page, url)
|
||||
|
||||
return AsyncCrawlResponse(
|
||||
html=html,
|
||||
response_headers=response_headers,
|
||||
status_code=status_code,
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
mhtml_data=mhtml_data,
|
||||
get_delayed_content=None,
|
||||
console_messages=captured_console,
|
||||
elif url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:"):
|
||||
# Check if browser processing is required for file:// or raw: URLs
|
||||
needs_browser = (
|
||||
config.process_in_browser or
|
||||
config.screenshot or
|
||||
config.pdf or
|
||||
config.capture_mhtml or
|
||||
config.js_code or
|
||||
config.wait_for or
|
||||
config.scan_full_page or
|
||||
config.remove_overlay_elements or
|
||||
config.simulate_user or
|
||||
config.magic or
|
||||
config.process_iframes or
|
||||
config.capture_console_messages or
|
||||
config.capture_network_requests
|
||||
)
|
||||
|
||||
#####
|
||||
# Since both "raw:" and "raw://" start with "raw:", the first condition is always true for both, so "raw://" will be sliced as "//...", which is incorrect.
|
||||
# Fix: Check for "raw://" first, then "raw:"
|
||||
# Also, the prefix "raw://" is actually 6 characters long, not 7, so it should be sliced accordingly: url[6:]
|
||||
#####
|
||||
elif url.startswith("raw://") or url.startswith("raw:"):
|
||||
# Process raw HTML content
|
||||
# raw_html = url[4:] if url[:4] == "raw:" else url[7:]
|
||||
raw_html = url[6:] if url.startswith("raw://") else url[4:]
|
||||
html = raw_html
|
||||
pdf_data = None
|
||||
mhtml_data = None
|
||||
if needs_browser:
|
||||
# Route through _crawl_web() for full browser pipeline
|
||||
# _crawl_web() will detect file:// and raw: URLs and use set_content()
|
||||
return await self._crawl_web(url, config)
|
||||
|
||||
# Handle media generation - all require loading HTML into browser
|
||||
if config.screenshot or config.pdf or config.capture_mhtml:
|
||||
screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html(
|
||||
html, config
|
||||
)
|
||||
# Fast path: return HTML directly without browser interaction
|
||||
if url.startswith("file://"):
|
||||
# Process local file
|
||||
local_file_path = url[7:] # Remove 'file://' prefix
|
||||
if not os.path.exists(local_file_path):
|
||||
raise FileNotFoundError(f"Local file not found: {local_file_path}")
|
||||
with open(local_file_path, "r", encoding="utf-8") as f:
|
||||
html = f.read()
|
||||
else:
|
||||
# Process raw HTML content (raw:// or raw:)
|
||||
html = url[6:] if url.startswith("raw://") else url[4:]
|
||||
|
||||
return AsyncCrawlResponse(
|
||||
html=html,
|
||||
response_headers=response_headers,
|
||||
status_code=status_code,
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
mhtml_data=mhtml_data,
|
||||
screenshot=None,
|
||||
pdf_data=None,
|
||||
mhtml_data=None,
|
||||
get_delayed_content=None,
|
||||
)
|
||||
else:
|
||||
@@ -684,67 +666,83 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if not config.js_only:
|
||||
await self.execute_hook("before_goto", page, context=context, url=url, config=config)
|
||||
|
||||
try:
|
||||
# Generate a unique nonce for this request
|
||||
if config.experimental.get("use_csp_nonce", False):
|
||||
nonce = hashlib.sha256(os.urandom(32)).hexdigest()
|
||||
# Check if this is a file:// or raw: URL that needs set_content() instead of goto()
|
||||
is_local_content = url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:")
|
||||
|
||||
# Add CSP headers to the request
|
||||
await page.set_extra_http_headers(
|
||||
{
|
||||
"Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
|
||||
}
|
||||
)
|
||||
|
||||
response = await page.goto(
|
||||
url, wait_until=config.wait_until, timeout=config.page_timeout
|
||||
)
|
||||
redirected_url = page.url
|
||||
except Error as e:
|
||||
# Allow navigation to be aborted when downloading files
|
||||
# This is expected behavior for downloads in some browser engines
|
||||
if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
|
||||
self.logger.info(
|
||||
message=f"Navigation aborted, likely due to file download: {url}",
|
||||
tag="GOTO",
|
||||
params={"url": url},
|
||||
)
|
||||
response = None
|
||||
if is_local_content:
|
||||
# Load local content using set_content() instead of network navigation
|
||||
if url.startswith("file://"):
|
||||
local_file_path = url[7:] # Remove 'file://' prefix
|
||||
if not os.path.exists(local_file_path):
|
||||
raise FileNotFoundError(f"Local file not found: {local_file_path}")
|
||||
with open(local_file_path, "r", encoding="utf-8") as f:
|
||||
html_content = f.read()
|
||||
else:
|
||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||
# raw:// or raw:
|
||||
html_content = url[6:] if url.startswith("raw://") else url[4:]
|
||||
|
||||
await page.set_content(html_content, wait_until=config.wait_until)
|
||||
response = None
|
||||
redirected_url = config.base_url or url
|
||||
status_code = 200
|
||||
response_headers = {}
|
||||
else:
|
||||
# Standard web navigation with goto()
|
||||
try:
|
||||
# Generate a unique nonce for this request
|
||||
if config.experimental.get("use_csp_nonce", False):
|
||||
nonce = hashlib.sha256(os.urandom(32)).hexdigest()
|
||||
|
||||
# Add CSP headers to the request
|
||||
await page.set_extra_http_headers(
|
||||
{
|
||||
"Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
|
||||
}
|
||||
)
|
||||
|
||||
response = await page.goto(
|
||||
url, wait_until=config.wait_until, timeout=config.page_timeout
|
||||
)
|
||||
redirected_url = page.url
|
||||
except Error as e:
|
||||
# Allow navigation to be aborted when downloading files
|
||||
# This is expected behavior for downloads in some browser engines
|
||||
if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
|
||||
self.logger.info(
|
||||
message=f"Navigation aborted, likely due to file download: {url}",
|
||||
tag="GOTO",
|
||||
params={"url": url},
|
||||
)
|
||||
response = None
|
||||
else:
|
||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# Walk the redirect chain. Playwright returns only the last
|
||||
# hop, so we trace the `request.redirected_from` links until the
|
||||
# first response that differs from the final one and surface its
|
||||
# status-code.
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
if response is None:
|
||||
status_code = 200
|
||||
response_headers = {}
|
||||
else:
|
||||
first_resp = response
|
||||
req = response.request
|
||||
while req and req.redirected_from:
|
||||
prev_req = req.redirected_from
|
||||
prev_resp = await prev_req.response()
|
||||
if prev_resp: # keep earliest
|
||||
first_resp = prev_resp
|
||||
req = prev_req
|
||||
|
||||
status_code = first_resp.status
|
||||
response_headers = first_resp.headers
|
||||
|
||||
await self.execute_hook(
|
||||
"after_goto", page, context=context, url=url, response=response, config=config
|
||||
)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# Walk the redirect chain. Playwright returns only the last
|
||||
# hop, so we trace the `request.redirected_from` links until the
|
||||
# first response that differs from the final one and surface its
|
||||
# status-code.
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
if response is None:
|
||||
status_code = 200
|
||||
response_headers = {}
|
||||
else:
|
||||
first_resp = response
|
||||
req = response.request
|
||||
while req and req.redirected_from:
|
||||
prev_req = req.redirected_from
|
||||
prev_resp = await prev_req.response()
|
||||
if prev_resp: # keep earliest
|
||||
first_resp = prev_resp
|
||||
req = prev_req
|
||||
|
||||
status_code = first_resp.status
|
||||
response_headers = first_resp.headers
|
||||
# if response is None:
|
||||
# status_code = 200
|
||||
# response_headers = {}
|
||||
# else:
|
||||
# status_code = response.status
|
||||
# response_headers = response.headers
|
||||
|
||||
else:
|
||||
status_code = 200
|
||||
response_headers = {}
|
||||
|
||||
@@ -895,18 +895,27 @@ class BrowserManager:
|
||||
combined_headers.update(self.config.headers)
|
||||
await context.set_extra_http_headers(combined_headers)
|
||||
|
||||
# Add default cookie
|
||||
await context.add_cookies(
|
||||
[
|
||||
{
|
||||
"name": "cookiesEnabled",
|
||||
"value": "true",
|
||||
"url": crawlerRunConfig.url
|
||||
if crawlerRunConfig and crawlerRunConfig.url
|
||||
else "https://crawl4ai.com/",
|
||||
}
|
||||
]
|
||||
)
|
||||
# Add default cookie (skip for raw:/file:// URLs which are not valid cookie URLs)
|
||||
cookie_url = None
|
||||
if crawlerRunConfig and crawlerRunConfig.url:
|
||||
url = crawlerRunConfig.url
|
||||
# Only set cookie for http/https URLs
|
||||
if url.startswith(("http://", "https://")):
|
||||
cookie_url = url
|
||||
elif crawlerRunConfig.base_url and crawlerRunConfig.base_url.startswith(("http://", "https://")):
|
||||
# Use base_url as fallback for raw:/file:// URLs
|
||||
cookie_url = crawlerRunConfig.base_url
|
||||
|
||||
if cookie_url:
|
||||
await context.add_cookies(
|
||||
[
|
||||
{
|
||||
"name": "cookiesEnabled",
|
||||
"value": "true",
|
||||
"url": cookie_url,
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
# Handle navigator overrides
|
||||
if crawlerRunConfig:
|
||||
|
||||
172
tests/test_raw_html_browser.py
Normal file
172
tests/test_raw_html_browser.py
Normal file
@@ -0,0 +1,172 @@
|
||||
"""
|
||||
Tests for raw:/file:// URL browser pipeline support.
|
||||
|
||||
Tests the new feature that allows js_code, wait_for, and other browser operations
|
||||
to work with raw: and file:// URLs by routing them through _crawl_web() with
|
||||
set_content() instead of goto().
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_fast_path():
    """A raw: URL with no browser options must come back verbatim (fast path)."""
    source = "<html><body><div id='test'>Original Content</div></body></html>"

    async with AsyncWebCrawler() as crawler:
        outcome = await crawler.arun(f"raw:{source}", config=CrawlerRunConfig())

        assert outcome.success
        assert "Original Content" in outcome.html
        # No browser round-trip happened, so the payload must be untouched.
        assert outcome.html == source
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_js_code_on_raw_html():
    """Test that js_code executes on raw: HTML and modifies the DOM.

    The script replaces the #test div's innerText, so the rendered HTML
    must contain the new text and must no longer contain the old text.
    """
    html = "<html><body><div id='test'>Original</div></body></html>"

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="document.getElementById('test').innerText = 'Modified by JS'"
        )
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        assert "Modified by JS" in result.html
        # BUG FIX: the previous assertion
        #   assert "Original" not in result.html or "Modified by JS" in result.html
        # was a tautology — its second disjunct was already asserted on the
        # line above, so the check could never fail. innerText assignment
        # replaces the old text node, so assert its absence directly.
        assert "Original" not in result.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_js_code_adds_element_to_raw_html():
    """js_code must be able to inject brand-new elements into raw: HTML."""
    source = "<html><body><div id='container'></div></body></html>"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code='document.getElementById("container").innerHTML = "<span id=\'injected\'>Custom Content</span>"'
        )
        outcome = await crawler.arun(f"raw:{source}", config=cfg)

        assert outcome.success
        assert "injected" in outcome.html
        assert "Custom Content" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_screenshot_on_raw_html():
    """Requesting a screenshot must route raw: HTML through the browser."""
    page_html = "<html><body><h1 style='color:red;font-size:48px;'>Screenshot Test</h1></body></html>"

    async with AsyncWebCrawler() as crawler:
        outcome = await crawler.arun(
            f"raw:{page_html}", config=CrawlerRunConfig(screenshot=True)
        )

        assert outcome.success
        assert outcome.screenshot is not None
        # A real capture is far larger than an empty/stub payload.
        assert len(outcome.screenshot) > 100
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_process_in_browser_flag():
    """process_in_browser=True alone must be enough to take the browser path."""
    source = "<html><body><div>Test</div></body></html>"

    async with AsyncWebCrawler() as crawler:
        outcome = await crawler.arun(
            f"raw:{source}", config=CrawlerRunConfig(process_in_browser=True)
        )

        assert outcome.success
        # The browser serializes the DOM back, so whitespace/markup may
        # differ from the input; only the text content is guaranteed.
        assert "Test" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_prefix_variations():
    """Both the raw: and raw:// prefixes must reach the browser pipeline."""
    source = "<html><body>Content</body></html>"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code='document.body.innerHTML += "<div id=\'added\'>Added</div>"'
        )

        # Short prefix form: raw:
        first = await crawler.arun(f"raw:{source}", config=cfg)
        assert first.success
        assert "Added" in first.html

        # Long prefix form: raw://
        second = await crawler.arun(f"raw://{source}", config=cfg)
        assert second.success
        assert "Added" in second.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_wait_for_on_raw_html():
    """wait_for must block until js_code's delayed DOM update appears."""
    source = "<html><body><div id='container'></div></body></html>"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code='''
            setTimeout(() => {
                document.getElementById('container').innerHTML = '<div id="delayed">Delayed Content</div>';
            }, 100);
            ''',
            wait_for="#delayed",
            wait_for_timeout=5000,
        )
        outcome = await crawler.arun(f"raw:{source}", config=cfg)

        assert outcome.success
        assert "Delayed Content" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multiple_js_code_scripts():
    """A list of js_code snippets must run sequentially, in order."""
    source = "<html><body><div id='counter'>0</div></body></html>"

    scripts = [
        "document.getElementById('counter').innerText = '1'",
        "document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1",
        "document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1",
    ]

    async with AsyncWebCrawler() as crawler:
        outcome = await crawler.arun(
            f"raw:{source}", config=CrawlerRunConfig(js_code=scripts)
        )

        assert outcome.success
        # Set-to-1 followed by two increments leaves the counter at 3.
        assert ">3<" in outcome.html
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Quick interactive smoke test: exercises the fast path and the
    # js_code browser path without going through pytest.
    async def _manual_smoke_test():
        sample = "<html><body><div id='test'>Original</div></body></html>"

        async with AsyncWebCrawler(verbose=True) as crawler:
            # Fast path: no browser-requiring parameters.
            print("\n=== Test 1: Fast path (no browser params) ===")
            fast = await crawler.arun(f"raw:{sample}")
            print(f"Success: {fast.success}")
            print(f"HTML contains 'Original': {'Original' in fast.html}")

            # Browser path: js_code forces the full pipeline.
            print("\n=== Test 2: js_code modifies DOM ===")
            cfg = CrawlerRunConfig(
                js_code="document.getElementById('test').innerText = 'Modified by JS'"
            )
            modified = await crawler.arun(f"raw:{sample}", config=cfg)
            print(f"Success: {modified.success}")
            print(f"HTML contains 'Modified by JS': {'Modified by JS' in modified.html}")
            print(f"HTML snippet: {modified.html[:500]}...")

    asyncio.run(_manual_smoke_test())
|
||||
563
tests/test_raw_html_edge_cases.py
Normal file
563
tests/test_raw_html_edge_cases.py
Normal file
@@ -0,0 +1,563 @@
|
||||
"""
|
||||
BRUTAL edge case tests for raw:/file:// URL browser pipeline.
|
||||
|
||||
These tests try to break the system with tricky inputs, edge cases,
|
||||
and compatibility checks to ensure we didn't break existing functionality.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import asyncio
|
||||
import tempfile
|
||||
import os
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# EDGE CASE: Hash characters in HTML (previously broke urlparse - Issue #283)
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_with_hash_in_css():
    """Test that # in CSS colors doesn't break HTML parsing (regression for #283)."""
    doc = """
    <html>
    <head>
        <style>
            body { background-color: #ff5733; color: #333333; }
            .highlight { border: 1px solid #000; }
        </style>
    </head>
    <body>
        <div class="highlight" style="color: #ffffff;">Content with hash colors</div>
    </body>
    </html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.body.innerHTML += '<div id=\"added\">Added</div>'"
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        assert outcome.success
        # The hash-colored style must survive the round-trip ...
        assert "#ff5733" in outcome.html or "ff5733" in outcome.html
        # ... the injected element proves js_code ran ...
        assert "Added" in outcome.html
        # ... and the original body content is intact.
        assert "Content with hash colors" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_with_fragment_links():
    """Anchor hrefs containing # fragments must survive the browser round-trip."""
    doc = """
    <html><body>
        <a href="#section1">Go to section 1</a>
        <a href="#section2">Go to section 2</a>
        <div id="section1">Section 1</div>
        <div id="section2">Section 2</div>
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('section1').innerText = 'Modified Section 1'"
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        assert outcome.success
        assert "Modified Section 1" in outcome.html
        assert "#section2" in outcome.html  # fragment link preserved
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# EDGE CASE: Special characters and unicode
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_with_unicode():
    """Unicode text (CJK, RTL, emoji) must survive the raw: browser pipeline."""
    doc = """
    <html><body>
        <div id="unicode">日本語 中文 한국어 العربية 🎉 💻 🚀</div>
        <div id="special">& < > " '</div>
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('unicode').innerText += ' ✅ Modified'"
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        assert outcome.success
        assert "✅ Modified" in outcome.html or "Modified" in outcome.html
        # Unicode may come back literal or entity-encoded depending on how
        # the browser serializes the DOM — accept either.
        assert "日本語" in outcome.html or "&#" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_with_script_tags():
    """Inline <script> runs at load time; injected js_code runs afterwards."""
    doc = """
    <html><body>
        <div id="counter">0</div>
        <script>
            // This script runs on page load
            document.getElementById('counter').innerText = '10';
        </script>
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        # The injected js_code executes AFTER the page's own scripts.
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 5"
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        assert outcome.success
        # Page script sets the counter to 10, then our js_code adds 5.
        assert ">15<" in outcome.html or "15" in outcome.html
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# EDGE CASE: Empty and malformed HTML
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_empty():
    """js_code must still run when the raw: payload is an empty string."""
    payload = ""

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.body.innerHTML = '<div>Added to empty</div>'"
        )
        outcome = await crawler.arun(f"raw:{payload}", config=cfg)

        assert outcome.success
        assert "Added to empty" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_minimal():
    """Bare text with no tags should still be loadable and scriptable."""
    payload = "Just plain text, no HTML tags"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.body.innerHTML += '<div id=\"injected\">Injected</div>'"
        )
        outcome = await crawler.arun(f"raw:{payload}", config=cfg)

        assert outcome.success
        # The browser wraps bare text in a proper document, so the injected
        # element must be present in the serialized result.
        assert "Injected" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_malformed():
    """Unclosed tags must not break loading or script execution."""
    payload = "<html><body><div><span>Unclosed tags<div>More content"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.body.innerHTML += '<div id=\"valid\">Valid Added</div>'"
        )
        outcome = await crawler.arun(f"raw:{payload}", config=cfg)

        assert outcome.success
        assert "Valid Added" in outcome.html
        # The browser's parser repairs the malformed markup on load.
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# EDGE CASE: Very large HTML
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_large():
    """A 100KB+ raw: document must round-trip through the browser intact."""
    rows = "".join(
        f'<div class="item" id="item-{i}">Item {i} content here with some text</div>\n'
        for i in range(2000)
    )
    doc = f"<html><body>{rows}</body></html>"

    assert len(doc) > 100000  # sanity: the fixture really is large

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('item-999').innerText = 'MODIFIED ITEM 999'"
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        assert outcome.success
        assert "MODIFIED ITEM 999" in outcome.html
        assert "item-1999" in outcome.html  # tail of the document still present
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# EDGE CASE: JavaScript errors and timeouts
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_js_error_doesnt_crash():
    """A throwing js_code snippet must not fail the whole crawl."""
    doc = "<html><body><div id='test'>Original</div></body></html>"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code=[
                "nonExistentFunction();",  # deliberately throws
                "document.getElementById('test').innerText = 'Still works'",  # should still run
            ]
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        # The crawl must survive JS errors in injected code.
        assert outcome.success
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_wait_for_timeout():
    """wait_for on a selector that never appears must time out, not hang."""
    doc = "<html><body><div id='test'>Original</div></body></html>"

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            wait_for="#never-exists",
            wait_for_timeout=1000,  # 1 second timeout
        )
        outcome = await crawler.arun(f"raw:{doc}", config=cfg)

        # Implementations may report success=False or partial content here;
        # the contract under test is only "no hang, no crash".
        assert outcome is not None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# COMPATIBILITY: Normal HTTP URLs still work
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_http_urls_still_work():
    """Regression guard: plain HTTP(S) crawling must be unaffected."""
    async with AsyncWebCrawler() as crawler:
        outcome = await crawler.arun("https://example.com")

        assert outcome.success
        assert "Example Domain" in outcome.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_http_with_js_code_still_works():
    """Regression guard: js_code on HTTP(S) URLs must be unaffected."""
    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.body.innerHTML += '<div id=\"injected\">Injected via JS</div>'"
        )
        outcome = await crawler.arun("https://example.com", config=cfg)

        assert outcome.success
        assert "Injected via JS" in outcome.html
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# COMPATIBILITY: File URLs
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_file_url_with_js_code():
    """js_code must execute against content loaded from a file:// URL."""
    # Write a throwaway HTML file to serve via file://.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as fh:
        fh.write("<html><body><div id='file-content'>File Content</div></body></html>")
        local_path = fh.name

    try:
        async with AsyncWebCrawler() as crawler:
            cfg = CrawlerRunConfig(
                js_code="document.getElementById('file-content').innerText = 'Modified File Content'"
            )
            outcome = await crawler.arun(f"file://{local_path}", config=cfg)

            assert outcome.success
            assert "Modified File Content" in outcome.html
    finally:
        os.unlink(local_path)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_file_url_fast_path():
    """file:// without browser params should read the file directly."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as fh:
        fh.write("<html><body>Fast path file content</body></html>")
        local_path = fh.name

    try:
        async with AsyncWebCrawler() as crawler:
            outcome = await crawler.arun(f"file://{local_path}")

            assert outcome.success
            assert "Fast path file content" in outcome.html
    finally:
        os.unlink(local_path)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# COMPATIBILITY: Extraction strategies with raw HTML
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.asyncio
async def test_raw_html_with_css_extraction():
    """Test CSS extraction on raw HTML after js_code modifies it.

    Fix: the comment promised that "both products" are found, but only the
    JS-added one was asserted. The original product is now verified as well,
    so a regression that drops pre-existing DOM content during browser
    processing is actually caught.
    """
    import json
    from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

    html = """
    <html><body>
        <div class="products">
            <div class="product"><span class="name">Original Product</span></div>
        </div>
    </body></html>
    """

    schema = {
        "name": "Products",
        "baseSelector": ".product",
        "fields": [
            {"name": "name", "selector": ".name", "type": "text"}
        ]
    }

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            js_code="""
                document.querySelector('.products').innerHTML +=
                    '<div class="product"><span class="name">JS Added Product</span></div>';
            """,
            extraction_strategy=JsonCssExtractionStrategy(schema)
        )
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        # Check that extraction found both products: the one present in the
        # raw input and the one appended by js_code.
        extracted = json.loads(result.extracted_content)
        names = [p.get('name', '') for p in extracted]
        assert any("Original Product" in name for name in names)
        assert any("JS Added Product" in name for name in names)
|
||||
# ============================================================================
# EDGE CASE: Concurrent raw: requests
# ============================================================================
||||
@pytest.mark.asyncio
async def test_concurrent_raw_requests():
    """Test multiple concurrent raw: requests don't interfere."""
    pages = [
        f"<html><body><div id='test'>Request {i}</div></body></html>"
        for i in range(5)
    ]

    async with AsyncWebCrawler() as crawler:
        run_configs = [
            CrawlerRunConfig(
                js_code=f"document.getElementById('test').innerText += ' Modified {i}'"
            )
            for i in range(5)
        ]

        # Fire every request at once; each result must reflect only its own
        # input HTML and its own js_code mutation.
        pending = [
            crawler.arun(f"raw:{page}", config=cfg)
            for page, cfg in zip(pages, run_configs)
        ]
        outcomes = await asyncio.gather(*pending)

        for idx, outcome in enumerate(outcomes):
            assert outcome.success
            assert f"Request {idx}" in outcome.html
            assert f"Modified {idx}" in outcome.html
|
||||
# ============================================================================
# EDGE CASE: raw: with base_url for link resolution
# ============================================================================
||||
@pytest.mark.asyncio
async def test_raw_html_with_base_url():
    """Test that base_url is used for link resolution in markdown.

    Fix: the previous assertion accepted the relative form "/page1" as a
    pass, which made the test vacuous — relative hrefs appear in the markdown
    even when base_url resolution does nothing. Require the absolute host so
    the test actually exercises link resolution.
    """
    html = """
    <html><body>
        <a href="/page1">Page 1</a>
        <a href="/page2">Page 2</a>
        <img src="/images/logo.png" alt="Logo">
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            base_url="https://example.com",
            process_in_browser=True  # Force browser to test base_url handling
        )
        result = await crawler.arun(f"raw:{html}", config=config)

        assert result.success
        # Check markdown has absolute URLs
        if result.markdown:
            md = result.markdown.raw_markdown if hasattr(result.markdown, 'raw_markdown') else str(result.markdown)
            # Relative links must have been resolved against base_url.
            assert "https://example.com" in md
|
||||
# ============================================================================
# EDGE CASE: raw: with screenshot of complex page
# ============================================================================
||||
@pytest.mark.asyncio
async def test_raw_html_screenshot_complex_page():
    """Test screenshot of complex raw HTML with CSS and JS modifications."""
    markup = """
    <html>
    <head>
        <style>
            body { font-family: Arial; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 40px; }
            .card { background: white; padding: 20px; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1); }
            h1 { color: #333; }
        </style>
    </head>
    <body>
        <div class="card">
            <h1 id="title">Original Title</h1>
            <p>This is a test card with styling.</p>
        </div>
    </body>
    </html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('title').innerText = 'Modified Title'",
            screenshot=True
        )
        outcome = await crawler.arun(f"raw:{markup}", config=cfg)

        assert outcome.success
        assert outcome.screenshot is not None
        # A rendered, styled page should yield a non-trivial image payload.
        assert len(outcome.screenshot) > 1000
        assert "Modified Title" in outcome.html
|
||||
# ============================================================================
# EDGE CASE: JavaScript that tries to navigate away
# ============================================================================
||||
@pytest.mark.asyncio
async def test_raw_html_js_navigation_blocked():
    """Test that JS trying to navigate doesn't break the crawl."""
    markup = """
    <html><body>
        <div id="content">Original Content</div>
        <script>
            // Try to navigate away (should be blocked or handled)
            // window.location.href = 'https://example.com';
        </script>
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            # The real navigation attempt stays commented out — it would
            # genuinely redirect the page and invalidate the crawl.
            js_code=[
                "document.getElementById('content').innerText = 'Before navigation attempt'",
                # "window.location.href = 'https://example.com'",
            ]
        )
        outcome = await crawler.arun(f"raw:{markup}", config=cfg)

        assert outcome.success
        assert "Before navigation attempt" in outcome.html
|
||||
# ============================================================================
# EDGE CASE: Raw HTML with iframes
# ============================================================================
||||
@pytest.mark.asyncio
async def test_raw_html_with_iframes():
    """Test raw HTML containing iframes."""
    markup = """
    <html><body>
        <div id="main">Main content</div>
        <iframe id="frame1" srcdoc="<html><body><div id='iframe-content'>Iframe Content</div></body></html>"></iframe>
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('main').innerText = 'Modified main'",
            process_iframes=True
        )
        outcome = await crawler.arun(f"raw:{markup}", config=cfg)

        # The main-document mutation must land even when iframe processing
        # is enabled alongside it.
        assert outcome.success
        assert "Modified main" in outcome.html
|
||||
# ============================================================================
# TRICKY: Protocol inside raw content
# ============================================================================
||||
@pytest.mark.asyncio
async def test_raw_html_with_urls_inside():
    """Test raw: with http:// URLs inside the content."""
    markup = """
    <html><body>
        <a href="http://example.com">Example</a>
        <a href="https://google.com">Google</a>
        <img src="https://placekitten.com/200/300" alt="Cat">
        <div id="test">Test content with URL: https://test.com</div>
    </body></html>
    """

    async with AsyncWebCrawler() as crawler:
        cfg = CrawlerRunConfig(
            js_code="document.getElementById('test').innerText += ' - Modified'"
        )
        outcome = await crawler.arun(f"raw:{markup}", config=cfg)

        assert outcome.success
        assert "Modified" in outcome.html
        # URL-like text inside the content must not confuse raw: parsing.
        assert "http://example.com" in outcome.html or "example.com" in outcome.html
|
||||
# ============================================================================
# TRICKY: Double raw: prefix
# ============================================================================
||||
@pytest.mark.asyncio
async def test_double_raw_prefix():
    """Test what happens with double raw: prefix (edge case)."""
    body = "<html><body>Content</body></html>"

    async with AsyncWebCrawler() as crawler:
        # Only the first "raw:" is a scheme; the second should be treated as
        # literal content. Graceful handling or passthrough are both fine.
        outcome = await crawler.arun(f"raw:raw:{body}")

        assert outcome is not None
||||
|
||||
if __name__ == "__main__":
    # Fix: removed the unused `import sys` — nothing in this block uses it.

    async def run_tests():
        """Run a representative subset of the tests directly (no pytest)."""
        tests = [
            ("Hash in CSS", test_raw_html_with_hash_in_css),
            ("Unicode", test_raw_html_with_unicode),
            ("Large HTML", test_raw_html_large),
            ("HTTP still works", test_http_urls_still_work),
            ("Concurrent requests", test_concurrent_raw_requests),
            ("Complex screenshot", test_raw_html_screenshot_complex_page),
        ]

        for name, test_fn in tests:
            print(f"\n=== Running: {name} ===")
            try:
                await test_fn()
                print(f"✅ {name} PASSED")
            except Exception as e:
                # Report and continue so one failure doesn't hide the rest.
                print(f"❌ {name} FAILED: {e}")
                import traceback
                traceback.print_exc()

    asyncio.run(run_tests())
|
||||
Reference in New Issue
Block a user