From 2550f3d2d5f573df9b49ec1265c09f2fa6585871 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sat, 27 Dec 2025 12:32:42 +0000 Subject: [PATCH] Add browser pipeline support for raw:/file:// URLs - Add process_in_browser parameter to CrawlerRunConfig - Route raw:/file:// URLs through _crawl_web() when browser operations needed - Use page.set_content() instead of goto() for local content - Fix cookie handling for non-HTTP URLs in browser_manager - Auto-detect browser requirements: js_code, wait_for, screenshot, etc. - Maintain fast path for raw:/file:// without browser params Fixes #310 --- crawl4ai/async_configs.py | 10 + crawl4ai/async_crawler_strategy.py | 210 ++++++----- crawl4ai/browser_manager.py | 33 +- tests/test_raw_html_browser.py | 172 +++++++++ tests/test_raw_html_edge_cases.py | 563 +++++++++++++++++++++++++++++ 5 files changed, 870 insertions(+), 118 deletions(-) create mode 100644 tests/test_raw_html_browser.py create mode 100644 tests/test_raw_html_edge_cases.py diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 664ad16b..b287b544 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1188,6 +1188,12 @@ class CrawlerRunConfig(): # Connection Parameters stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many. Default: False. + process_in_browser (bool): If True, forces raw:/file:// URLs to be processed through the browser + pipeline (enabling js_code, wait_for, scrolling, etc.). When False (default), + raw:/file:// URLs use a fast path that returns HTML directly without browser + interaction. This is automatically enabled when browser-requiring parameters + are detected (js_code, wait_for, screenshot, pdf, etc.). + Default: False. check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False Default: False. @@ -1308,6 +1314,7 @@ class CrawlerRunConfig(): method: str = "GET", stream: bool = False, prefetch: bool = False, # When True, return only HTML + links (skip heavy processing) + process_in_browser: bool = False, # Force browser processing for raw:/file:// URLs url: str = None, base_url: str = None, # Base URL for markdown link resolution (used with raw: HTML) check_robots_txt: bool = False, @@ -1445,6 +1452,7 @@ class CrawlerRunConfig(): # Connection Parameters self.stream = stream self.prefetch = prefetch # Prefetch mode: return only HTML + links + self.process_in_browser = process_in_browser # Force browser processing for raw:/file:// URLs self.method = method # Robots.txt Handling Parameters @@ -1722,6 +1730,7 @@ class CrawlerRunConfig(): method=kwargs.get("method", "GET"), stream=kwargs.get("stream", False), prefetch=kwargs.get("prefetch", False), + process_in_browser=kwargs.get("process_in_browser", False), check_robots_txt=kwargs.get("check_robots_txt", False), user_agent=kwargs.get("user_agent"), user_agent_mode=kwargs.get("user_agent_mode"), @@ -1831,6 +1840,7 @@ class CrawlerRunConfig(): "method": self.method, "stream": self.stream, "prefetch": self.prefetch, + "process_in_browser": self.process_in_browser, "check_robots_txt": self.check_robots_txt, "user_agent": self.user_agent, "user_agent_mode": self.user_agent_mode, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3506d9c4..121a3861 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -452,66 +452,48 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if url.startswith(("http://", "https://", "view-source:")): return await self._crawl_web(url, config) - elif url.startswith("file://"): - # initialize empty lists for console messages - captured_console = [] - pdf_data = None - mhtml_data = None - - # Process local file - local_file_path = url[7:] # Remove 'file://' prefix - if not os.path.exists(local_file_path): - raise FileNotFoundError(f"Local file not found: {local_file_path}") - with open(local_file_path, "r", encoding="utf-8") as f: - html = f.read() - - # Handle media generation - all require loading HTML into browser - if config.screenshot or config.pdf or config.capture_mhtml: - screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html( - html, config - ) - - if config.capture_console_messages: - page, context = await self.browser_manager.get_page(crawlerRunConfig=config) - captured_console = await self._capture_console_messages(page, url) - - return AsyncCrawlResponse( - html=html, - response_headers=response_headers, - status_code=status_code, - screenshot=screenshot_data, - pdf_data=pdf_data, - mhtml_data=mhtml_data, - get_delayed_content=None, - console_messages=captured_console, + elif url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:"): + # Check if browser processing is required for file:// or raw: URLs + needs_browser = ( + config.process_in_browser or + config.screenshot or + config.pdf or + config.capture_mhtml or + config.js_code or + config.wait_for or + config.scan_full_page or + config.remove_overlay_elements or + config.simulate_user or + config.magic or + config.process_iframes or + config.capture_console_messages or + config.capture_network_requests ) - ##### - # Since both "raw:" and "raw://" start with "raw:", the first condition is always true for both, so "raw://" will be sliced as "//...", which is incorrect. - # Fix: Check for "raw://" first, then "raw:" - # Also, the prefix "raw://" is actually 6 characters long, not 7, so it should be sliced accordingly: url[6:] - ##### - elif url.startswith("raw://") or url.startswith("raw:"): - # Process raw HTML content - # raw_html = url[4:] if url[:4] == "raw:" else url[7:] - raw_html = url[6:] if url.startswith("raw://") else url[4:] - html = raw_html - pdf_data = None - mhtml_data = None + if needs_browser: + # Route through _crawl_web() for full browser pipeline + # _crawl_web() will detect file:// and raw: URLs and use set_content() + return await self._crawl_web(url, config) - # Handle media generation - all require loading HTML into browser - if config.screenshot or config.pdf or config.capture_mhtml: - screenshot_data, pdf_data, mhtml_data = await self._generate_media_from_html( - html, config - ) + # Fast path: return HTML directly without browser interaction + if url.startswith("file://"): + # Process local file + local_file_path = url[7:] # Remove 'file://' prefix + if not os.path.exists(local_file_path): + raise FileNotFoundError(f"Local file not found: {local_file_path}") + with open(local_file_path, "r", encoding="utf-8") as f: + html = f.read() + else: + # Process raw HTML content (raw:// or raw:) + html = url[6:] if url.startswith("raw://") else url[4:] return AsyncCrawlResponse( html=html, response_headers=response_headers, status_code=status_code, - screenshot=screenshot_data, - pdf_data=pdf_data, - mhtml_data=mhtml_data, + screenshot=None, + pdf_data=None, + mhtml_data=None, get_delayed_content=None, ) else: @@ -684,67 +666,83 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if not config.js_only: await self.execute_hook("before_goto", page, context=context, url=url, config=config) - try: - # Generate a unique nonce for this request - if config.experimental.get("use_csp_nonce", False): - nonce = hashlib.sha256(os.urandom(32)).hexdigest() + # Check if this is a file:// or raw: URL that needs set_content() instead of goto() + is_local_content = url.startswith("file://") or url.startswith("raw://") or url.startswith("raw:") - # Add CSP headers to the request - await page.set_extra_http_headers( - { - "Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'" - } - ) - - response = await page.goto( - url, wait_until=config.wait_until, timeout=config.page_timeout - ) - redirected_url = page.url - except Error as e: - # Allow navigation to be aborted when downloading files - # This is expected behavior for downloads in some browser engines - if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads: - self.logger.info( - message=f"Navigation aborted, likely due to file download: {url}", - tag="GOTO", - params={"url": url}, - ) - response = None + if is_local_content: + # Load local content using set_content() instead of network navigation + if url.startswith("file://"): + local_file_path = url[7:] # Remove 'file://' prefix + if not os.path.exists(local_file_path): + raise FileNotFoundError(f"Local file not found: {local_file_path}") + with open(local_file_path, "r", encoding="utf-8") as f: + html_content = f.read() else: - raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") + # raw:// or raw: + html_content = url[6:] if url.startswith("raw://") else url[4:] + + await page.set_content(html_content, wait_until=config.wait_until) + response = None + redirected_url = config.base_url or url + status_code = 200 + response_headers = {} + else: + # Standard web navigation with goto() + try: + # Generate a unique nonce for this request + if config.experimental.get("use_csp_nonce", False): + nonce = hashlib.sha256(os.urandom(32)).hexdigest() + + # Add CSP headers to the request + await page.set_extra_http_headers( + { + "Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'" + } + ) + + response = await page.goto( + url, wait_until=config.wait_until, timeout=config.page_timeout + ) + redirected_url = page.url + except Error as e: + # Allow navigation to be aborted when downloading files + # This is expected behavior for downloads in some browser engines + if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads: + self.logger.info( + message=f"Navigation aborted, likely due to file download: {url}", + tag="GOTO", + params={"url": url}, + ) + response = None + else: + raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") + + # ────────────────────────────────────────────────────────────── + # Walk the redirect chain. Playwright returns only the last + # hop, so we trace the `request.redirected_from` links until the + # first response that differs from the final one and surface its + # status-code. + # ────────────────────────────────────────────────────────────── + if response is None: + status_code = 200 + response_headers = {} + else: + first_resp = response + req = response.request + while req and req.redirected_from: + prev_req = req.redirected_from + prev_resp = await prev_req.response() + if prev_resp: # keep earliest + first_resp = prev_resp + req = prev_req + + status_code = first_resp.status + response_headers = first_resp.headers await self.execute_hook( "after_goto", page, context=context, url=url, response=response, config=config ) - # ────────────────────────────────────────────────────────────── - # Walk the redirect chain. Playwright returns only the last - # hop, so we trace the `request.redirected_from` links until the - # first response that differs from the final one and surface its - # status-code. - # ────────────────────────────────────────────────────────────── - if response is None: - status_code = 200 - response_headers = {} - else: - first_resp = response - req = response.request - while req and req.redirected_from: - prev_req = req.redirected_from - prev_resp = await prev_req.response() - if prev_resp: # keep earliest - first_resp = prev_resp - req = prev_req - - status_code = first_resp.status - response_headers = first_resp.headers - # if response is None: - # status_code = 200 - # response_headers = {} - # else: - # status_code = response.status - # response_headers = response.headers - else: status_code = 200 response_headers = {} diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 309f6b0f..fedc974f 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -895,18 +895,27 @@ class BrowserManager: combined_headers.update(self.config.headers) await context.set_extra_http_headers(combined_headers) - # Add default cookie - await context.add_cookies( - [ - { - "name": "cookiesEnabled", - "value": "true", - "url": crawlerRunConfig.url - if crawlerRunConfig and crawlerRunConfig.url - else "https://crawl4ai.com/", - } - ] - ) + # Add default cookie (skip for raw:/file:// URLs which are not valid cookie URLs) + cookie_url = None + if crawlerRunConfig and crawlerRunConfig.url: + url = crawlerRunConfig.url + # Only set cookie for http/https URLs + if url.startswith(("http://", "https://")): + cookie_url = url + elif crawlerRunConfig.base_url and crawlerRunConfig.base_url.startswith(("http://", "https://")): + # Use base_url as fallback for raw:/file:// URLs + cookie_url = crawlerRunConfig.base_url + + if cookie_url: + await context.add_cookies( + [ + { + "name": "cookiesEnabled", + "value": "true", + "url": cookie_url, + } + ] + ) # Handle navigator overrides if crawlerRunConfig: diff --git a/tests/test_raw_html_browser.py b/tests/test_raw_html_browser.py new file mode 100644 index 00000000..0f8648ad --- /dev/null +++ b/tests/test_raw_html_browser.py @@ -0,0 +1,172 @@ +""" +Tests for raw:/file:// URL browser pipeline support. + +Tests the new feature that allows js_code, wait_for, and other browser operations +to work with raw: and file:// URLs by routing them through _crawl_web() with +set_content() instead of goto(). +""" + +import pytest +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + + +@pytest.mark.asyncio +async def test_raw_html_fast_path(): + """Test that raw: without browser params returns HTML directly (fast path).""" + html = "
Original Content
" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig() # No browser params + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "Original Content" in result.html + # Fast path should not modify the HTML + assert result.html == html + + +@pytest.mark.asyncio +async def test_js_code_on_raw_html(): + """Test that js_code executes on raw: HTML and modifies the DOM.""" + html = "
Original
" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code="document.getElementById('test').innerText = 'Modified by JS'" + ) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "Modified by JS" in result.html + assert "Original" not in result.html or "Modified by JS" in result.html + + +@pytest.mark.asyncio +async def test_js_code_adds_element_to_raw_html(): + """Test that js_code can add new elements to raw: HTML.""" + html = "
" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code='document.getElementById("container").innerHTML = "Custom Content"' + ) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "injected" in result.html + assert "Custom Content" in result.html + + +@pytest.mark.asyncio +async def test_screenshot_on_raw_html(): + """Test that screenshots work on raw: HTML.""" + html = "

Screenshot Test

" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig(screenshot=True) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert result.screenshot is not None + assert len(result.screenshot) > 100 # Should have substantial screenshot data + + +@pytest.mark.asyncio +async def test_process_in_browser_flag(): + """Test that process_in_browser=True forces browser path even without other params.""" + html = "
Test
" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig(process_in_browser=True) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + # Browser path normalizes HTML, so it may be slightly different + assert "Test" in result.html + + +@pytest.mark.asyncio +async def test_raw_prefix_variations(): + """Test both raw: and raw:// prefix formats.""" + html = "Content" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code='document.body.innerHTML += "
Added
"' + ) + + # Test raw: prefix + result1 = await crawler.arun(f"raw:{html}", config=config) + assert result1.success + assert "Added" in result1.html + + # Test raw:// prefix + result2 = await crawler.arun(f"raw://{html}", config=config) + assert result2.success + assert "Added" in result2.html + + +@pytest.mark.asyncio +async def test_wait_for_on_raw_html(): + """Test that wait_for works with raw: HTML after js_code modifies DOM.""" + html = "
" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code=''' + setTimeout(() => { + document.getElementById('container').innerHTML = '
Delayed Content
'; + }, 100); + ''', + wait_for="#delayed", + wait_for_timeout=5000 + ) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "Delayed Content" in result.html + + +@pytest.mark.asyncio +async def test_multiple_js_code_scripts(): + """Test that multiple js_code scripts execute in order.""" + html = "
0
" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code=[ + "document.getElementById('counter').innerText = '1'", + "document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1", + "document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 1", + ] + ) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert ">3<" in result.html # Counter should be 3 after all scripts run + + +if __name__ == "__main__": + # Run a quick manual test + async def quick_test(): + html = "
Original
" + + async with AsyncWebCrawler(verbose=True) as crawler: + # Test 1: Fast path + print("\n=== Test 1: Fast path (no browser params) ===") + result1 = await crawler.arun(f"raw:{html}") + print(f"Success: {result1.success}") + print(f"HTML contains 'Original': {'Original' in result1.html}") + + # Test 2: js_code modifies DOM + print("\n=== Test 2: js_code modifies DOM ===") + config = CrawlerRunConfig( + js_code="document.getElementById('test').innerText = 'Modified by JS'" + ) + result2 = await crawler.arun(f"raw:{html}", config=config) + print(f"Success: {result2.success}") + print(f"HTML contains 'Modified by JS': {'Modified by JS' in result2.html}") + print(f"HTML snippet: {result2.html[:500]}...") + + asyncio.run(quick_test()) diff --git a/tests/test_raw_html_edge_cases.py b/tests/test_raw_html_edge_cases.py new file mode 100644 index 00000000..8517d19e --- /dev/null +++ b/tests/test_raw_html_edge_cases.py @@ -0,0 +1,563 @@ +""" +BRUTAL edge case tests for raw:/file:// URL browser pipeline. + +These tests try to break the system with tricky inputs, edge cases, +and compatibility checks to ensure we didn't break existing functionality. +""" + +import pytest +import asyncio +import tempfile +import os +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + + +# ============================================================================ +# EDGE CASE: Hash characters in HTML (previously broke urlparse - Issue #283) +# ============================================================================ + +@pytest.mark.asyncio +async def test_raw_html_with_hash_in_css(): + """Test that # in CSS colors doesn't break HTML parsing (regression for #283).""" + html = """ + + + + + +
Content with hash colors
+ + + """ + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig(js_code="document.body.innerHTML += '
Added
'") + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "#ff5733" in result.html or "ff5733" in result.html # Color should be preserved + assert "Added" in result.html # JS executed + assert "Content with hash colors" in result.html # Original content preserved + + +@pytest.mark.asyncio +async def test_raw_html_with_fragment_links(): + """Test HTML with # fragment links doesn't break.""" + html = """ + + Go to section 1 + Go to section 2 +
Section 1
+
Section 2
+ + """ + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig(js_code="document.getElementById('section1').innerText = 'Modified Section 1'") + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "Modified Section 1" in result.html + assert "#section2" in result.html # Fragment link preserved + + +# ============================================================================ +# EDGE CASE: Special characters and unicode +# ============================================================================ + +@pytest.mark.asyncio +async def test_raw_html_with_unicode(): + """Test raw HTML with various unicode characters.""" + html = """ + +
日本語 中文 한국어 العربية 🎉 💻 🚀
+
& < > " '
+ + """ + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig(js_code="document.getElementById('unicode').innerText += ' ✅ Modified'") + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "✅ Modified" in result.html or "Modified" in result.html + # Check unicode is preserved + assert "日本語" in result.html or "&#" in result.html # Either preserved or encoded + + +@pytest.mark.asyncio +async def test_raw_html_with_script_tags(): + """Test raw HTML with existing script tags doesn't interfere with js_code.""" + html = """ + +
0
+ + + """ + + async with AsyncWebCrawler() as crawler: + # Our js_code runs AFTER the page scripts + config = CrawlerRunConfig( + js_code="document.getElementById('counter').innerText = parseInt(document.getElementById('counter').innerText) + 5" + ) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + # The embedded script sets it to 10, then our js_code adds 5 + assert ">15<" in result.html or "15" in result.html + + +# ============================================================================ +# EDGE CASE: Empty and malformed HTML +# ============================================================================ + +@pytest.mark.asyncio +async def test_raw_html_empty(): + """Test empty raw HTML.""" + html = "" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig(js_code="document.body.innerHTML = '
Added to empty
'") + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "Added to empty" in result.html + + +@pytest.mark.asyncio +async def test_raw_html_minimal(): + """Test minimal HTML (just text, no tags).""" + html = "Just plain text, no HTML tags" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig(js_code="document.body.innerHTML += '
Injected
'") + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + # Browser should wrap it in proper HTML + assert "Injected" in result.html + + +@pytest.mark.asyncio +async def test_raw_html_malformed(): + """Test malformed HTML with unclosed tags.""" + html = "
Unclosed tags
More content" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig(js_code="document.body.innerHTML += '
Valid Added
'") + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "Valid Added" in result.html + # Browser should have fixed the malformed HTML + + +# ============================================================================ +# EDGE CASE: Very large HTML +# ============================================================================ + +@pytest.mark.asyncio +async def test_raw_html_large(): + """Test large raw HTML (100KB+).""" + # Generate 100KB of HTML + items = "".join([f'
Item {i} content here with some text
\n' for i in range(2000)]) + html = f"{items}" + + assert len(html) > 100000 # Verify it's actually large + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code="document.getElementById('item-999').innerText = 'MODIFIED ITEM 999'" + ) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "MODIFIED ITEM 999" in result.html + assert "item-1999" in result.html # Last item should still exist + + +# ============================================================================ +# EDGE CASE: JavaScript errors and timeouts +# ============================================================================ + +@pytest.mark.asyncio +async def test_raw_html_js_error_doesnt_crash(): + """Test that JavaScript errors in js_code don't crash the crawl.""" + html = "
Original
" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code=[ + "nonExistentFunction();", # This will throw an error + "document.getElementById('test').innerText = 'Still works'" # This should still run + ] + ) + result = await crawler.arun(f"raw:{html}", config=config) + + # Crawl should succeed even with JS errors + assert result.success + + +@pytest.mark.asyncio +async def test_raw_html_wait_for_timeout(): + """Test wait_for with element that never appears times out gracefully.""" + html = "
Original
" + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + wait_for="#never-exists", + wait_for_timeout=1000 # 1 second timeout + ) + result = await crawler.arun(f"raw:{html}", config=config) + + # Should timeout but still return the HTML we have + # The behavior might be success=False or success=True with partial content + # Either way, it shouldn't hang or crash + assert result is not None + + +# ============================================================================ +# COMPATIBILITY: Normal HTTP URLs still work +# ============================================================================ + +@pytest.mark.asyncio +async def test_http_urls_still_work(): + """Ensure we didn't break normal HTTP URL crawling.""" + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + + assert result.success + assert "Example Domain" in result.html + + +@pytest.mark.asyncio +async def test_http_with_js_code_still_works(): + """Ensure HTTP URLs with js_code still work.""" + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code="document.body.innerHTML += '
Injected via JS
'" + ) + result = await crawler.arun("https://example.com", config=config) + + assert result.success + assert "Injected via JS" in result.html + + +# ============================================================================ +# COMPATIBILITY: File URLs +# ============================================================================ + +@pytest.mark.asyncio +async def test_file_url_with_js_code(): + """Test file:// URLs with js_code execution.""" + # Create a temp file + with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f: + f.write("
File Content
") + temp_path = f.name + + try: + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code="document.getElementById('file-content').innerText = 'Modified File Content'" + ) + result = await crawler.arun(f"file://{temp_path}", config=config) + + assert result.success + assert "Modified File Content" in result.html + finally: + os.unlink(temp_path) + + +@pytest.mark.asyncio +async def test_file_url_fast_path(): + """Test file:// fast path (no browser params).""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f: + f.write("Fast path file content") + temp_path = f.name + + try: + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(f"file://{temp_path}") + + assert result.success + assert "Fast path file content" in result.html + finally: + os.unlink(temp_path) + + +# ============================================================================ +# COMPATIBILITY: Extraction strategies with raw HTML +# ============================================================================ + +@pytest.mark.asyncio +async def test_raw_html_with_css_extraction(): + """Test CSS extraction on raw HTML after js_code modifies it.""" + from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + + html = """ + +
+
Original Product
+
+ + """ + + schema = { + "name": "Products", + "baseSelector": ".product", + "fields": [ + {"name": "name", "selector": ".name", "type": "text"} + ] + } + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code=""" + document.querySelector('.products').innerHTML += + '
JS Added Product
'; + """, + extraction_strategy=JsonCssExtractionStrategy(schema) + ) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + # Check that extraction found both products + import json + extracted = json.loads(result.extracted_content) + names = [p.get('name', '') for p in extracted] + assert any("JS Added Product" in name for name in names) + + +# ============================================================================ +# EDGE CASE: Concurrent raw: requests +# ============================================================================ + +@pytest.mark.asyncio +async def test_concurrent_raw_requests(): + """Test multiple concurrent raw: requests don't interfere.""" + htmls = [ + f"
Request {i}
" + for i in range(5) + ] + + async with AsyncWebCrawler() as crawler: + configs = [ + CrawlerRunConfig( + js_code=f"document.getElementById('test').innerText += ' Modified {i}'" + ) + for i in range(5) + ] + + # Run concurrently + tasks = [ + crawler.arun(f"raw:{html}", config=config) + for html, config in zip(htmls, configs) + ] + results = await asyncio.gather(*tasks) + + for i, result in enumerate(results): + assert result.success + assert f"Request {i}" in result.html + assert f"Modified {i}" in result.html + + +# ============================================================================ +# EDGE CASE: raw: with base_url for link resolution +# ============================================================================ + +@pytest.mark.asyncio +async def test_raw_html_with_base_url(): + """Test that base_url is used for link resolution in markdown.""" + html = """ + + Page 1 + Page 2 + Logo + + """ + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + base_url="https://example.com", + process_in_browser=True # Force browser to test base_url handling + ) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + # Check markdown has absolute URLs + if result.markdown: + # Links should be absolute + md = result.markdown.raw_markdown if hasattr(result.markdown, 'raw_markdown') else str(result.markdown) + assert "example.com" in md or "/page1" in md + + +# ============================================================================ +# EDGE CASE: raw: with screenshot of complex page +# ============================================================================ + +@pytest.mark.asyncio +async def test_raw_html_screenshot_complex_page(): + """Test screenshot of complex raw HTML with CSS and JS modifications.""" + html = """ + + + + + +
+

Original Title

+

This is a test card with styling.

+
+ + + """ + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code="document.getElementById('title').innerText = 'Modified Title'", + screenshot=True + ) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert result.screenshot is not None + assert len(result.screenshot) > 1000 # Should be substantial + assert "Modified Title" in result.html + + +# ============================================================================ +# EDGE CASE: JavaScript that tries to navigate away +# ============================================================================ + +@pytest.mark.asyncio +async def test_raw_html_js_navigation_blocked(): + """Test that JS trying to navigate doesn't break the crawl.""" + html = """ + +
Original Content
+ + + """ + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + # Try to navigate via js_code + js_code=[ + "document.getElementById('content').innerText = 'Before navigation attempt'", + # Actual navigation attempt commented - would cause issues + # "window.location.href = 'https://example.com'", + ] + ) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "Before navigation attempt" in result.html + + +# ============================================================================ +# EDGE CASE: Raw HTML with iframes +# ============================================================================ + +@pytest.mark.asyncio +async def test_raw_html_with_iframes(): + """Test raw HTML containing iframes.""" + html = """ + +
Main content
+ + + """ + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code="document.getElementById('main').innerText = 'Modified main'", + process_iframes=True + ) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "Modified main" in result.html + + +# ============================================================================ +# TRICKY: Protocol inside raw content +# ============================================================================ + +@pytest.mark.asyncio +async def test_raw_html_with_urls_inside(): + """Test raw: with http:// URLs inside the content.""" + html = """ + + Example + Google + Cat +
Test content with URL: https://test.com
+ + """ + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code="document.getElementById('test').innerText += ' - Modified'" + ) + result = await crawler.arun(f"raw:{html}", config=config) + + assert result.success + assert "Modified" in result.html + assert "http://example.com" in result.html or "example.com" in result.html + + +# ============================================================================ +# TRICKY: Double raw: prefix +# ============================================================================ + +@pytest.mark.asyncio +async def test_double_raw_prefix(): + """Test what happens with double raw: prefix (edge case).""" + html = "Content" + + async with AsyncWebCrawler() as crawler: + # raw:raw:... - the second raw: becomes part of content + result = await crawler.arun(f"raw:raw:{html}") + + # Should either handle gracefully or return "raw:..." as content + assert result is not None + + +if __name__ == "__main__": + import sys + + async def run_tests(): + # Run a few key tests manually + tests = [ + ("Hash in CSS", test_raw_html_with_hash_in_css), + ("Unicode", test_raw_html_with_unicode), + ("Large HTML", test_raw_html_large), + ("HTTP still works", test_http_urls_still_work), + ("Concurrent requests", test_concurrent_raw_requests), + ("Complex screenshot", test_raw_html_screenshot_complex_page), + ] + + for name, test_fn in tests: + print(f"\n=== Running: {name} ===") + try: + await test_fn() + print(f"✅ {name} PASSED") + except Exception as e: + print(f"❌ {name} FAILED: {e}") + import traceback + traceback.print_exc() + + asyncio.run(run_tests())