Unclosed tags

More content" async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig(js_code="document.body.innerHTML += '

Valid Added

'") result = await crawler.arun(f"raw:{html}", config=config) assert result.success assert "Valid Added" in result.html # Browser should have fixed the malformed HTML # ============================================================================ # EDGE CASE: Very large HTML # ============================================================================ @pytest.mark.asyncio async def test_raw_html_large(): """Test large raw HTML (100KB+).""" # Generate 100KB of HTML items = "".join([f'

Item {i} content here with some text

\n' for i in range(2000)]) html = f"{items}" assert len(html) > 100000 # Verify it's actually large async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( js_code="document.getElementById('item-999').innerText = 'MODIFIED ITEM 999'" ) result = await crawler.arun(f"raw:{html}", config=config) assert result.success assert "MODIFIED ITEM 999" in result.html assert "item-1999" in result.html # Last item should still exist # ============================================================================ # EDGE CASE: JavaScript errors and timeouts # ============================================================================ @pytest.mark.asyncio async def test_raw_html_js_error_doesnt_crash(): """Test that JavaScript errors in js_code don't crash the crawl.""" html = "

Original

" async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( js_code=[ "nonExistentFunction();", # This will throw an error "document.getElementById('test').innerText = 'Still works'" # This should still run ] ) result = await crawler.arun(f"raw:{html}", config=config) # Crawl should succeed even with JS errors assert result.success @pytest.mark.asyncio async def test_raw_html_wait_for_timeout(): """Test wait_for with element that never appears times out gracefully.""" html = "

Original

" async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( wait_for="#never-exists", wait_for_timeout=1000 # 1 second timeout ) result = await crawler.arun(f"raw:{html}", config=config) # Should timeout but still return the HTML we have # The behavior might be success=False or success=True with partial content # Either way, it shouldn't hang or crash assert result is not None # ============================================================================ # COMPATIBILITY: Normal HTTP URLs still work # ============================================================================ @pytest.mark.asyncio async def test_http_urls_still_work(): """Ensure we didn't break normal HTTP URL crawling.""" async with AsyncWebCrawler() as crawler: result = await crawler.arun("https://example.com") assert result.success assert "Example Domain" in result.html @pytest.mark.asyncio async def test_http_with_js_code_still_works(): """Ensure HTTP URLs with js_code still work.""" async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( js_code="document.body.innerHTML += '

Injected via JS

'" ) result = await crawler.arun("https://example.com", config=config) assert result.success assert "Injected via JS" in result.html # ============================================================================ # COMPATIBILITY: File URLs # ============================================================================ @pytest.mark.asyncio async def test_file_url_with_js_code(): """Test file:// URLs with js_code execution.""" # Create a temp file with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f: f.write("

File Content

") temp_path = f.name try: async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( js_code="document.getElementById('file-content').innerText = 'Modified File Content'" ) result = await crawler.arun(f"file://{temp_path}", config=config) assert result.success assert "Modified File Content" in result.html finally: os.unlink(temp_path) @pytest.mark.asyncio async def test_file_url_fast_path(): """Test file:// fast path (no browser params).""" with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f: f.write("Fast path file content") temp_path = f.name try: async with AsyncWebCrawler() as crawler: result = await crawler.arun(f"file://{temp_path}") assert result.success assert "Fast path file content" in result.html finally: os.unlink(temp_path) # ============================================================================ # COMPATIBILITY: Extraction strategies with raw HTML # ============================================================================ @pytest.mark.asyncio async def test_raw_html_with_css_extraction(): """Test CSS extraction on raw HTML after js_code modifies it.""" from crawl4ai.extraction_strategy import JsonCssExtractionStrategy html = """

Original Product

""" schema = { "name": "Products", "baseSelector": ".product", "fields": [ {"name": "name", "selector": ".name", "type": "text"} ] } async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( js_code=""" document.querySelector('.products').innerHTML += '

JS Added Product

'; """, extraction_strategy=JsonCssExtractionStrategy(schema) ) result = await crawler.arun(f"raw:{html}", config=config) assert result.success # Check that extraction found both products import json extracted = json.loads(result.extracted_content) names = [p.get('name', '') for p in extracted] assert any("JS Added Product" in name for name in names) # ============================================================================ # EDGE CASE: Concurrent raw: requests # ============================================================================ @pytest.mark.asyncio async def test_concurrent_raw_requests(): """Test multiple concurrent raw: requests don't interfere.""" htmls = [ f"

Request {i}

" for i in range(5) ] async with AsyncWebCrawler() as crawler: configs = [ CrawlerRunConfig( js_code=f"document.getElementById('test').innerText += ' Modified {i}'" ) for i in range(5) ] # Run concurrently tasks = [ crawler.arun(f"raw:{html}", config=config) for html, config in zip(htmls, configs) ] results = await asyncio.gather(*tasks) for i, result in enumerate(results): assert result.success assert f"Request {i}" in result.html assert f"Modified {i}" in result.html # ============================================================================ # EDGE CASE: raw: with base_url for link resolution # ============================================================================ @pytest.mark.asyncio async def test_raw_html_with_base_url(): """Test that base_url is used for link resolution in markdown.""" html = """ Page 1 Page 2

""" async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( base_url="https://example.com", process_in_browser=True # Force browser to test base_url handling ) result = await crawler.arun(f"raw:{html}", config=config) assert result.success # Check markdown has absolute URLs if result.markdown: # Links should be absolute md = result.markdown.raw_markdown if hasattr(result.markdown, 'raw_markdown') else str(result.markdown) assert "example.com" in md or "/page1" in md # ============================================================================ # EDGE CASE: raw: with screenshot of complex page # ============================================================================ @pytest.mark.asyncio async def test_raw_html_screenshot_complex_page(): """Test screenshot of complex raw HTML with CSS and JS modifications.""" html = """

Original Title

This is a test card with styling.

""" async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( js_code="document.getElementById('title').innerText = 'Modified Title'", screenshot=True ) result = await crawler.arun(f"raw:{html}", config=config) assert result.success assert result.screenshot is not None assert len(result.screenshot) > 1000 # Should be substantial assert "Modified Title" in result.html # ============================================================================ # EDGE CASE: JavaScript that tries to navigate away # ============================================================================ @pytest.mark.asyncio async def test_raw_html_js_navigation_blocked(): """Test that JS trying to navigate doesn't break the crawl.""" html = """

Original Content

""" async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( # Try to navigate via js_code js_code=[ "document.getElementById('content').innerText = 'Before navigation attempt'", # Actual navigation attempt commented - would cause issues # "window.location.href = 'https://example.com'", ] ) result = await crawler.arun(f"raw:{html}", config=config) assert result.success assert "Before navigation attempt" in result.html # ============================================================================ # EDGE CASE: Raw HTML with iframes # ============================================================================ @pytest.mark.asyncio async def test_raw_html_with_iframes(): """Test raw HTML containing iframes.""" html = """

Main content

""" async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( js_code="document.getElementById('main').innerText = 'Modified main'", process_iframes=True ) result = await crawler.arun(f"raw:{html}", config=config) assert result.success assert "Modified main" in result.html # ============================================================================ # TRICKY: Protocol inside raw content # ============================================================================ @pytest.mark.asyncio async def test_raw_html_with_urls_inside(): """Test raw: with http:// URLs inside the content.""" html = """ Example Google Cat

Test content with URL: https://test.com

""" async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( js_code="document.getElementById('test').innerText += ' - Modified'" ) result = await crawler.arun(f"raw:{html}", config=config) assert result.success assert "Modified" in result.html assert "http://example.com" in result.html or "example.com" in result.html # ============================================================================ # TRICKY: Double raw: prefix # ============================================================================ @pytest.mark.asyncio async def test_double_raw_prefix(): """Test what happens with double raw: prefix (edge case).""" html = "Content" async with AsyncWebCrawler() as crawler: # raw:raw:... - the second raw: becomes part of content result = await crawler.arun(f"raw:raw:{html}") # Should either handle gracefully or return "raw:..." as content assert result is not None if __name__ == "__main__": import sys async def run_tests(): # Run a few key tests manually tests = [ ("Hash in CSS", test_raw_html_with_hash_in_css), ("Unicode", test_raw_html_with_unicode), ("Large HTML", test_raw_html_large), ("HTTP still works", test_http_urls_still_work), ("Concurrent requests", test_concurrent_raw_requests), ("Complex screenshot", test_raw_html_screenshot_complex_page), ] for name, test_fn in tests: print(f"\n=== Running: {name} ===") try: await test_fn() print(f"✅ {name} PASSED") except Exception as e: print(f"❌ {name} FAILED: {e}") import traceback traceback.print_exc() asyncio.run(run_tests())