import asyncio
import time

import pytest

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig


@pytest.mark.asyncio
async def test_wait_for_timeout_separate_from_page_timeout():
    """Test that wait_for has its own timeout separate from page_timeout."""
    browser_config = BrowserConfig(headless=True)

    # Short wait_for_timeout (2s) paired with a much longer page_timeout (10s):
    # the crawl should stop waiting on the selector after ~2s, proving the two
    # timeouts are independent.
    config = CrawlerRunConfig(
        wait_for="css:.nonexistent-element",
        wait_for_timeout=2000,  # 2 seconds
        page_timeout=10000,  # 10 seconds
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        start_time = time.time()
        result = await crawler.arun("https://example.com", config=config)
        elapsed = time.time() - start_time

        # Should timeout after ~2 seconds (wait_for_timeout), not 10 seconds.
        # The 5s bound leaves headroom for browser startup overhead.
        assert elapsed < 5, f"Expected timeout around 2s, but took {elapsed:.2f}s"
        assert result.success, "Crawl should still succeed even if wait_for times out"


@pytest.mark.asyncio
async def test_wait_for_timeout_with_existing_element():
    """Test that wait_for_timeout works correctly when element exists."""
    browser_config = BrowserConfig(headless=True)

    config = CrawlerRunConfig(
        wait_for="css:body",  # This should exist quickly
        wait_for_timeout=5000,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        start_time = time.time()
        result = await crawler.arun("https://example.com", config=config)
        elapsed = time.time() - start_time

        # Should complete quickly since body element exists.
        assert elapsed < 3, f"Expected quick completion, but took {elapsed:.2f}s"
        assert result.success
        # NOTE(review): the final assertion was truncated in this copy of the
        # file (it ended at `assert "`). Restored as a content check on the
        # fetched example.com page — TODO confirm against upstream history.
        assert "Example Domain" in result.html
@pytest.mark.asyncio
async def test_ga_scripts_preserved():
    # NOTE(review): the opening of this test (decorator, signature, configs,
    # and the head of the html_content literal) was garbled in this copy —
    # only the crawl/assert tail and the title text survived. Reconstructed
    # to match the sibling tests in this module; TODO confirm the original
    # name and markup against upstream history.
    """Test that Google Analytics gtag.js markup survives a crawl unchanged."""
    browser_config = BrowserConfig(headless=True)
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    # Minimal page carrying the standard gtag.js bootstrap snippet.
    html_content = """<!DOCTYPE html>
<html>
<head>
<title>Testing Google Analytics integration</title>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-XXXXXXX"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());
  gtag('config', 'G-XXXXXXX');
</script>
</head>
<body><h1>Testing Google Analytics integration</h1></body>
</html>
"""

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(f"raw://{html_content}", config=config)

        assert result.success

        # Check that GA scripts are preserved in the HTML.
        assert "googletagmanager.com/gtag/js" in result.html
        assert "dataLayer" in result.html
        assert "gtag('config'" in result.html


@pytest.mark.asyncio
async def test_mkdocs_no_duplicate_gtag():
    """Test that there are no duplicate gtag.js entries in documentation."""
    browser_config = BrowserConfig(headless=True)
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    # Simulate MkDocs-like HTML structure.
    # NOTE(review): the markup of this literal was stripped to bare text in
    # this copy ("Welcome to the documentation" was all that remained); the
    # tags below are reconstructed — TODO confirm against upstream history.
    html_content = """<!DOCTYPE html>
<html>
<head><title>Documentation</title></head>
<body><h1>Welcome to the documentation</h1></body>
</html>
"""

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(f"raw://{html_content}", config=config)

        assert result.success

        # Count occurrences of gtag.js to ensure no duplicates.
        gtag_count = result.html.count("googletagmanager.com/gtag/js")
        assert gtag_count <= 1, f"Found {gtag_count} gtag.js scripts, expected at most 1"

        # Ensure the analytics functionality is still there.
        if gtag_count == 1:
            assert "dataLayer" in result.html
            assert "gtag('config'" in result.html


if __name__ == "__main__":
    pytest.main([__file__, "-v"])