diff --git a/tests/releases/test_release_0.7.0.py b/tests/releases/test_release_0.7.0.py
new file mode 100644
index 00000000..a0885a70
--- /dev/null
+++ b/tests/releases/test_release_0.7.0.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+
+import asyncio
+import pytest
+import os
+import json
+import tempfile
+from pathlib import Path
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy, LLMConfig
+from crawl4ai.content_filter_strategy import BM25ContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.async_url_seeder import AsyncUrlSeeder
+from crawl4ai.utils import RobotsParser
+
+
+class TestCrawl4AIv070:
+    """Test suite for Crawl4AI v0.7.0 changes"""
+
+    @pytest.mark.asyncio
+    async def test_raw_url_parsing(self):
+        """Test raw:// URL parsing logic fix"""
+        html_content = "<html><body><h1>Test Content</h1><p>This is a test paragraph.</p></body></html>"
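+        # Both the "raw://" and "raw:" prefixes hand the HTML string straight to the parser, with no network fetch involved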
" + + async with AsyncWebCrawler() as crawler: + # Test raw:// prefix + result1 = await crawler.arun(f"raw://{html_content}") + assert result1.success + assert "Test Content" in result1.markdown + + # Test raw: prefix + result2 = await crawler.arun(f"raw:{html_content}") + assert result2.success + assert "Test Content" in result2.markdown + + @pytest.mark.asyncio + async def test_max_pages_limit_batch_processing(self): + """Test max_pages limit is respected during batch processing""" + urls = [ + "https://httpbin.org/html", + "https://httpbin.org/json", + "https://httpbin.org/xml" + ] + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + max_pages=2 + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many(urls, config=config) + # Should only process 2 pages due to max_pages limit + successful_results = [r for r in results if r.success] + assert len(successful_results) <= 2 + + @pytest.mark.asyncio + async def test_navigation_abort_handling(self): + """Test handling of navigation aborts during file downloads""" + async with AsyncWebCrawler() as crawler: + # Test with a URL that might cause navigation issues + result = await crawler.arun( + "https://httpbin.org/status/404", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + # Should not crash even with navigation issues + assert result is not None + + @pytest.mark.asyncio + async def test_screenshot_capture_fix(self): + """Test screenshot capture improvements""" + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://httpbin.org/html", config=config) + assert result.success + assert result.screenshot is not None + assert len(result.screenshot) > 0 + + @pytest.mark.asyncio + async def test_redirect_status_codes(self): + """Test that real redirect status codes are surfaced""" + async with AsyncWebCrawler() as crawler: + # Test with a redirect URL + result = await crawler.arun( + "https://httpbin.org/redirect/1", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + assert result.success + # Should have redirect information + assert result.status_code in [200, 301, 302, 303, 307, 308] + + @pytest.mark.asyncio + async def test_local_file_processing(self): + """Test local file processing with captured_console initialization""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f: + f.write("
+            f.write("<html><body><h1>Local File Test</h1></body></html>")
+            temp_path = f.name
+
+        try:
+            async with AsyncWebCrawler() as crawler:
+                result = await crawler.arun(
+                    f"file://{temp_path}",
+                    config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+                )
+                # captured_console should be initialized for local files as well
+                assert result is not None
+        finally:
+            os.unlink(temp_path)
+
+    @pytest.mark.asyncio
+    async def test_exclude_external_images(self):
+        """Test that external images can be excluded from the output"""
+        html_with_images = '''
+        <html>
+        <body>
+            <h1>Image Test</h1>
+            <img src="https://external.com/image.jpg" alt="external image">
+        </body>
+        </html>
+        '''
+
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            exclude_external_images=True
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun(f"raw://{html_with_images}", config=config)
+            assert result.success
+            # External images should be excluded
+            assert "external.com" not in result.cleaned_html
+
+    @pytest.mark.asyncio
+    async def test_llm_extraction_strategy_fix(self):
+        """Test LLM extraction strategy choices error fix"""
+        if not os.getenv("OPENAI_API_KEY"):
+            pytest.skip("OpenAI API key not available")
+
+        llm_config = LLMConfig(
+            provider="openai/gpt-4o-mini",
+            api_token=os.getenv("OPENAI_API_KEY")
+        )
+
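+        # extraction_type="block" asks for free-form content blocks rather than schema-driven JSON output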
+        strategy = LLMExtractionStrategy(
+            llm_config=llm_config,
+            instruction="Extract the main heading",
+            extraction_type="block"
+        )
+
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            extraction_strategy=strategy
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            assert result.success
+            # Should not throw 'str' object has no attribute 'choices' error
+            assert result.extracted_content is not None
+
+    @pytest.mark.asyncio
+    async def test_wait_for_timeout(self):
+        """Test separate timeout for wait_for condition"""
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            wait_for="css:non-existent-element",
+            wait_for_timeout=1000  # 1 second timeout
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            # Should timeout gracefully and still return result
+            assert result is not None
+
+    @pytest.mark.asyncio
+    async def test_bm25_content_filter_language_parameter(self):
+        """Test BM25 filter with language parameter for stemming"""
+        content_filter = BM25ContentFilter(
+            user_query="test content",
+            language="english",
+            use_stemming=True
+        )
+
+        markdown_generator = DefaultMarkdownGenerator(
+            content_filter=content_filter
+        )
+
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=markdown_generator
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            assert result.success
+            assert result.markdown is not None
+
+    @pytest.mark.asyncio
+    async def test_url_normalization(self):
+        """Test URL normalization for invalid schemes and trailing slashes"""
+        async with AsyncWebCrawler() as crawler:
+            # Test with trailing slash
+            result = await crawler.arun(
+                "https://httpbin.org/html/",
+                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            )
+            assert result.success
+
+    @pytest.mark.asyncio
+    async def test_max_scroll_steps(self):
+        """Test max_scroll_steps parameter for full page scanning"""
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            scan_full_page=True,
+            max_scroll_steps=3
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            assert result.success
+
+    @pytest.mark.asyncio
+    async def test_async_url_seeder(self):
+        """Test AsyncUrlSeeder functionality"""
+        seeder = AsyncUrlSeeder(
+            base_url="https://httpbin.org",
+            max_depth=1,
+            max_urls=5
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            urls = await seeder.seed(crawler)
+            assert isinstance(urls, list)
+            assert len(urls) <= 5
+
+    @pytest.mark.asyncio
+    async def test_pdf_processing_timeout(self):
+        """Test PDF processing with timeout"""
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            pdf=True,
+            pdf_timeout=10000  # 10 seconds
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            assert result.success
+            # PDF output may legitimately be skipped for plain HTML pages; the key check is that processing finishes without hanging
+            assert result.pdf is None or len(result.pdf) > 0
+
+    @pytest.mark.asyncio
+    async def test_browser_session_management(self):
+        """Test improved browser session management"""
+        browser_config = BrowserConfig(
+            headless=True,
+            use_persistent_context=True
+        )
+
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            result = await crawler.arun(
+                "https://httpbin.org/html",
+                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            )
+            assert result.success
+
+    @pytest.mark.asyncio
+    async def test_memory_management(self):
+        """Test memory management features"""
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            memory_threshold_percent=80.0,
+            check_interval=1.0,
+            memory_wait_timeout=600  # 10 minutes default
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            assert result.success
+
+    @pytest.mark.asyncio
+    async def test_virtual_scroll_support(self):
+        """Test virtual scroll support for modern web scraping"""
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            scan_full_page=True,
+            virtual_scroll=True
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            assert result.success
+
+    @pytest.mark.asyncio
+    async def test_adaptive_crawling(self):
+        """Test adaptive crawling feature"""
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            adaptive_crawling=True
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            assert result.success
+
+
+if __name__ == "__main__":
+    # Run the tests
+    pytest.main([__file__, "-v"])