From ba2ed53ff1874538420c148cdee2f1d8d607d715 Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Fri, 11 Jul 2025 22:27:18 +0800
Subject: [PATCH] test(releases): Add test cases for release 0.7.0

---
 tests/releases/test_release_0.7.0.py | 317 +++++++++++++++++++++++++++
 1 file changed, 317 insertions(+)
 create mode 100644 tests/releases/test_release_0.7.0.py

diff --git a/tests/releases/test_release_0.7.0.py b/tests/releases/test_release_0.7.0.py
new file mode 100644
index 00000000..a0885a70
--- /dev/null
+++ b/tests/releases/test_release_0.7.0.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+
+import asyncio
+import pytest
+import os
+import json
+import tempfile
+from pathlib import Path
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy, LLMConfig
+from crawl4ai.content_filter_strategy import BM25ContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.async_url_seeder import AsyncUrlSeeder
+from crawl4ai.utils import RobotsParser
+
+
+class TestCrawl4AIv070:
+    """Test suite for Crawl4AI v0.7.0 changes"""
+
+    @pytest.mark.asyncio
+    async def test_raw_url_parsing(self):
+        """Test raw:// URL parsing logic fix"""
+        html_content = "<html><body><h1>Test Content</h1><p>This is a test paragraph.</p></body></html>"
" + + async with AsyncWebCrawler() as crawler: + # Test raw:// prefix + result1 = await crawler.arun(f"raw://{html_content}") + assert result1.success + assert "Test Content" in result1.markdown + + # Test raw: prefix + result2 = await crawler.arun(f"raw:{html_content}") + assert result2.success + assert "Test Content" in result2.markdown + + @pytest.mark.asyncio + async def test_max_pages_limit_batch_processing(self): + """Test max_pages limit is respected during batch processing""" + urls = [ + "https://httpbin.org/html", + "https://httpbin.org/json", + "https://httpbin.org/xml" + ] + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + max_pages=2 + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many(urls, config=config) + # Should only process 2 pages due to max_pages limit + successful_results = [r for r in results if r.success] + assert len(successful_results) <= 2 + + @pytest.mark.asyncio + async def test_navigation_abort_handling(self): + """Test handling of navigation aborts during file downloads""" + async with AsyncWebCrawler() as crawler: + # Test with a URL that might cause navigation issues + result = await crawler.arun( + "https://httpbin.org/status/404", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + # Should not crash even with navigation issues + assert result is not None + + @pytest.mark.asyncio + async def test_screenshot_capture_fix(self): + """Test screenshot capture improvements""" + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://httpbin.org/html", config=config) + assert result.success + assert result.screenshot is not None + assert len(result.screenshot) > 0 + + @pytest.mark.asyncio + async def test_redirect_status_codes(self): + """Test that real redirect status codes are surfaced""" + async with AsyncWebCrawler() as crawler: + # Test with a redirect URL + result = await crawler.arun( + "https://httpbin.org/redirect/1", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + assert result.success + # Should have redirect information + assert result.status_code in [200, 301, 302, 303, 307, 308] + + @pytest.mark.asyncio + async def test_local_file_processing(self): + """Test local file processing with captured_console initialization""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f: + f.write("

Local File Test

") + temp_file = f.name + + try: + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(f"file://{temp_file}") + assert result.success + assert "Local File Test" in result.markdown + finally: + os.unlink(temp_file) + + @pytest.mark.asyncio + async def test_robots_txt_wildcard_support(self): + """Test robots.txt wildcard rules support""" + parser = RobotsParser() + + # Test wildcard patterns + robots_content = "User-agent: *\nDisallow: /admin/*\nDisallow: *.pdf" + + # This should work without throwing exceptions + assert parser is not None + + @pytest.mark.asyncio + async def test_exclude_external_images(self): + """Test exclude_external_images flag""" + html_with_images = ''' + + Local + External + + ''' + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + exclude_external_images=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(f"raw://{html_with_images}", config=config) + assert result.success + # External images should be excluded + assert "external.com" not in result.cleaned_html + + @pytest.mark.asyncio + async def test_llm_extraction_strategy_fix(self): + """Test LLM extraction strategy choices error fix""" + if not os.getenv("OPENAI_API_KEY"): + pytest.skip("OpenAI API key not available") + + llm_config = LLMConfig( + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY") + ) + + strategy = LLMExtractionStrategy( + llm_config=llm_config, + instruction="Extract the main heading", + extraction_type="block" + ) + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=strategy + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://httpbin.org/html", config=config) + assert result.success + # Should not throw 'str' object has no attribute 'choices' error + assert result.extracted_content is not None + + @pytest.mark.asyncio + async def test_wait_for_timeout(self): + """Test separate timeout for wait_for condition""" + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + wait_for="css:non-existent-element", + wait_for_timeout=1000 # 1 second timeout + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://httpbin.org/html", config=config) + # Should timeout gracefully and still return result + assert result is not None + + @pytest.mark.asyncio + async def test_bm25_content_filter_language_parameter(self): + """Test BM25 filter with language parameter for stemming""" + content_filter = BM25ContentFilter( + user_query="test content", + language="english", + use_stemming=True + ) + + markdown_generator = DefaultMarkdownGenerator( + content_filter=content_filter + ) + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + markdown_generator=markdown_generator + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://httpbin.org/html", config=config) + assert result.success + assert result.markdown is not None + + @pytest.mark.asyncio + async def test_url_normalization(self): + """Test URL normalization for invalid schemes and trailing slashes""" + async with AsyncWebCrawler() as crawler: + # Test with trailing slash + result = await crawler.arun( + "https://httpbin.org/html/", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + assert result.success + + @pytest.mark.asyncio + async def test_max_scroll_steps(self): + """Test max_scroll_steps parameter for full page scanning""" + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + scan_full_page=True, + 
+            max_scroll_steps=3
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            assert result.success
+
+    @pytest.mark.asyncio
+    async def test_async_url_seeder(self):
+        """Test AsyncUrlSeeder functionality"""
+        seeder = AsyncUrlSeeder(
+            base_url="https://httpbin.org",
+            max_depth=1,
+            max_urls=5
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            urls = await seeder.seed(crawler)
+            assert isinstance(urls, list)
+            assert len(urls) <= 5
+
+    @pytest.mark.asyncio
+    async def test_pdf_processing_timeout(self):
+        """Test PDF processing with timeout"""
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            pdf=True,
+            pdf_timeout=10000  # 10 seconds
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            assert result.success
+            # PDF may be None for HTML pages; the point is that generation
+            # completes within the timeout instead of hanging
+            assert result.pdf is None or len(result.pdf) > 0
+
+    @pytest.mark.asyncio
+    async def test_browser_session_management(self):
+        """Test improved browser session management"""
+        browser_config = BrowserConfig(
+            headless=True,
+            use_persistent_context=True
+        )
+
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            result = await crawler.arun(
+                "https://httpbin.org/html",
+                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            )
+            assert result.success
+
+    @pytest.mark.asyncio
+    async def test_memory_management(self):
+        """Test memory management features"""
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            memory_threshold_percent=80.0,
+            check_interval=1.0,
+            memory_wait_timeout=600  # 10 minutes default
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            assert result.success
+
+    @pytest.mark.asyncio
+    async def test_virtual_scroll_support(self):
+        """Test virtual scroll support for modern web scraping"""
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            scan_full_page=True,
+            virtual_scroll=True
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            assert result.success
+
+    @pytest.mark.asyncio
+    async def test_adaptive_crawling(self):
+        """Test adaptive crawling feature"""
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            adaptive_crawling=True
+        )
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://httpbin.org/html", config=config)
+            assert result.success
+
+
+if __name__ == "__main__":
+    # Run the tests
+    pytest.main([__file__, "-v"])
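
Note: a minimal sketch of running just this suite locally. It assumes pytest
and pytest-asyncio are installed (the tests are marked @pytest.mark.asyncio),
that httpbin.org is reachable, and that OPENAI_API_KEY is set only if the LLM
extraction test should not be skipped:

    # hypothetical runner, not part of the patch above
    import sys
    import pytest

    # pytest.main accepts CLI-style arguments and returns an exit code
    sys.exit(pytest.main(["tests/releases/test_release_0.7.0.py", "-v"]))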