Files
crawl4ai/tests/test_prefetch_regression.py
unclecode fde4e9f0c6 Add prefetch mode for two-phase deep crawling
- Add `prefetch` parameter to CrawlerRunConfig
- Add `quick_extract_links()` function for fast link extraction
- Add short-circuit in aprocess_html() for prefetch mode
- Add 42 tests (unit, integration, regression)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-25 01:55:08 +00:00

233 lines
8.5 KiB
Python

"""Regression tests to ensure prefetch mode doesn't break existing functionality."""
import pytest
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
# Crawl target shared by the tests in this module that fetch a real page.
TEST_URL = "https://docs.crawl4ai.com"
class TestNoRegressions:
    """Ensure prefetch mode doesn't break existing functionality.

    Apart from the two pure-configuration tests, every test runs a crawl
    with prefetch left disabled and checks that the regular outputs are
    still populated. Tests that pass ``TEST_URL`` to ``crawler.arun``
    fetch a real page.
    """

    @pytest.mark.asyncio
    async def test_default_mode_unchanged(self):
        """Test that default mode (prefetch=False) works exactly as before."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()  # Default config
            result = await crawler.arun(TEST_URL, config=config)

            # Check success first so a failed crawl is reported as such
            # rather than as a missing field.
            assert result.success is True
            # All standard fields should be populated
            assert result.html is not None
            assert result.cleaned_html is not None
            assert result.links is not None

    @pytest.mark.asyncio
    async def test_explicit_prefetch_false(self):
        """Test explicit prefetch=False works like default."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=False)
            result = await crawler.arun(TEST_URL, config=config)

            assert result.cleaned_html is not None

    def test_config_clone_preserves_prefetch(self):
        """Test that config.clone() preserves prefetch setting.

        Synchronous on purpose: nothing here is awaited.
        """
        config = CrawlerRunConfig(prefetch=True)
        cloned = config.clone()
        assert cloned.prefetch is True

        # Clone with override
        cloned_false = config.clone(prefetch=False)
        assert cloned_false.prefetch is False

    def test_config_to_dict_includes_prefetch(self):
        """Test that to_dict() includes prefetch.

        Synchronous on purpose: nothing here is awaited.
        """
        config_true = CrawlerRunConfig(prefetch=True)
        config_false = CrawlerRunConfig(prefetch=False)

        assert config_true.to_dict()["prefetch"] is True
        assert config_false.to_dict()["prefetch"] is False

    @pytest.mark.asyncio
    async def test_existing_extraction_still_works(self):
        """Test that extraction strategies still work in normal mode."""
        from crawl4ai import JsonCssExtractionStrategy

        # One extracted item per anchor element, carrying its href and text.
        schema = {
            "name": "Links",
            "baseSelector": "a",
            "fields": [
                {"name": "href", "selector": "", "type": "attribute", "attribute": "href"},
                {"name": "text", "selector": "", "type": "text"},
            ],
        }

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                extraction_strategy=JsonCssExtractionStrategy(schema=schema)
            )
            result = await crawler.arun(TEST_URL, config=config)

            assert result.extracted_content is not None

    @pytest.mark.asyncio
    async def test_existing_deep_crawl_still_works(self):
        """Test that deep crawl without prefetch still does full processing."""
        from crawl4ai import BFSDeepCrawlStrategy

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=1,
                    max_pages=2,
                )
                # No prefetch - should do full processing
            )
            result_container = await crawler.arun(TEST_URL, config=config)

            # Handle both list and iterator results
            if hasattr(result_container, "__aiter__"):
                results = [r async for r in result_container]
            elif hasattr(result_container, "__iter__"):
                # NOTE(review): a single result object that is itself
                # iterable would be expanded by list() here - confirm that
                # arun never returns one for a deep crawl.
                results = list(result_container)
            else:
                results = [result_container]

            # Check for at least one result before the per-result loop;
            # on an empty list that loop would pass without checking anything.
            assert len(results) >= 1

            # Each result should have full processing
            for result in results:
                assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_raw_url_scheme_still_works(self):
        """Test that raw: URL scheme works for processing stored HTML."""
        # Spelled out line by line so the string handed to the crawler is
        # independent of how this method is indented.
        sample_html = (
            "\n"
            "<html>\n"
            "<head><title>Test Page</title></head>\n"
            "<body>\n"
            "<h1>Hello World</h1>\n"
            "<p>This is a test paragraph.</p>\n"
            '<a href="/link1">Link 1</a>\n'
            "</body>\n"
            "</html>\n"
        )

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()
            result = await crawler.arun(f"raw:{sample_html}", config=config)

            assert result.success is True
            assert result.html is not None
            assert "Hello World" in result.html
            assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_screenshot_still_works(self):
        """Test that screenshot option still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(screenshot=True)
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True

            # Screenshot data should be present under either attribute name.
            # The fallback is read with getattr so that a result object
            # without `screenshot_data` produces an assertion failure
            # instead of an AttributeError.
            screenshot = result.screenshot
            if screenshot is None:
                screenshot = getattr(result, "screenshot_data", None)
            assert screenshot is not None

    @pytest.mark.asyncio
    async def test_js_execution_still_works(self):
        """Test that JavaScript execution still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                js_code="document.querySelector('h1')?.textContent"
            )
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True
            assert result.html is not None
class TestPrefetchDoesNotAffectOtherModes:
    """Checks that prefetch does not interfere with other configurations."""

    @pytest.mark.asyncio
    async def test_prefetch_with_other_options_ignored(self):
        """Prefetch combined with processing options still yields only HTML and links."""
        # Everything besides `prefetch` is expected to be ignored in prefetch mode.
        options = {
            "prefetch": True,
            "screenshot": True,
            "pdf": True,
            "only_text": True,
            "word_count_threshold": 100,
        }

        async with AsyncWebCrawler() as crawler:
            run_config = CrawlerRunConfig(**options)
            outcome = await crawler.arun(TEST_URL, config=run_config)

            # Raw HTML and the link set must still come back...
            assert outcome.html is not None
            assert outcome.links is not None

            # ...while the processed outputs stay unset.
            assert outcome.cleaned_html is None
            assert outcome.extracted_content is None

    @pytest.mark.asyncio
    async def test_stream_mode_still_works(self):
        """A config with stream=True still produces a successful result."""
        async with AsyncWebCrawler() as crawler:
            outcome = await crawler.arun(
                TEST_URL,
                config=CrawlerRunConfig(stream=True),
            )

            assert outcome.success is True
            assert outcome.html is not None

    @pytest.mark.asyncio
    async def test_cache_mode_still_works(self):
        """Crawling succeeds with the cache bypassed and then enabled."""
        from crawl4ai import CacheMode

        async with AsyncWebCrawler() as crawler:
            # First pass bypasses the cache; the second pass runs with it enabled.
            for mode in (CacheMode.BYPASS, CacheMode.ENABLED):
                run_config = CrawlerRunConfig(cache_mode=mode)
                outcome = await crawler.arun(TEST_URL, config=run_config)
                assert outcome.success is True
class TestBackwardsCompatibility:
    """Test backwards compatibility with existing code patterns.

    Both tests build a config without mentioning ``prefetch`` and check
    that the attribute ends up as ``False``.
    """

    @pytest.mark.asyncio
    async def test_config_without_prefetch_works(self):
        """Test that configs created without prefetch parameter work."""
        # Simulating old code that doesn't know about prefetch
        config = CrawlerRunConfig(
            word_count_threshold=50,
            css_selector="body",
        )

        # Should default to prefetch=False. The identity check matches the
        # `is True` style used for `success` below.
        assert config.prefetch is False

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True
            assert result.cleaned_html is not None

    def test_from_kwargs_without_prefetch(self):
        """Test CrawlerRunConfig.from_kwargs works without prefetch.

        Synchronous on purpose: nothing here is awaited.
        """
        config = CrawlerRunConfig.from_kwargs({
            "word_count_threshold": 50,
            "verbose": False,
        })

        assert config.prefetch is False