- Add `prefetch` parameter to CrawlerRunConfig
- Add `quick_extract_links()` function for fast link extraction
- Add short-circuit in aprocess_html() for prefetch mode
- Add 42 tests (unit, integration, regression)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
233 lines
8.5 KiB
Python
233 lines
8.5 KiB
Python
"""Regression tests to ensure prefetch mode doesn't break existing functionality."""
|
|
|
|
import pytest
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|
|
|
# Crawl target shared by the tests in this file; they pass it to crawler.arun().
TEST_URL = "https://docs.crawl4ai.com"
|
|
|
|
|
|
class TestNoRegressions:
    """Ensure prefetch mode doesn't break existing functionality.

    Most tests drive a real ``AsyncWebCrawler`` against ``TEST_URL`` with
    prefetch left off and check that the usual result fields are still
    populated. The config-only tests never start a crawler.
    """

    @pytest.mark.asyncio
    async def test_default_mode_unchanged(self):
        """Test that default mode (prefetch=False) works exactly as before."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()  # Default config
            result = await crawler.arun(TEST_URL, config=config)

            # All standard fields should be populated
            assert result.html is not None
            assert result.cleaned_html is not None
            assert result.links is not None
            assert result.success is True

    @pytest.mark.asyncio
    async def test_explicit_prefetch_false(self):
        """Test explicit prefetch=False works like default."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=False)
            result = await crawler.arun(TEST_URL, config=config)

            assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_config_clone_preserves_prefetch(self):
        """Test that config.clone() preserves prefetch setting."""
        config = CrawlerRunConfig(prefetch=True)
        cloned = config.clone()

        # Identity checks: the flag must be a real bool, not merely truthy.
        assert cloned.prefetch is True

        # Clone with override
        cloned_false = config.clone(prefetch=False)
        assert cloned_false.prefetch is False

    @pytest.mark.asyncio
    async def test_config_to_dict_includes_prefetch(self):
        """Test that to_dict() includes prefetch."""
        config_true = CrawlerRunConfig(prefetch=True)
        config_false = CrawlerRunConfig(prefetch=False)

        assert config_true.to_dict()["prefetch"] is True
        assert config_false.to_dict()["prefetch"] is False

    @pytest.mark.asyncio
    async def test_existing_extraction_still_works(self):
        """Test that extraction strategies still work in normal mode."""
        from crawl4ai import JsonCssExtractionStrategy

        # Schema selecting every anchor and reading its href and text.
        schema = {
            "name": "Links",
            "baseSelector": "a",
            "fields": [
                {"name": "href", "selector": "", "type": "attribute", "attribute": "href"},
                {"name": "text", "selector": "", "type": "text"},
            ],
        }

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                extraction_strategy=JsonCssExtractionStrategy(schema=schema)
            )
            result = await crawler.arun(TEST_URL, config=config)

            assert result.extracted_content is not None

    @pytest.mark.asyncio
    async def test_existing_deep_crawl_still_works(self):
        """Test that deep crawl without prefetch still does full processing."""
        from crawl4ai import BFSDeepCrawlStrategy

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=1,
                    max_pages=2,
                )
                # No prefetch - should do full processing
            )

            result_container = await crawler.arun(TEST_URL, config=config)

            # Handle both list and iterator results
            if hasattr(result_container, "__aiter__"):
                results = [r async for r in result_container]
            elif hasattr(result_container, "__iter__"):
                results = list(result_container)
            else:
                results = [result_container]

            # Check non-emptiness first: with no results the loop below
            # would pass vacuously.
            assert len(results) >= 1

            # Each result should have full processing
            for result in results:
                assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_raw_url_scheme_still_works(self):
        """Test that raw: URL scheme works for processing stored HTML."""
        sample_html = """
        <html>
        <head><title>Test Page</title></head>
        <body>
        <h1>Hello World</h1>
        <p>This is a test paragraph.</p>
        <a href="/link1">Link 1</a>
        </body>
        </html>
        """

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()
            result = await crawler.arun(f"raw:{sample_html}", config=config)

            assert result.success is True
            assert result.html is not None
            assert "Hello World" in result.html
            assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_screenshot_still_works(self):
        """Test that screenshot option still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(screenshot=True)
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True
            # Screenshot data should be present.
            # NOTE(review): `screenshot_data` may not exist on the result
            # type; read it defensively so a missing screenshot surfaces as
            # an assertion failure rather than an AttributeError.
            legacy_screenshot = getattr(result, "screenshot_data", None)
            assert result.screenshot is not None or legacy_screenshot is not None

    @pytest.mark.asyncio
    async def test_js_execution_still_works(self):
        """Test that JavaScript execution still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                js_code="document.querySelector('h1')?.textContent"
            )
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True
            assert result.html is not None
|
|
|
|
|
|
class TestPrefetchDoesNotAffectOtherModes:
    """Check that prefetch does not interfere with other run configurations."""

    @pytest.mark.asyncio
    async def test_prefetch_with_other_options_ignored(self):
        """Prefetch should return HTML and links but no processed content."""
        # Options the test expects prefetch mode to disregard.
        ignored_options = {
            "screenshot": True,
            "pdf": True,
            "only_text": True,
            "word_count_threshold": 100,
        }

        async with AsyncWebCrawler() as crawler:
            run_config = CrawlerRunConfig(prefetch=True, **ignored_options)
            outcome = await crawler.arun(TEST_URL, config=run_config)

            # Raw HTML and links are still expected.
            assert outcome.html is not None
            assert outcome.links is not None

            # Processed content must be absent.
            assert outcome.cleaned_html is None
            assert outcome.extracted_content is None

    @pytest.mark.asyncio
    async def test_stream_mode_still_works(self):
        """A config with stream=True should still crawl successfully."""
        async with AsyncWebCrawler() as crawler:
            outcome = await crawler.arun(
                TEST_URL,
                config=CrawlerRunConfig(stream=True),
            )

            assert outcome.success is True
            assert outcome.html is not None

    @pytest.mark.asyncio
    async def test_cache_mode_still_works(self):
        """Crawling with cache bypassed and then enabled should both succeed."""
        from crawl4ai import CacheMode

        async with AsyncWebCrawler() as crawler:
            # First request bypasses the cache; the second runs with it enabled.
            for mode in (CacheMode.BYPASS, CacheMode.ENABLED):
                run_config = CrawlerRunConfig(cache_mode=mode)
                outcome = await crawler.arun(TEST_URL, config=run_config)
                assert outcome.success is True
|
|
|
|
|
|
class TestBackwardsCompatibility:
    """Test backwards compatibility with existing code patterns.

    Both tests build a config without mentioning ``prefetch`` and check that
    the attribute comes back as ``False``.
    """

    @pytest.mark.asyncio
    async def test_config_without_prefetch_works(self):
        """Test that configs created without prefetch parameter work."""
        # Simulating old code that doesn't know about prefetch
        config = CrawlerRunConfig(
            word_count_threshold=50,
            css_selector="body",
        )

        # Should default to prefetch=False; the identity check also rejects
        # falsy non-bool values such as 0 or None.
        assert config.prefetch is False

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(TEST_URL, config=config)
            assert result.success is True
            assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_from_kwargs_without_prefetch(self):
        """Test CrawlerRunConfig.from_kwargs works without prefetch."""
        config = CrawlerRunConfig.from_kwargs({
            "word_count_threshold": 50,
            "verbose": False,
        })

        assert config.prefetch is False
|