Add prefetch mode for two-phase deep crawling
- Add `prefetch` parameter to CrawlerRunConfig
- Add `quick_extract_links()` function for fast link extraction
- Add short-circuit in aprocess_html() for prefetch mode
- Add 42 tests (unit, integration, regression)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
232
tests/test_prefetch_regression.py
Normal file
232
tests/test_prefetch_regression.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""Regression tests to ensure prefetch mode doesn't break existing functionality."""
|
||||
|
||||
import pytest
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
# URL passed to crawler.arun() by the crawler-backed tests in this module.
TEST_URL = "https://docs.crawl4ai.com"
|
||||
|
||||
|
||||
class TestNoRegressions:
    """Ensure prefetch mode doesn't break existing functionality.

    Most tests run a real ``AsyncWebCrawler`` against ``TEST_URL``; the
    config-only tests exercise ``CrawlerRunConfig`` directly and need no
    crawler or event loop.
    """

    @pytest.mark.asyncio
    async def test_default_mode_unchanged(self):
        """Test that default mode (no prefetch argument) populates the standard fields."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()  # Default config
            result = await crawler.arun(TEST_URL, config=config)

            # All standard fields should be populated
            assert result.html is not None
            assert result.cleaned_html is not None
            assert result.links is not None
            assert result.success is True

    @pytest.mark.asyncio
    async def test_explicit_prefetch_false(self):
        """Test explicit prefetch=False works like default."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=False)
            result = await crawler.arun(TEST_URL, config=config)

            assert result.cleaned_html is not None

    def test_config_clone_preserves_prefetch(self):
        """Test that config.clone() preserves prefetch and honours an override."""
        config = CrawlerRunConfig(prefetch=True)
        cloned = config.clone()

        # Identity checks: the flag must be a real bool, not merely truthy.
        assert cloned.prefetch is True

        # Clone with override
        cloned_false = config.clone(prefetch=False)
        assert cloned_false.prefetch is False

    def test_config_to_dict_includes_prefetch(self):
        """Test that to_dict() includes prefetch for both flag values."""
        config_true = CrawlerRunConfig(prefetch=True)
        config_false = CrawlerRunConfig(prefetch=False)

        assert config_true.to_dict()["prefetch"] is True
        assert config_false.to_dict()["prefetch"] is False

    @pytest.mark.asyncio
    async def test_existing_extraction_still_works(self):
        """Test that extraction strategies still work in normal mode."""
        from crawl4ai import JsonCssExtractionStrategy

        schema = {
            "name": "Links",
            "baseSelector": "a",
            "fields": [
                {"name": "href", "selector": "", "type": "attribute", "attribute": "href"},
                {"name": "text", "selector": "", "type": "text"},
            ],
        }

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                extraction_strategy=JsonCssExtractionStrategy(schema=schema)
            )
            result = await crawler.arun(TEST_URL, config=config)

            assert result.extracted_content is not None

    @pytest.mark.asyncio
    async def test_existing_deep_crawl_still_works(self):
        """Test that deep crawl without prefetch still does full processing."""
        from crawl4ai import BFSDeepCrawlStrategy

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=1,
                    max_pages=2,
                )
                # No prefetch - should do full processing
            )

            result_container = await crawler.arun(TEST_URL, config=config)

            # Handle async-iterator, plain-iterable and single-result returns
            if hasattr(result_container, "__aiter__"):
                results = [r async for r in result_container]
            elif hasattr(result_container, "__iter__"):
                results = list(result_container)
            else:
                results = [result_container]

            # Check for at least one result first so an empty crawl fails on
            # this explicit assertion, not by skipping the loop below.
            assert len(results) >= 1

            # Each result should have full processing
            for result in results:
                assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_raw_url_scheme_still_works(self):
        """Test that raw: URL scheme works for processing stored HTML."""
        sample_html = """
        <html>
        <head><title>Test Page</title></head>
        <body>
        <h1>Hello World</h1>
        <p>This is a test paragraph.</p>
        <a href="/link1">Link 1</a>
        </body>
        </html>
        """

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()
            result = await crawler.arun(f"raw:{sample_html}", config=config)

            assert result.success is True
            assert result.html is not None
            assert "Hello World" in result.html
            assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_screenshot_still_works(self):
        """Test that screenshot option still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(screenshot=True)
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True

            # Screenshot data should be present under either attribute name.
            # NOTE(review): screenshot_data may not exist on the result type -
            # getattr keeps a missing attribute from surfacing as
            # AttributeError instead of a plain assertion failure. Confirm
            # which attribute the result model actually exposes.
            screenshot = result.screenshot
            if screenshot is None:
                screenshot = getattr(result, "screenshot_data", None)
            assert screenshot is not None

    @pytest.mark.asyncio
    async def test_js_execution_still_works(self):
        """Test that JavaScript execution still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                js_code="document.querySelector('h1')?.textContent"
            )
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True
            assert result.html is not None
|
||||
|
||||
|
||||
class TestPrefetchDoesNotAffectOtherModes:
    """Checks that prefetch and the other run-config options coexist."""

    @pytest.mark.asyncio
    async def test_prefetch_with_other_options_ignored(self):
        """Prefetch combined with other options still returns only HTML and links."""
        # Everything besides prefetch is expected to be ignored in prefetch mode.
        run_config = CrawlerRunConfig(
            prefetch=True,
            screenshot=True,
            pdf=True,
            only_text=True,
            word_count_threshold=100,
        )

        async with AsyncWebCrawler() as crawler:
            outcome = await crawler.arun(TEST_URL, config=run_config)

        # Raw HTML and links are still returned...
        assert outcome.html is not None
        assert outcome.links is not None

        # ...but no processed content is produced.
        assert outcome.cleaned_html is None
        assert outcome.extracted_content is None

    @pytest.mark.asyncio
    async def test_stream_mode_still_works(self):
        """A stream=True config still produces a successful result with HTML."""
        run_config = CrawlerRunConfig(stream=True)

        async with AsyncWebCrawler() as crawler:
            outcome = await crawler.arun(TEST_URL, config=run_config)

        assert outcome.success is True
        assert outcome.html is not None

    @pytest.mark.asyncio
    async def test_cache_mode_still_works(self):
        """Crawling succeeds with the cache bypassed and then with it enabled."""
        from crawl4ai import CacheMode

        bypass_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        enabled_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

        async with AsyncWebCrawler() as crawler:
            # First request goes out with the cache bypassed.
            first = await crawler.arun(TEST_URL, config=bypass_config)
            assert first.success is True

            # Second request runs with the cache enabled and must also succeed.
            second = await crawler.arun(TEST_URL, config=enabled_config)
            assert second.success is True
|
||||
|
||||
|
||||
class TestBackwardsCompatibility:
    """Test backwards compatibility with existing code patterns.

    Covers configs built without any ``prefetch`` argument, both through the
    constructor and through ``CrawlerRunConfig.from_kwargs``.
    """

    @pytest.mark.asyncio
    async def test_config_without_prefetch_works(self):
        """Test that configs created without prefetch parameter work."""
        # Simulating old code that doesn't know about prefetch
        config = CrawlerRunConfig(
            word_count_threshold=50,
            css_selector="body",
        )

        # Should default to prefetch=False; the identity check also rejects
        # a falsy non-bool such as None or 0.
        assert config.prefetch is False

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True
            assert result.cleaned_html is not None

    def test_from_kwargs_without_prefetch(self):
        """Test CrawlerRunConfig.from_kwargs works without prefetch."""
        config = CrawlerRunConfig.from_kwargs({
            "word_count_threshold": 50,
            "verbose": False,
        })

        assert config.prefetch is False
|
||||
Reference in New Issue
Block a user