Add prefetch mode for two-phase deep crawling
- Add `prefetch` parameter to CrawlerRunConfig
- Add `quick_extract_links()` function for fast link extraction
- Add short-circuit in aprocess_html() for prefetch mode
- Add 42 tests (unit, integration, regression)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
236
tests/test_prefetch_integration.py
Normal file
236
tests/test_prefetch_integration.py
Normal file
@@ -0,0 +1,236 @@
|
||||
"""Integration tests for prefetch mode with the crawler."""
|
||||
|
||||
import pytest
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
|
||||
|
||||
# Use crawl4ai docs as test domain
|
||||
TEST_DOMAIN = "https://docs.crawl4ai.com"
|
||||
|
||||
|
||||
class TestPrefetchModeIntegration:
    """Integration tests for prefetch mode (fetch HTML + links, skip processing)."""

    @pytest.mark.asyncio
    async def test_prefetch_returns_html_and_links(self):
        """Test that prefetch mode returns HTML and links only."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=True)
            result = await crawler.arun(TEST_DOMAIN, config=config)

            # Should have raw HTML
            assert result.html is not None
            assert len(result.html) > 0
            assert "<html" in result.html.lower() or "<!doctype" in result.html.lower()

            # Should have a links dict with both buckets
            assert result.links is not None
            assert "internal" in result.links
            assert "external" in result.links

            # Should NOT have processed content: markdown is absent or empty
            assert result.markdown is None or (
                hasattr(result.markdown, 'raw_markdown') and
                result.markdown.raw_markdown is None
            )
            assert result.cleaned_html is None
            assert result.extracted_content is None

    @pytest.mark.asyncio
    async def test_prefetch_preserves_metadata(self):
        """Test that prefetch mode preserves essential metadata."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=True)
            result = await crawler.arun(TEST_DOMAIN, config=config)

            # Should have success flag
            assert result.success is True

            # Should have URL
            assert result.url is not None

            # Status code should be present.  (The original
            # `is not None or == 200` made the second clause dead code.)
            assert result.status_code is not None

    @pytest.mark.asyncio
    async def test_prefetch_with_deep_crawl(self):
        """Test prefetch mode combined with a BFS deep-crawl strategy."""
        from crawl4ai import BFSDeepCrawlStrategy

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                prefetch=True,
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=1,
                    max_pages=3
                )
            )

            result_container = await crawler.arun(TEST_DOMAIN, config=config)

            # Handle async-iterator, sync-iterable, and single-result returns
            if hasattr(result_container, '__aiter__'):
                results = [r async for r in result_container]
            else:
                results = list(result_container) if hasattr(result_container, '__iter__') else [result_container]

            # Each crawled page should carry raw HTML and extracted links
            for result in results:
                assert result.html is not None
                assert result.links is not None

            # Should have crawled at least one page
            assert len(results) >= 1

    @pytest.mark.asyncio
    async def test_prefetch_then_process_with_raw(self):
        """Test the full two-phase workflow: prefetch, then process via raw: URL."""
        async with AsyncWebCrawler() as crawler:
            # Phase 1: fast fetch, keep the raw HTML
            prefetch_config = CrawlerRunConfig(prefetch=True)
            prefetch_result = await crawler.arun(TEST_DOMAIN, config=prefetch_config)

            stored_html = prefetch_result.html
            assert stored_html is not None
            assert len(stored_html) > 0

            # Phase 2: full processing of the stored HTML via the raw: scheme
            process_config = CrawlerRunConfig(
                # No prefetch - full processing
                base_url=TEST_DOMAIN  # Provide base URL for link resolution
            )
            processed_result = await crawler.arun(
                f"raw:{stored_html}",
                config=process_config
            )

            # Should now have full processing
            assert processed_result.html is not None
            assert processed_result.success is True
            # Note: cleaned_html and markdown depend on the content

    @pytest.mark.asyncio
    async def test_prefetch_links_structure(self):
        """Test that extracted links carry href/text keys and absolute URLs."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=True)
            result = await crawler.arun(TEST_DOMAIN, config=config)

            assert result.links is not None

            # Same structural contract for both buckets (checked when non-empty)
            for bucket in ("internal", "external"):
                if result.links[bucket]:
                    link = result.links[bucket][0]
                    assert "href" in link
                    assert "text" in link
                    assert link["href"].startswith("http")

    def test_prefetch_config_clone(self):
        """Test that config.clone() preserves and can override prefetch.

        Pure config test — no event loop needed, so this is a plain sync test.
        """
        config = CrawlerRunConfig(prefetch=True)
        assert config.clone().prefetch is True

        # Clone with override
        assert config.clone(prefetch=False).prefetch is False

    def test_prefetch_to_dict(self):
        """Test that to_dict() includes prefetch (sync — no crawler involved)."""
        config_dict = CrawlerRunConfig(prefetch=True).to_dict()

        assert "prefetch" in config_dict
        assert config_dict["prefetch"] is True

    def test_prefetch_default_false(self):
        """Test that prefetch defaults to False (sync — no crawler involved)."""
        assert CrawlerRunConfig().prefetch is False

    @pytest.mark.asyncio
    async def test_prefetch_explicit_false(self):
        """Test explicit prefetch=False behaves like the default (full processing)."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=False)
            result = await crawler.arun(TEST_DOMAIN, config=config)

            # Should have full processing
            assert result.html is not None
            # cleaned_html is only populated in normal (non-prefetch) mode
            assert result.cleaned_html is not None
|
||||
class TestPrefetchPerformance:
    """Performance-related tests for prefetch mode."""

    @pytest.mark.asyncio
    async def test_prefetch_returns_quickly(self):
        """Prefetch should not be slower than full processing (soft check).

        Timings depend on network and page content, so nothing hard is
        asserted — the measurements are printed for debugging only.
        """
        import time

        async with AsyncWebCrawler() as crawler:
            # Prefetch mode.  perf_counter() is monotonic, unlike time.time(),
            # which can jump backwards under clock adjustments.
            start = time.perf_counter()
            prefetch_config = CrawlerRunConfig(prefetch=True)
            await crawler.arun(TEST_DOMAIN, config=prefetch_config)
            prefetch_time = time.perf_counter() - start

            # Full mode
            start = time.perf_counter()
            full_config = CrawlerRunConfig()
            await crawler.arun(TEST_DOMAIN, config=full_config)
            full_time = time.perf_counter() - start

            # Log times for debugging
            print(f"\nPrefetch: {prefetch_time:.3f}s, Full: {full_time:.3f}s")

            # Prefetch should not be significantly slower
            # (may be same or slightly faster depending on content)
            # This is a soft check - mostly for logging
||||
class TestPrefetchWithRawHTML:
    """Prefetch mode driven by the raw: URL scheme instead of a live fetch."""

    @pytest.mark.asyncio
    async def test_prefetch_with_raw_html(self):
        """Prefetch over inline HTML still yields the document plus classified links."""
        sample_html = """
        <html>
        <head><title>Test Page</title></head>
        <body>
            <h1>Hello World</h1>
            <a href="/link1">Link 1</a>
            <a href="/link2">Link 2</a>
            <a href="https://external.com/page">External</a>
        </body>
        </html>
        """

        async with AsyncWebCrawler() as crawler:
            run_config = CrawlerRunConfig(
                prefetch=True,
                base_url="https://example.com"
            )
            outcome = await crawler.arun(f"raw:{sample_html}", config=run_config)

            assert outcome.success is True
            assert outcome.html is not None
            assert outcome.links is not None

            # Two relative links resolve against base_url (internal); the
            # absolute external.com link lands in the external bucket.
            assert len(outcome.links["internal"]) >= 2
            assert len(outcome.links["external"]) >= 1
275
tests/test_prefetch_mode.py
Normal file
275
tests/test_prefetch_mode.py
Normal file
@@ -0,0 +1,275 @@
|
||||
"""Unit tests for the quick_extract_links function used in prefetch mode."""
|
||||
|
||||
import pytest
|
||||
from crawl4ai.utils import quick_extract_links
|
||||
|
||||
|
||||
class TestQuickExtractLinks:
    """Unit tests for the quick_extract_links helper."""

    BASE = "https://example.com"

    def test_basic_internal_links(self):
        """Relative and same-domain absolute links all land in the internal bucket."""
        markup = '''
        <html>
        <body>
            <a href="/page1">Page 1</a>
            <a href="/page2">Page 2</a>
            <a href="https://example.com/page3">Page 3</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 3
        first = links["internal"][0]
        assert first["href"] == "https://example.com/page1"
        assert first["text"] == "Page 1"

    def test_external_links(self):
        """Links on a different domain are classified as external."""
        markup = '''
        <html>
        <body>
            <a href="https://other.com/page">External</a>
            <a href="/internal">Internal</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 1
        assert len(links["external"]) == 1
        assert links["external"][0]["href"] == "https://other.com/page"

    def test_ignores_javascript_and_mailto(self):
        """Non-HTTP schemes (javascript:, mailto:, tel:) are skipped entirely."""
        markup = '''
        <html>
        <body>
            <a href="javascript:void(0)">Click</a>
            <a href="mailto:test@example.com">Email</a>
            <a href="tel:+1234567890">Call</a>
            <a href="/valid">Valid</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 1
        assert links["internal"][0]["href"] == "https://example.com/valid"

    def test_ignores_anchor_only_links(self):
        """Pure fragment links (#section) are dropped; path + fragment survives."""
        markup = '''
        <html>
        <body>
            <a href="#section1">Section 1</a>
            <a href="#section2">Section 2</a>
            <a href="/page#section">Page with anchor</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        # Only the link with a real path is kept
        assert len(links["internal"]) == 1
        assert "/page" in links["internal"][0]["href"]

    def test_deduplication(self):
        """Repeated URLs collapse to a single entry."""
        markup = '''
        <html>
        <body>
            <a href="/page">Link 1</a>
            <a href="/page">Link 2</a>
            <a href="/page">Link 3</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 1

    def test_handles_malformed_html(self):
        """Malformed markup produces empty buckets rather than raising."""
        links = quick_extract_links("not valid html at all <><><", self.BASE)

        assert links["internal"] == []
        assert links["external"] == []

    def test_empty_html(self):
        """An empty document yields the canonical empty result."""
        assert quick_extract_links("", self.BASE) == {"internal": [], "external": []}

    def test_relative_url_resolution(self):
        """Plain, dot, and parent relative paths resolve against the base URL."""
        markup = '''
        <html>
        <body>
            <a href="page1.html">Relative</a>
            <a href="./page2.html">Dot Relative</a>
            <a href="../page3.html">Parent Relative</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, "https://example.com/docs/")

        assert len(links["internal"]) >= 1
        # Every resolved URL stays on the base domain
        for entry in links["internal"]:
            assert entry["href"].startswith("https://example.com")

    def test_text_truncation(self):
        """Link text longer than 200 characters is cut to exactly 200."""
        padding = "A" * 300
        markup = f'''
        <html>
        <body>
            <a href="/page">{padding}</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 1
        assert len(links["internal"][0]["text"]) == 200

    def test_empty_href_ignored(self):
        """Empty or whitespace-only href attributes are discarded."""
        markup = '''
        <html>
        <body>
            <a href="">Empty</a>
            <a href=" ">Whitespace</a>
            <a href="/valid">Valid</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 1
        assert links["internal"][0]["href"] == "https://example.com/valid"

    def test_mixed_internal_external(self):
        """Internal and external links are partitioned correctly in one page."""
        markup = '''
        <html>
        <body>
            <a href="/internal1">Internal 1</a>
            <a href="https://example.com/internal2">Internal 2</a>
            <a href="https://google.com">Google</a>
            <a href="https://github.com/repo">GitHub</a>
            <a href="/internal3">Internal 3</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 3
        assert len(links["external"]) == 2

    def test_subdomain_handling(self):
        """Subdomain links are classified without being lost."""
        markup = '''
        <html>
        <body>
            <a href="https://docs.example.com/page">Docs subdomain</a>
            <a href="https://api.example.com/v1">API subdomain</a>
            <a href="https://example.com/main">Main domain</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        # Every link is accounted for in one bucket or the other
        assert len(links["internal"]) + len(links["external"]) == 3
||||
class TestQuickExtractLinksEdgeCases:
    """Edge-case coverage for quick_extract_links."""

    def test_no_links_in_page(self):
        """A page without anchors yields two empty buckets."""
        markup = '''
        <html>
        <body>
            <h1>No Links Here</h1>
            <p>Just some text content.</p>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, "https://example.com")

        assert links["internal"] == []
        assert links["external"] == []

    def test_links_in_nested_elements(self):
        """Anchors are found regardless of how deeply they are nested."""
        markup = '''
        <html>
        <body>
            <nav>
                <ul>
                    <li><a href="/home">Home</a></li>
                    <li><a href="/about">About</a></li>
                </ul>
            </nav>
            <div class="content">
                <p>Check out <a href="/products">our products</a>.</p>
            </div>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, "https://example.com")

        assert len(links["internal"]) == 3

    def test_link_with_nested_elements(self):
        """Text inside child elements of an anchor is included in its text."""
        markup = '''
        <html>
        <body>
            <a href="/page"><span>Nested</span> <strong>Text</strong></a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, "https://example.com")

        assert len(links["internal"]) == 1
        anchor_text = links["internal"][0]["text"]
        assert "Nested" in anchor_text
        assert "Text" in anchor_text

    def test_protocol_relative_urls(self):
        """Protocol-relative URLs (//host/path) are resolved and kept."""
        markup = '''
        <html>
        <body>
            <a href="//cdn.example.com/asset">CDN Link</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, "https://example.com")

        # Resolved with the base scheme; classified in one bucket or the other
        assert len(links["internal"]) + len(links["external"]) >= 1

    def test_whitespace_in_href(self):
        """Whitespace (including newlines) around href values is tolerated."""
        markup = '''
        <html>
        <body>
            <a href=" /page1 ">Padded</a>
            <a href="
            /page2
            ">Multiline</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, "https://example.com")

        # Both should be extracted and normalized
        assert len(links["internal"]) >= 1
232
tests/test_prefetch_regression.py
Normal file
232
tests/test_prefetch_regression.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""Regression tests to ensure prefetch mode doesn't break existing functionality."""
|
||||
|
||||
import pytest
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
TEST_URL = "https://docs.crawl4ai.com"
|
||||
|
||||
|
||||
class TestNoRegressions:
    """Ensure prefetch mode doesn't break existing functionality."""

    @pytest.mark.asyncio
    async def test_default_mode_unchanged(self):
        """Test that default mode (prefetch=False) works exactly as before."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()  # Default config
            result = await crawler.arun(TEST_URL, config=config)

            # All standard fields should be populated
            assert result.html is not None
            assert result.cleaned_html is not None
            assert result.links is not None
            assert result.success is True

    @pytest.mark.asyncio
    async def test_explicit_prefetch_false(self):
        """Test explicit prefetch=False works like default."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=False)
            result = await crawler.arun(TEST_URL, config=config)

            assert result.cleaned_html is not None

    def test_config_clone_preserves_prefetch(self):
        """Test that config.clone() preserves prefetch.

        Pure config test — no event loop needed, so this is a plain sync test.
        """
        config = CrawlerRunConfig(prefetch=True)
        assert config.clone().prefetch is True

        # Clone with override
        assert config.clone(prefetch=False).prefetch is False

    def test_config_to_dict_includes_prefetch(self):
        """Test that to_dict() round-trips both prefetch values (sync test)."""
        config_true = CrawlerRunConfig(prefetch=True)
        config_false = CrawlerRunConfig(prefetch=False)

        assert config_true.to_dict()["prefetch"] is True
        assert config_false.to_dict()["prefetch"] is False

    @pytest.mark.asyncio
    async def test_existing_extraction_still_works(self):
        """Test that extraction strategies still work in normal mode."""
        from crawl4ai import JsonCssExtractionStrategy

        schema = {
            "name": "Links",
            "baseSelector": "a",
            "fields": [
                {"name": "href", "selector": "", "type": "attribute", "attribute": "href"},
                {"name": "text", "selector": "", "type": "text"}
            ]
        }

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                extraction_strategy=JsonCssExtractionStrategy(schema=schema)
            )
            result = await crawler.arun(TEST_URL, config=config)

            assert result.extracted_content is not None

    @pytest.mark.asyncio
    async def test_existing_deep_crawl_still_works(self):
        """Test that deep crawl without prefetch still does full processing."""
        from crawl4ai import BFSDeepCrawlStrategy

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=1,
                    max_pages=2
                )
                # No prefetch - should do full processing
            )

            result_container = await crawler.arun(TEST_URL, config=config)

            # Handle async-iterator, sync-iterable, and single-result returns
            if hasattr(result_container, '__aiter__'):
                results = [r async for r in result_container]
            else:
                results = list(result_container) if hasattr(result_container, '__iter__') else [result_container]

            # Each result should have full processing
            for result in results:
                assert result.cleaned_html is not None

            assert len(results) >= 1

    @pytest.mark.asyncio
    async def test_raw_url_scheme_still_works(self):
        """Test that raw: URL scheme works for processing stored HTML."""
        sample_html = """
        <html>
        <head><title>Test Page</title></head>
        <body>
            <h1>Hello World</h1>
            <p>This is a test paragraph.</p>
            <a href="/link1">Link 1</a>
        </body>
        </html>
        """

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()
            result = await crawler.arun(f"raw:{sample_html}", config=config)

            assert result.success is True
            assert result.html is not None
            assert "Hello World" in result.html
            assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_screenshot_still_works(self):
        """Test that screenshot option still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(screenshot=True)
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True
            # Screenshot data should be present (field name varies by version)
            assert result.screenshot is not None or result.screenshot_data is not None

    @pytest.mark.asyncio
    async def test_js_execution_still_works(self):
        """Test that JavaScript execution still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                js_code="document.querySelector('h1')?.textContent"
            )
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True
            assert result.html is not None
|
||||
class TestPrefetchDoesNotAffectOtherModes:
    """Prefetch must not interfere with unrelated configuration options."""

    @pytest.mark.asyncio
    async def test_prefetch_with_other_options_ignored(self):
        """Options that only matter during processing are ignored under prefetch."""
        async with AsyncWebCrawler() as crawler:
            cfg = CrawlerRunConfig(
                prefetch=True,
                # None of these should take effect while prefetching
                screenshot=True,
                pdf=True,
                only_text=True,
                word_count_threshold=100
            )
            res = await crawler.arun(TEST_URL, config=cfg)

            # Raw HTML and links still come back...
            assert res.html is not None
            assert res.links is not None

            # ...but nothing downstream of processing is populated
            assert res.cleaned_html is None
            assert res.extracted_content is None

    @pytest.mark.asyncio
    async def test_stream_mode_still_works(self):
        """Streaming configuration is unaffected by the prefetch feature."""
        async with AsyncWebCrawler() as crawler:
            res = await crawler.arun(TEST_URL, config=CrawlerRunConfig(stream=True))

            assert res.success is True
            assert res.html is not None

    @pytest.mark.asyncio
    async def test_cache_mode_still_works(self):
        """Cache bypass followed by cache-enabled crawling both succeed."""
        from crawl4ai import CacheMode

        async with AsyncWebCrawler() as crawler:
            # First request skips the cache entirely
            first = await crawler.arun(
                TEST_URL, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
            assert first.success is True

            # Second request may be served from cache and must still succeed
            second = await crawler.arun(
                TEST_URL, config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
            )
            assert second.success is True
||||
class TestBackwardsCompatibility:
    """Test backwards compatibility with existing code patterns."""

    @pytest.mark.asyncio
    async def test_config_without_prefetch_works(self):
        """Test that configs created without the prefetch parameter work."""
        # Simulating old code that doesn't know about prefetch
        config = CrawlerRunConfig(
            word_count_threshold=50,
            css_selector="body"
        )

        # Should default to prefetch=False
        assert config.prefetch is False

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(TEST_URL, config=config)
            assert result.success is True
            assert result.cleaned_html is not None

    def test_from_kwargs_without_prefetch(self):
        """Test CrawlerRunConfig.from_kwargs works without prefetch.

        Pure config test — no event loop needed, so this is a plain sync test.
        """
        config = CrawlerRunConfig.from_kwargs({
            "word_count_threshold": 50,
            "verbose": False
        })

        assert config.prefetch is False
Reference in New Issue
Block a user