Add prefetch mode for two-phase deep crawling
- Add `prefetch` parameter to CrawlerRunConfig - Add `quick_extract_links()` function for fast link extraction - Add short-circuit in aprocess_html() for prefetch mode - Add 42 tests (unit, integration, regression) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1291,6 +1291,7 @@ class CrawlerRunConfig():
|
||||
# Connection Parameters
|
||||
method: str = "GET",
|
||||
stream: bool = False,
|
||||
prefetch: bool = False, # When True, return only HTML + links (skip heavy processing)
|
||||
url: str = None,
|
||||
base_url: str = None, # Base URL for markdown link resolution (used with raw: HTML)
|
||||
check_robots_txt: bool = False,
|
||||
@@ -1422,6 +1423,7 @@ class CrawlerRunConfig():
|
||||
|
||||
# Connection Parameters
|
||||
self.stream = stream
|
||||
self.prefetch = prefetch # Prefetch mode: return only HTML + links
|
||||
self.method = method
|
||||
|
||||
# Robots.txt Handling Parameters
|
||||
@@ -1694,6 +1696,7 @@ class CrawlerRunConfig():
|
||||
# Connection Parameters
|
||||
method=kwargs.get("method", "GET"),
|
||||
stream=kwargs.get("stream", False),
|
||||
prefetch=kwargs.get("prefetch", False),
|
||||
check_robots_txt=kwargs.get("check_robots_txt", False),
|
||||
user_agent=kwargs.get("user_agent"),
|
||||
user_agent_mode=kwargs.get("user_agent_mode"),
|
||||
@@ -1799,6 +1802,7 @@ class CrawlerRunConfig():
|
||||
"capture_console_messages": self.capture_console_messages,
|
||||
"method": self.method,
|
||||
"stream": self.stream,
|
||||
"prefetch": self.prefetch,
|
||||
"check_robots_txt": self.check_robots_txt,
|
||||
"user_agent": self.user_agent,
|
||||
"user_agent_mode": self.user_agent_mode,
|
||||
|
||||
@@ -514,6 +514,27 @@ class AsyncWebCrawler:
|
||||
Returns:
|
||||
CrawlResult: Processed result containing extracted and formatted content
|
||||
"""
|
||||
# === PREFETCH MODE SHORT-CIRCUIT ===
|
||||
if getattr(config, 'prefetch', False):
|
||||
from .utils import quick_extract_links
|
||||
|
||||
# Use base_url from config (for raw: URLs), redirected_url, or original url
|
||||
effective_url = getattr(config, 'base_url', None) or kwargs.get('redirected_url') or url
|
||||
links = quick_extract_links(html, effective_url)
|
||||
|
||||
return CrawlResult(
|
||||
url=url,
|
||||
html=html,
|
||||
success=True,
|
||||
links=links,
|
||||
status_code=kwargs.get('status_code'),
|
||||
response_headers=kwargs.get('response_headers'),
|
||||
redirected_url=kwargs.get('redirected_url'),
|
||||
ssl_certificate=kwargs.get('ssl_certificate'),
|
||||
# All other fields default to None
|
||||
)
|
||||
# === END PREFETCH SHORT-CIRCUIT ===
|
||||
|
||||
cleaned_html = ""
|
||||
try:
|
||||
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
||||
|
||||
@@ -2461,6 +2461,54 @@ def normalize_url_tmp(href, base_url):
|
||||
return href.strip()
|
||||
|
||||
|
||||
def quick_extract_links(html: str, base_url: str) -> Dict[str, List[Dict[str, str]]]:
    """
    Fast link extraction for prefetch mode.

    Only extracts <a href> tags - no media, no cleaning, no heavy processing.

    Args:
        html: Raw HTML string
        base_url: Base URL for resolving relative links

    Returns:
        {"internal": [{"href": "...", "text": "..."}], "external": [...]}
    """
    internal: List[Dict[str, str]] = []
    external: List[Dict[str, str]] = []

    # Fast path: nothing to parse. Also avoids lxml raising on empty or
    # whitespace-only input before we even get to the try/except below.
    if not html or not html.strip():
        return {"internal": internal, "external": external}

    from lxml.html import document_fromstring

    try:
        doc = document_fromstring(html)
    except Exception:
        # Malformed input: degrade gracefully instead of failing the crawl.
        return {"internal": [], "external": []}

    base_domain = get_base_domain(base_url)
    seen: Set[str] = set()

    # Schemes / bare fragments that can never yield a crawlable page.
    # FIX: compared case-insensitively — URL schemes are case-insensitive
    # (RFC 3986 §3.1), so "MAILTO:" or "JavaScript:" must be skipped too.
    skip_prefixes = ("#", "javascript:", "mailto:", "tel:")

    for a in doc.xpath("//a[@href]"):
        href = a.get("href", "").strip()
        if not href or href.lower().startswith(skip_prefixes):
            continue

        # Normalize URL (resolves relative hrefs against base_url)
        normalized = normalize_url_for_deep_crawl(href, base_url)
        if not normalized or normalized in seen:
            continue
        seen.add(normalized)

        # Extract text (truncated for memory efficiency)
        text = (a.text_content() or "").strip()[:200]

        link_data = {"href": normalized, "text": text}

        if is_external_url(normalized, base_domain):
            external.append(link_data)
        else:
            internal.append(link_data)

    return {"internal": internal, "external": external}
|
||||
|
||||
|
||||
def get_base_domain(url: str) -> str:
|
||||
"""
|
||||
Extract the base domain from a given URL, handling common edge cases.
|
||||
|
||||
236
tests/test_prefetch_integration.py
Normal file
236
tests/test_prefetch_integration.py
Normal file
@@ -0,0 +1,236 @@
|
||||
"""Integration tests for prefetch mode with the crawler."""
|
||||
|
||||
import pytest
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
|
||||
|
||||
# Use crawl4ai docs as test domain
|
||||
TEST_DOMAIN = "https://docs.crawl4ai.com"
|
||||
|
||||
|
||||
class TestPrefetchModeIntegration:
    """Integration tests for prefetch mode (live requests against TEST_DOMAIN)."""

    @pytest.mark.asyncio
    async def test_prefetch_returns_html_and_links(self):
        """Test that prefetch mode returns HTML and links only."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=True)
            result = await crawler.arun(TEST_DOMAIN, config=config)

            # Should have HTML
            assert result.html is not None
            assert len(result.html) > 0
            assert "<html" in result.html.lower() or "<!doctype" in result.html.lower()

            # Should have links
            assert result.links is not None
            assert "internal" in result.links
            assert "external" in result.links

            # Should NOT have processed content (markdown may be None outright,
            # or a wrapper object whose raw_markdown is None)
            assert result.markdown is None or (
                hasattr(result.markdown, 'raw_markdown') and
                result.markdown.raw_markdown is None
            )
            assert result.cleaned_html is None
            assert result.extracted_content is None

    @pytest.mark.asyncio
    async def test_prefetch_preserves_metadata(self):
        """Test that prefetch mode preserves essential metadata."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=True)
            result = await crawler.arun(TEST_DOMAIN, config=config)

            # Should have success flag
            assert result.success is True

            # Should have URL
            assert result.url is not None

            # Status code should be present.
            # FIX: was `status_code is not None or status_code == 200`, which
            # reduces to the left operand alone (when it is None, `None == 200`
            # is always False), so state the real intent directly.
            assert result.status_code is not None

    @pytest.mark.asyncio
    async def test_prefetch_with_deep_crawl(self):
        """Test prefetch mode with deep crawl strategy."""
        from crawl4ai import BFSDeepCrawlStrategy

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                prefetch=True,
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=1,
                    max_pages=3
                )
            )

            result_container = await crawler.arun(TEST_DOMAIN, config=config)

            # Handle both list and iterator results
            if hasattr(result_container, '__aiter__'):
                results = [r async for r in result_container]
            else:
                results = list(result_container) if hasattr(result_container, '__iter__') else [result_container]

            # Each result should have HTML and links
            for result in results:
                assert result.html is not None
                assert result.links is not None

            # Should have crawled at least one page
            assert len(results) >= 1

    @pytest.mark.asyncio
    async def test_prefetch_then_process_with_raw(self):
        """Test the full two-phase workflow: prefetch then process."""
        async with AsyncWebCrawler() as crawler:
            # Phase 1: Prefetch
            prefetch_config = CrawlerRunConfig(prefetch=True)
            prefetch_result = await crawler.arun(TEST_DOMAIN, config=prefetch_config)

            stored_html = prefetch_result.html

            assert stored_html is not None
            assert len(stored_html) > 0

            # Phase 2: Process with raw: URL
            process_config = CrawlerRunConfig(
                # No prefetch - full processing
                base_url=TEST_DOMAIN  # Provide base URL for link resolution
            )
            processed_result = await crawler.arun(
                f"raw:{stored_html}",
                config=process_config
            )

            # Should now have full processing
            assert processed_result.html is not None
            assert processed_result.success is True
            # Note: cleaned_html and markdown depend on the content

    @pytest.mark.asyncio
    async def test_prefetch_links_structure(self):
        """Test that links have the expected structure."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=True)
            result = await crawler.arun(TEST_DOMAIN, config=config)

            assert result.links is not None

            # Check internal links structure
            if result.links["internal"]:
                link = result.links["internal"][0]
                assert "href" in link
                assert "text" in link
                assert link["href"].startswith("http")

            # Check external links structure (if any)
            if result.links["external"]:
                link = result.links["external"][0]
                assert "href" in link
                assert "text" in link
                assert link["href"].startswith("http")

    # NOTE: the three tests below exercise pure config behavior with no awaits,
    # so they are plain sync tests (the asyncio marker added nothing).
    def test_prefetch_config_clone(self):
        """Test that config.clone() preserves prefetch setting."""
        config = CrawlerRunConfig(prefetch=True)
        cloned = config.clone()

        assert cloned.prefetch is True

        # Clone with override
        cloned_false = config.clone(prefetch=False)
        assert cloned_false.prefetch is False

    def test_prefetch_to_dict(self):
        """Test that to_dict() includes prefetch."""
        config = CrawlerRunConfig(prefetch=True)
        config_dict = config.to_dict()

        assert "prefetch" in config_dict
        assert config_dict["prefetch"] is True

    def test_prefetch_default_false(self):
        """Test that prefetch defaults to False."""
        assert CrawlerRunConfig().prefetch is False

    @pytest.mark.asyncio
    async def test_prefetch_explicit_false(self):
        """Test explicit prefetch=False works like default."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=False)
            result = await crawler.arun(TEST_DOMAIN, config=config)

            # Should have full processing
            assert result.html is not None
            # cleaned_html should be populated in normal mode
            assert result.cleaned_html is not None
|
||||
|
||||
|
||||
class TestPrefetchPerformance:
    """Timing-oriented checks comparing prefetch mode against a full crawl."""

    @pytest.mark.asyncio
    async def test_prefetch_returns_quickly(self):
        """Prefetch should not be slower than a full crawl (soft check)."""
        import time

        async with AsyncWebCrawler() as crawler:
            # Time a prefetch-mode crawl of the test page.
            started = time.time()
            await crawler.arun(TEST_DOMAIN, config=CrawlerRunConfig(prefetch=True))
            elapsed_prefetch = time.time() - started

            # Time a full-processing crawl of the same page.
            started = time.time()
            await crawler.arun(TEST_DOMAIN, config=CrawlerRunConfig())
            elapsed_full = time.time() - started

            # Emit timings for debugging; network variance makes a hard
            # threshold flaky, so this test only logs the comparison.
            print(f"\nPrefetch: {elapsed_prefetch:.3f}s, Full: {elapsed_full:.3f}s")
|
||||
|
||||
|
||||
class TestPrefetchWithRawHTML:
    """Prefetch mode exercised against inline markup via the raw: scheme."""

    @pytest.mark.asyncio
    async def test_prefetch_with_raw_html(self):
        """Prefetch over a raw: URL extracts links from the supplied HTML."""
        sample_html = """
        <html>
        <head><title>Test Page</title></head>
        <body>
            <h1>Hello World</h1>
            <a href="/link1">Link 1</a>
            <a href="/link2">Link 2</a>
            <a href="https://external.com/page">External</a>
        </body>
        </html>
        """

        run_config = CrawlerRunConfig(
            prefetch=True,
            base_url="https://example.com"
        )

        async with AsyncWebCrawler() as crawler:
            outcome = await crawler.arun(f"raw:{sample_html}", config=run_config)

        # Basic shape of a successful prefetch result.
        assert outcome.success is True
        assert outcome.html is not None
        assert outcome.links is not None

        # Two relative links resolve against base_url; one points off-domain.
        assert len(outcome.links["internal"]) >= 2
        assert len(outcome.links["external"]) >= 1
|
||||
275
tests/test_prefetch_mode.py
Normal file
275
tests/test_prefetch_mode.py
Normal file
@@ -0,0 +1,275 @@
|
||||
"""Unit tests for the quick_extract_links function used in prefetch mode."""
|
||||
|
||||
import pytest
|
||||
from crawl4ai.utils import quick_extract_links
|
||||
|
||||
|
||||
class TestQuickExtractLinks:
    """Unit coverage for quick_extract_links, the prefetch-mode link scraper."""

    BASE = "https://example.com"

    def test_basic_internal_links(self):
        """Relative and same-domain absolute hrefs all classify as internal."""
        page = """
        <html>
        <body>
            <a href="/page1">Page 1</a>
            <a href="/page2">Page 2</a>
            <a href="https://example.com/page3">Page 3</a>
        </body>
        </html>
        """
        links = quick_extract_links(page, self.BASE)

        internal = links["internal"]
        assert len(internal) == 3
        first = internal[0]
        assert first["href"] == "https://example.com/page1"
        assert first["text"] == "Page 1"

    def test_external_links(self):
        """Off-domain hrefs are classified as external."""
        page = """
        <html>
        <body>
            <a href="https://other.com/page">External</a>
            <a href="/internal">Internal</a>
        </body>
        </html>
        """
        links = quick_extract_links(page, self.BASE)

        assert len(links["external"]) == 1
        assert len(links["internal"]) == 1
        assert links["external"][0]["href"] == "https://other.com/page"

    def test_ignores_javascript_and_mailto(self):
        """Non-navigational schemes (javascript/mailto/tel) are skipped."""
        page = """
        <html>
        <body>
            <a href="javascript:void(0)">Click</a>
            <a href="mailto:test@example.com">Email</a>
            <a href="tel:+1234567890">Call</a>
            <a href="/valid">Valid</a>
        </body>
        </html>
        """
        links = quick_extract_links(page, self.BASE)

        # Only the plain path survives the scheme filter.
        assert [item["href"] for item in links["internal"]] == ["https://example.com/valid"]

    def test_ignores_anchor_only_links(self):
        """Pure fragment links are dropped; a path-with-fragment survives."""
        page = """
        <html>
        <body>
            <a href="#section1">Section 1</a>
            <a href="#section2">Section 2</a>
            <a href="/page#section">Page with anchor</a>
        </body>
        </html>
        """
        links = quick_extract_links(page, self.BASE)

        internal = links["internal"]
        assert len(internal) == 1
        assert "/page" in internal[0]["href"]

    def test_deduplication(self):
        """Repeated hrefs yield exactly one entry."""
        page = """
        <html>
        <body>
            <a href="/page">Link 1</a>
            <a href="/page">Link 2</a>
            <a href="/page">Link 3</a>
        </body>
        </html>
        """
        assert len(quick_extract_links(page, self.BASE)["internal"]) == 1

    def test_handles_malformed_html(self):
        """Garbage input degrades to empty results instead of raising."""
        links = quick_extract_links("not valid html at all <><><", self.BASE)

        assert links["internal"] == []
        assert links["external"] == []

    def test_empty_html(self):
        """Empty input yields the empty internal/external structure."""
        assert quick_extract_links("", self.BASE) == {"internal": [], "external": []}

    def test_relative_url_resolution(self):
        """Plain, ./ and ../ relative hrefs resolve against the base URL."""
        page = """
        <html>
        <body>
            <a href="page1.html">Relative</a>
            <a href="./page2.html">Dot Relative</a>
            <a href="../page3.html">Parent Relative</a>
        </body>
        </html>
        """
        links = quick_extract_links(page, "https://example.com/docs/")

        internal = links["internal"]
        assert len(internal) >= 1
        # Every resolved href must stay on the base domain.
        assert all(item["href"].startswith("https://example.com") for item in internal)

    def test_text_truncation(self):
        """Anchor text longer than 200 chars is clipped to exactly 200."""
        page = f"""
        <html>
        <body>
            <a href="/page">{'A' * 300}</a>
        </body>
        </html>
        """
        links = quick_extract_links(page, self.BASE)

        assert len(links["internal"]) == 1
        assert len(links["internal"][0]["text"]) == 200

    def test_empty_href_ignored(self):
        """Empty or whitespace-only hrefs are dropped."""
        page = """
        <html>
        <body>
            <a href="">Empty</a>
            <a href=" ">Whitespace</a>
            <a href="/valid">Valid</a>
        </body>
        </html>
        """
        links = quick_extract_links(page, self.BASE)

        assert [item["href"] for item in links["internal"]] == ["https://example.com/valid"]

    def test_mixed_internal_external(self):
        """Internal and external links are partitioned correctly."""
        page = """
        <html>
        <body>
            <a href="/internal1">Internal 1</a>
            <a href="https://example.com/internal2">Internal 2</a>
            <a href="https://google.com">Google</a>
            <a href="https://github.com/repo">GitHub</a>
            <a href="/internal3">Internal 3</a>
        </body>
        </html>
        """
        links = quick_extract_links(page, self.BASE)

        assert (len(links["internal"]), len(links["external"])) == (3, 2)

    def test_subdomain_handling(self):
        """Subdomain links are extracted (bucket depends on domain policy)."""
        page = """
        <html>
        <body>
            <a href="https://docs.example.com/page">Docs subdomain</a>
            <a href="https://api.example.com/v1">API subdomain</a>
            <a href="https://example.com/main">Main domain</a>
        </body>
        </html>
        """
        links = quick_extract_links(page, self.BASE)

        # All three must be extracted, however they are classified.
        assert len(links["internal"]) + len(links["external"]) == 3
|
||||
|
||||
|
||||
class TestQuickExtractLinksEdgeCases:
    """Edge-case coverage for quick_extract_links."""

    def test_no_links_in_page(self):
        """A page without anchors produces two empty buckets."""
        page = """
        <html>
        <body>
            <h1>No Links Here</h1>
            <p>Just some text content.</p>
        </body>
        </html>
        """
        links = quick_extract_links(page, "https://example.com")

        assert links["internal"] == []
        assert links["external"] == []

    def test_links_in_nested_elements(self):
        """Anchors inside nav/ul/div containers are still found."""
        page = """
        <html>
        <body>
            <nav>
                <ul>
                    <li><a href="/home">Home</a></li>
                    <li><a href="/about">About</a></li>
                </ul>
            </nav>
            <div class="content">
                <p>Check out <a href="/products">our products</a>.</p>
            </div>
        </body>
        </html>
        """
        assert len(quick_extract_links(page, "https://example.com")["internal"]) == 3

    def test_link_with_nested_elements(self):
        """Markup nested inside an anchor is flattened into its text."""
        page = """
        <html>
        <body>
            <a href="/page"><span>Nested</span> <strong>Text</strong></a>
        </body>
        </html>
        """
        links = quick_extract_links(page, "https://example.com")

        assert len(links["internal"]) == 1
        label = links["internal"][0]["text"]
        assert "Nested" in label
        assert "Text" in label

    def test_protocol_relative_urls(self):
        """Protocol-relative (//host/path) hrefs are resolved and kept."""
        page = """
        <html>
        <body>
            <a href="//cdn.example.com/asset">CDN Link</a>
        </body>
        </html>
        """
        links = quick_extract_links(page, "https://example.com")

        # Resolved with the base scheme (https:); bucket depends on policy.
        assert len(links["internal"]) + len(links["external"]) >= 1

    def test_whitespace_in_href(self):
        """Leading/trailing (even multiline) whitespace in href is tolerated."""
        page = """
        <html>
        <body>
            <a href=" /page1 ">Padded</a>
            <a href="
            /page2
            ">Multiline</a>
        </body>
        </html>
        """
        assert len(quick_extract_links(page, "https://example.com")["internal"]) >= 1
|
||||
232
tests/test_prefetch_regression.py
Normal file
232
tests/test_prefetch_regression.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""Regression tests to ensure prefetch mode doesn't break existing functionality."""
|
||||
|
||||
import pytest
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
TEST_URL = "https://docs.crawl4ai.com"
|
||||
|
||||
|
||||
class TestNoRegressions:
    """Ensure prefetch mode doesn't break existing functionality."""

    @pytest.mark.asyncio
    async def test_default_mode_unchanged(self):
        """Test that default mode (prefetch=False) works exactly as before."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()  # Default config
            result = await crawler.arun(TEST_URL, config=config)

            # All standard fields should be populated
            assert result.html is not None
            assert result.cleaned_html is not None
            assert result.links is not None
            assert result.success is True

    @pytest.mark.asyncio
    async def test_explicit_prefetch_false(self):
        """Test explicit prefetch=False works like default."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=False)
            result = await crawler.arun(TEST_URL, config=config)

            assert result.cleaned_html is not None

    # NOTE: the two config tests below contain no awaits, so they are plain
    # sync tests (the asyncio marker added nothing).
    def test_config_clone_preserves_prefetch(self):
        """Test that config.clone() preserves prefetch setting."""
        config = CrawlerRunConfig(prefetch=True)
        cloned = config.clone()

        assert cloned.prefetch is True

        # Clone with override
        cloned_false = config.clone(prefetch=False)
        assert cloned_false.prefetch is False

    def test_config_to_dict_includes_prefetch(self):
        """Test that to_dict() includes prefetch."""
        config_true = CrawlerRunConfig(prefetch=True)
        config_false = CrawlerRunConfig(prefetch=False)

        assert config_true.to_dict()["prefetch"] is True
        assert config_false.to_dict()["prefetch"] is False

    @pytest.mark.asyncio
    async def test_existing_extraction_still_works(self):
        """Test that extraction strategies still work in normal mode."""
        from crawl4ai import JsonCssExtractionStrategy

        schema = {
            "name": "Links",
            "baseSelector": "a",
            "fields": [
                {"name": "href", "selector": "", "type": "attribute", "attribute": "href"},
                {"name": "text", "selector": "", "type": "text"}
            ]
        }

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                extraction_strategy=JsonCssExtractionStrategy(schema=schema)
            )
            result = await crawler.arun(TEST_URL, config=config)

            assert result.extracted_content is not None

    @pytest.mark.asyncio
    async def test_existing_deep_crawl_still_works(self):
        """Test that deep crawl without prefetch still does full processing."""
        from crawl4ai import BFSDeepCrawlStrategy

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=1,
                    max_pages=2
                )
                # No prefetch - should do full processing
            )

            result_container = await crawler.arun(TEST_URL, config=config)

            # Handle both list and iterator results
            if hasattr(result_container, '__aiter__'):
                results = [r async for r in result_container]
            else:
                results = list(result_container) if hasattr(result_container, '__iter__') else [result_container]

            # Each result should have full processing
            for result in results:
                assert result.cleaned_html is not None

            assert len(results) >= 1

    @pytest.mark.asyncio
    async def test_raw_url_scheme_still_works(self):
        """Test that raw: URL scheme works for processing stored HTML."""
        sample_html = """
        <html>
        <head><title>Test Page</title></head>
        <body>
            <h1>Hello World</h1>
            <p>This is a test paragraph.</p>
            <a href="/link1">Link 1</a>
        </body>
        </html>
        """

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()
            result = await crawler.arun(f"raw:{sample_html}", config=config)

            assert result.success is True
            assert result.html is not None
            assert "Hello World" in result.html
            assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_screenshot_still_works(self):
        """Test that screenshot option still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(screenshot=True)
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True
            # Screenshot data should be present
            assert result.screenshot is not None or result.screenshot_data is not None

    @pytest.mark.asyncio
    async def test_js_execution_still_works(self):
        """Test that JavaScript execution still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                js_code="document.querySelector('h1')?.textContent"
            )
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True
            assert result.html is not None
||||
|
||||
|
||||
class TestPrefetchDoesNotAffectOtherModes:
    """Prefetch must coexist cleanly with unrelated configuration options."""

    @pytest.mark.asyncio
    async def test_prefetch_with_other_options_ignored(self):
        """Heavy-processing options become no-ops when prefetch is enabled."""
        heavy_config = CrawlerRunConfig(
            prefetch=True,
            # None of these should take effect in prefetch mode
            screenshot=True,
            pdf=True,
            only_text=True,
            word_count_threshold=100
        )

        async with AsyncWebCrawler() as crawler:
            outcome = await crawler.arun(TEST_URL, config=heavy_config)

        # Prefetch contract: raw HTML plus links are present...
        assert outcome.html is not None
        assert outcome.links is not None

        # ...while processed artifacts stay empty.
        assert outcome.cleaned_html is None
        assert outcome.extracted_content is None

    @pytest.mark.asyncio
    async def test_stream_mode_still_works(self):
        """A plain stream=True crawl is unaffected by the prefetch feature."""
        async with AsyncWebCrawler() as crawler:
            outcome = await crawler.arun(TEST_URL, config=CrawlerRunConfig(stream=True))

        assert outcome.success is True
        assert outcome.html is not None

    @pytest.mark.asyncio
    async def test_cache_mode_still_works(self):
        """Cache-bypass followed by cache-enabled requests both succeed."""
        from crawl4ai import CacheMode

        async with AsyncWebCrawler() as crawler:
            # First request bypasses the cache entirely.
            bypassed = await crawler.arun(
                TEST_URL, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
            assert bypassed.success is True

            # Second request runs with caching enabled.
            cached = await crawler.arun(
                TEST_URL, config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
            )
            assert cached.success is True
|
||||
|
||||
|
||||
class TestBackwardsCompatibility:
    """Test backwards compatibility with existing code patterns."""

    @pytest.mark.asyncio
    async def test_config_without_prefetch_works(self):
        """Test that configs created without prefetch parameter work."""
        # Simulating old code that doesn't know about prefetch
        config = CrawlerRunConfig(
            word_count_threshold=50,
            css_selector="body"
        )

        # Should default to prefetch=False
        assert config.prefetch is False

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(TEST_URL, config=config)
            assert result.success is True
            assert result.cleaned_html is not None

    # NOTE: no awaits here — plain sync test, the asyncio marker was unneeded.
    def test_from_kwargs_without_prefetch(self):
        """Test CrawlerRunConfig.from_kwargs works without prefetch."""
        config = CrawlerRunConfig.from_kwargs({
            "word_count_threshold": 50,
            "verbose": False
        })

        assert config.prefetch is False
|
||||
Reference in New Issue
Block a user