Add prefetch mode for two-phase deep crawling

- Add `prefetch` parameter to CrawlerRunConfig
- Add `quick_extract_links()` function for fast link extraction
- Add short-circuit in aprocess_html() for prefetch mode
- Add 42 tests (unit, integration, regression)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
unclecode
2025-12-25 01:55:08 +00:00
parent 3937efcf0b
commit fde4e9f0c6
6 changed files with 816 additions and 0 deletions

View File

@@ -1291,6 +1291,7 @@ class CrawlerRunConfig():
# Connection Parameters
method: str = "GET",
stream: bool = False,
prefetch: bool = False, # When True, return only HTML + links (skip heavy processing)
url: str = None,
base_url: str = None, # Base URL for markdown link resolution (used with raw: HTML)
check_robots_txt: bool = False,
@@ -1422,6 +1423,7 @@ class CrawlerRunConfig():
# Connection Parameters
self.stream = stream
self.prefetch = prefetch # Prefetch mode: return only HTML + links
self.method = method
# Robots.txt Handling Parameters
@@ -1694,6 +1696,7 @@ class CrawlerRunConfig():
# Connection Parameters
method=kwargs.get("method", "GET"),
stream=kwargs.get("stream", False),
prefetch=kwargs.get("prefetch", False),
check_robots_txt=kwargs.get("check_robots_txt", False),
user_agent=kwargs.get("user_agent"),
user_agent_mode=kwargs.get("user_agent_mode"),
@@ -1799,6 +1802,7 @@ class CrawlerRunConfig():
"capture_console_messages": self.capture_console_messages,
"method": self.method,
"stream": self.stream,
"prefetch": self.prefetch,
"check_robots_txt": self.check_robots_txt,
"user_agent": self.user_agent,
"user_agent_mode": self.user_agent_mode,

View File

@@ -514,6 +514,27 @@ class AsyncWebCrawler:
Returns:
CrawlResult: Processed result containing extracted and formatted content
"""
# === PREFETCH MODE SHORT-CIRCUIT ===
if getattr(config, 'prefetch', False):
from .utils import quick_extract_links
# Use base_url from config (for raw: URLs), redirected_url, or original url
effective_url = getattr(config, 'base_url', None) or kwargs.get('redirected_url') or url
links = quick_extract_links(html, effective_url)
return CrawlResult(
url=url,
html=html,
success=True,
links=links,
status_code=kwargs.get('status_code'),
response_headers=kwargs.get('response_headers'),
redirected_url=kwargs.get('redirected_url'),
ssl_certificate=kwargs.get('ssl_certificate'),
# All other fields default to None
)
# === END PREFETCH SHORT-CIRCUIT ===
cleaned_html = ""
try:
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"

View File

@@ -2461,6 +2461,54 @@ def normalize_url_tmp(href, base_url):
return href.strip()
def quick_extract_links(html: str, base_url: str) -> Dict[str, List[Dict[str, str]]]:
    """
    Fast link extraction for prefetch mode.

    Only extracts <a href> tags - no media, no cleaning, no heavy processing.

    Args:
        html: Raw HTML string
        base_url: Base URL for resolving relative links

    Returns:
        {"internal": [{"href": "...", "text": "..."}], "external": [...]}
        Returns empty buckets (never raises) on empty or unparseable input.
    """
    from lxml.html import document_fromstring

    # Guard: lxml raises on empty/whitespace-only input; short-circuit instead.
    if not html or not html.strip():
        return {"internal": [], "external": []}

    try:
        doc = document_fromstring(html)
    except Exception:
        # Unparseable input: degrade gracefully rather than propagate.
        return {"internal": [], "external": []}

    base_domain = get_base_domain(base_url)
    internal: List[Dict[str, str]] = []
    external: List[Dict[str, str]] = []
    seen: Set[str] = set()

    # Fragment-only links and pseudo-schemes that can never be crawled.
    # Checked case-insensitively: "MAILTO:" / "JavaScript:" are valid in HTML.
    skip_prefixes = ("#", "javascript:", "mailto:", "tel:", "data:", "about:")

    for a in doc.xpath("//a[@href]"):
        href = a.get("href", "").strip()
        if not href or href.lower().startswith(skip_prefixes):
            continue
        # Normalize URL (resolves relative hrefs against base_url).
        normalized = normalize_url_for_deep_crawl(href, base_url)
        if not normalized or normalized in seen:
            continue
        seen.add(normalized)
        # Extract text (truncated for memory efficiency)
        text = (a.text_content() or "").strip()[:200]
        link_data = {"href": normalized, "text": text}
        if is_external_url(normalized, base_domain):
            external.append(link_data)
        else:
            internal.append(link_data)

    return {"internal": internal, "external": external}
def get_base_domain(url: str) -> str:
"""
Extract the base domain from a given URL, handling common edge cases.

View File

@@ -0,0 +1,236 @@
"""Integration tests for prefetch mode with the crawler."""
import pytest
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
# Use crawl4ai docs as test domain
TEST_DOMAIN = "https://docs.crawl4ai.com"
class TestPrefetchModeIntegration:
    """Integration tests for prefetch mode."""

    @staticmethod
    async def _collect(result_container):
        """Normalize arun() output (single result, sync iterable, or async
        iterator — which one depends on stream/deep-crawl settings) to a list."""
        if hasattr(result_container, '__aiter__'):
            return [r async for r in result_container]
        if hasattr(result_container, '__iter__'):
            return list(result_container)
        return [result_container]

    @pytest.mark.asyncio
    async def test_prefetch_returns_html_and_links(self):
        """Prefetch mode returns raw HTML and links but skips heavy processing."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=True)
            result = await crawler.arun(TEST_DOMAIN, config=config)
            # Should have HTML
            assert result.html is not None
            assert len(result.html) > 0
            assert "<html" in result.html.lower() or "<!doctype" in result.html.lower()
            # Should have links
            assert result.links is not None
            assert "internal" in result.links
            assert "external" in result.links
            # Should NOT have processed content
            assert result.markdown is None or (
                hasattr(result.markdown, 'raw_markdown') and
                result.markdown.raw_markdown is None
            )
            assert result.cleaned_html is None
            assert result.extracted_content is None

    @pytest.mark.asyncio
    async def test_prefetch_preserves_metadata(self):
        """Prefetch mode preserves essential response metadata."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=True)
            result = await crawler.arun(TEST_DOMAIN, config=config)
            # Should have success flag and URL
            assert result.success is True
            assert result.url is not None
            # Status code should be present.
            # (Fixed: `x is not None or x == 200` was a tautology — the second
            # operand can never rescue a None status code.)
            assert result.status_code is not None

    @pytest.mark.asyncio
    async def test_prefetch_with_deep_crawl(self):
        """Prefetch mode composes with a deep crawl strategy."""
        from crawl4ai import BFSDeepCrawlStrategy
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                prefetch=True,
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=1,
                    max_pages=3
                )
            )
            results = await self._collect(
                await crawler.arun(TEST_DOMAIN, config=config)
            )
            # Each result should have HTML and links
            for result in results:
                assert result.html is not None
                assert result.links is not None
            # Should have crawled at least one page
            assert len(results) >= 1

    @pytest.mark.asyncio
    async def test_prefetch_then_process_with_raw(self):
        """Full two-phase workflow: prefetch, then reprocess via raw: URL."""
        async with AsyncWebCrawler() as crawler:
            # Phase 1: Prefetch
            prefetch_config = CrawlerRunConfig(prefetch=True)
            prefetch_result = await crawler.arun(TEST_DOMAIN, config=prefetch_config)
            stored_html = prefetch_result.html
            assert stored_html is not None
            assert len(stored_html) > 0
            # Phase 2: Process with raw: URL
            process_config = CrawlerRunConfig(
                # No prefetch - full processing
                base_url=TEST_DOMAIN  # Provide base URL for link resolution
            )
            processed_result = await crawler.arun(
                f"raw:{stored_html}",
                config=process_config
            )
            # Should now have full processing
            assert processed_result.html is not None
            assert processed_result.success is True
            # Note: cleaned_html and markdown depend on the content

    @pytest.mark.asyncio
    async def test_prefetch_links_structure(self):
        """Extracted links carry the expected {"href", "text"} structure."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=True)
            result = await crawler.arun(TEST_DOMAIN, config=config)
            assert result.links is not None
            # Both buckets share the same per-link schema; check each when
            # non-empty (external links may legitimately be absent).
            for bucket in ("internal", "external"):
                if result.links[bucket]:
                    link = result.links[bucket][0]
                    assert "href" in link
                    assert "text" in link
                    assert link["href"].startswith("http")

    @pytest.mark.asyncio
    async def test_prefetch_config_clone(self):
        """config.clone() preserves (and can override) the prefetch setting."""
        config = CrawlerRunConfig(prefetch=True)
        cloned = config.clone()
        assert cloned.prefetch is True
        # Clone with override
        cloned_false = config.clone(prefetch=False)
        assert cloned_false.prefetch is False

    @pytest.mark.asyncio
    async def test_prefetch_to_dict(self):
        """to_dict() serializes the prefetch flag."""
        config = CrawlerRunConfig(prefetch=True)
        config_dict = config.to_dict()
        assert "prefetch" in config_dict
        assert config_dict["prefetch"] is True

    @pytest.mark.asyncio
    async def test_prefetch_default_false(self):
        """prefetch defaults to False."""
        config = CrawlerRunConfig()
        assert config.prefetch is False

    @pytest.mark.asyncio
    async def test_prefetch_explicit_false(self):
        """Explicit prefetch=False behaves like the default (full processing)."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=False)
            result = await crawler.arun(TEST_DOMAIN, config=config)
            # Should have full processing
            assert result.html is not None
            # cleaned_html should be populated in normal mode
            assert result.cleaned_html is not None
class TestPrefetchPerformance:
    """Performance-related tests for prefetch mode."""

    @pytest.mark.asyncio
    async def test_prefetch_returns_quickly(self):
        """Test that prefetch mode returns results quickly."""
        import time
        async with AsyncWebCrawler() as crawler:
            # Prefetch mode.
            # Use perf_counter, not time.time(): wall-clock time can jump
            # (NTP adjustments) and has coarse resolution on some platforms,
            # which makes interval measurements unreliable.
            start = time.perf_counter()
            prefetch_config = CrawlerRunConfig(prefetch=True)
            await crawler.arun(TEST_DOMAIN, config=prefetch_config)
            prefetch_time = time.perf_counter() - start
            # Full mode
            start = time.perf_counter()
            full_config = CrawlerRunConfig()
            await crawler.arun(TEST_DOMAIN, config=full_config)
            full_time = time.perf_counter() - start
            # Log times for debugging
            print(f"\nPrefetch: {prefetch_time:.3f}s, Full: {full_time:.3f}s")
            # Prefetch should not be significantly slower
            # (may be same or slightly faster depending on content).
            # This is a soft check - mostly for logging; a hard timing
            # assertion would be flaky under network/CI variance.
class TestPrefetchWithRawHTML:
    """Test prefetch mode with raw HTML input."""

    @pytest.mark.asyncio
    async def test_prefetch_with_raw_html(self):
        """Prefetch works when the page is supplied via the raw: URL scheme."""
        sample_html = """
        <html>
        <head><title>Test Page</title></head>
        <body>
        <h1>Hello World</h1>
        <a href="/link1">Link 1</a>
        <a href="/link2">Link 2</a>
        <a href="https://external.com/page">External</a>
        </body>
        </html>
        """
        cfg = CrawlerRunConfig(
            prefetch=True,
            base_url="https://example.com"
        )
        async with AsyncWebCrawler() as crawler:
            res = await crawler.arun(f"raw:{sample_html}", config=cfg)
        assert res.success is True
        assert res.html is not None
        assert res.links is not None
        # The sample embeds two site-relative anchors and one off-site anchor.
        assert len(res.links["internal"]) >= 2
        assert len(res.links["external"]) >= 1

275
tests/test_prefetch_mode.py Normal file
View File

@@ -0,0 +1,275 @@
"""Unit tests for the quick_extract_links function used in prefetch mode."""
import pytest
from crawl4ai.utils import quick_extract_links
class TestQuickExtractLinks:
    """Unit tests for the quick_extract_links function."""

    def test_basic_internal_links(self):
        """Same-site links are extracted with resolved hrefs and their text."""
        html = '''
        <html>
        <body>
        <a href="/page1">Page 1</a>
        <a href="/page2">Page 2</a>
        <a href="https://example.com/page3">Page 3</a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        internal = links["internal"]
        assert len(internal) == 3
        assert internal[0]["href"] == "https://example.com/page1"
        assert internal[0]["text"] == "Page 1"

    def test_external_links(self):
        """Off-site links land in the external bucket, on-site in internal."""
        html = '''
        <html>
        <body>
        <a href="https://other.com/page">External</a>
        <a href="/internal">Internal</a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        assert len(links["internal"]) == 1
        assert len(links["external"]) == 1
        assert links["external"][0]["href"] == "https://other.com/page"

    def test_ignores_javascript_and_mailto(self):
        """javascript:, mailto: and tel: pseudo-links are dropped."""
        html = '''
        <html>
        <body>
        <a href="javascript:void(0)">Click</a>
        <a href="mailto:test@example.com">Email</a>
        <a href="tel:+1234567890">Call</a>
        <a href="/valid">Valid</a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        internal = links["internal"]
        assert len(internal) == 1
        assert internal[0]["href"] == "https://example.com/valid"

    def test_ignores_anchor_only_links(self):
        """Fragment-only hrefs (#section) are dropped; path+fragment survives."""
        html = '''
        <html>
        <body>
        <a href="#section1">Section 1</a>
        <a href="#section2">Section 2</a>
        <a href="/page#section">Page with anchor</a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        # Only the path link remains; bare-fragment anchors are skipped.
        assert len(links["internal"]) == 1
        assert "/page" in links["internal"][0]["href"]

    def test_deduplication(self):
        """Repeated hrefs collapse to a single entry."""
        html = '''
        <html>
        <body>
        <a href="/page">Link 1</a>
        <a href="/page">Link 2</a>
        <a href="/page">Link 3</a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        assert len(links["internal"]) == 1

    def test_handles_malformed_html(self):
        """Malformed markup yields empty buckets rather than an exception."""
        links = quick_extract_links("not valid html at all <><><", "https://example.com")
        assert links["internal"] == []
        assert links["external"] == []

    def test_empty_html(self):
        """An empty document yields the canonical empty result."""
        links = quick_extract_links("", "https://example.com")
        assert links == {"internal": [], "external": []}

    def test_relative_url_resolution(self):
        """Plain, ./ and ../ relative hrefs all resolve against the base URL."""
        html = '''
        <html>
        <body>
        <a href="page1.html">Relative</a>
        <a href="./page2.html">Dot Relative</a>
        <a href="../page3.html">Parent Relative</a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com/docs/")
        internal = links["internal"]
        assert len(internal) >= 1
        # Every resolved link must be absolute on the base host.
        assert all(item["href"].startswith("https://example.com") for item in internal)

    def test_text_truncation(self):
        """Link text longer than 200 characters is cut to exactly 200."""
        long_text = "A" * 300
        html = f'''
        <html>
        <body>
        <a href="/page">{long_text}</a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        assert len(links["internal"]) == 1
        assert len(links["internal"][0]["text"]) == 200

    def test_empty_href_ignored(self):
        """Empty and whitespace-only href values are dropped."""
        html = '''
        <html>
        <body>
        <a href="">Empty</a>
        <a href=" ">Whitespace</a>
        <a href="/valid">Valid</a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        internal = links["internal"]
        assert len(internal) == 1
        assert internal[0]["href"] == "https://example.com/valid"

    def test_mixed_internal_external(self):
        """Mixed pages are split into the correct bucket counts."""
        html = '''
        <html>
        <body>
        <a href="/internal1">Internal 1</a>
        <a href="https://example.com/internal2">Internal 2</a>
        <a href="https://google.com">Google</a>
        <a href="https://github.com/repo">GitHub</a>
        <a href="/internal3">Internal 3</a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        assert len(links["internal"]) == 3
        assert len(links["external"]) == 2

    def test_subdomain_handling(self):
        """Subdomain links are extracted without loss."""
        html = '''
        <html>
        <body>
        <a href="https://docs.example.com/page">Docs subdomain</a>
        <a href="https://api.example.com/v1">API subdomain</a>
        <a href="https://example.com/main">Main domain</a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        # All three anchors must appear, whichever bucket classification puts
        # subdomains into (same base domain).
        assert len(links["internal"]) + len(links["external"]) == 3
class TestQuickExtractLinksEdgeCases:
    """Edge case tests for quick_extract_links."""

    def test_no_links_in_page(self):
        """A document without anchors produces two empty buckets."""
        html = '''
        <html>
        <body>
        <h1>No Links Here</h1>
        <p>Just some text content.</p>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        assert links["internal"] == []
        assert links["external"] == []

    def test_links_in_nested_elements(self):
        """Anchors are found regardless of how deeply they are nested."""
        html = '''
        <html>
        <body>
        <nav>
        <ul>
        <li><a href="/home">Home</a></li>
        <li><a href="/about">About</a></li>
        </ul>
        </nav>
        <div class="content">
        <p>Check out <a href="/products">our products</a>.</p>
        </div>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        assert len(links["internal"]) == 3

    def test_link_with_nested_elements(self):
        """Text is gathered from child elements inside an anchor."""
        html = '''
        <html>
        <body>
        <a href="/page"><span>Nested</span> <strong>Text</strong></a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        internal = links["internal"]
        assert len(internal) == 1
        text = internal[0]["text"]
        assert "Nested" in text
        assert "Text" in text

    def test_protocol_relative_urls(self):
        """Protocol-relative hrefs (//host/path) are kept, not dropped."""
        html = '''
        <html>
        <body>
        <a href="//cdn.example.com/asset">CDN Link</a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        # The link must survive (resolved with the base scheme), in either bucket.
        assert len(links["internal"]) + len(links["external"]) >= 1

    def test_whitespace_in_href(self):
        """Leading/trailing whitespace (including newlines) in href is tolerated."""
        html = '''
        <html>
        <body>
        <a href=" /page1 ">Padded</a>
        <a href="
        /page2
        ">Multiline</a>
        </body>
        </html>
        '''
        links = quick_extract_links(html, "https://example.com")
        # Both anchors should be extracted and normalized.
        assert len(links["internal"]) >= 1

View File

@@ -0,0 +1,232 @@
"""Regression tests to ensure prefetch mode doesn't break existing functionality."""
import pytest
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
TEST_URL = "https://docs.crawl4ai.com"
class TestNoRegressions:
    """Ensure prefetch mode doesn't break existing functionality."""

    @staticmethod
    async def _collect(result_container):
        """Normalize arun() output (single result, sync iterable, or async
        iterator — which one depends on stream/deep-crawl settings) to a list."""
        if hasattr(result_container, '__aiter__'):
            return [r async for r in result_container]
        if hasattr(result_container, '__iter__'):
            return list(result_container)
        return [result_container]

    @pytest.mark.asyncio
    async def test_default_mode_unchanged(self):
        """Default mode (prefetch=False) works exactly as before."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()  # Default config
            result = await crawler.arun(TEST_URL, config=config)
            # All standard fields should be populated
            assert result.html is not None
            assert result.cleaned_html is not None
            assert result.links is not None
            assert result.success is True

    @pytest.mark.asyncio
    async def test_explicit_prefetch_false(self):
        """Explicit prefetch=False works like default."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=False)
            result = await crawler.arun(TEST_URL, config=config)
            assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_config_clone_preserves_prefetch(self):
        """config.clone() preserves (and can override) the prefetch setting."""
        config = CrawlerRunConfig(prefetch=True)
        cloned = config.clone()
        assert cloned.prefetch is True
        # Clone with override
        cloned_false = config.clone(prefetch=False)
        assert cloned_false.prefetch is False

    @pytest.mark.asyncio
    async def test_config_to_dict_includes_prefetch(self):
        """to_dict() serializes the prefetch flag faithfully."""
        config_true = CrawlerRunConfig(prefetch=True)
        config_false = CrawlerRunConfig(prefetch=False)
        assert config_true.to_dict()["prefetch"] is True
        assert config_false.to_dict()["prefetch"] is False

    @pytest.mark.asyncio
    async def test_existing_extraction_still_works(self):
        """Extraction strategies still work in normal mode."""
        from crawl4ai import JsonCssExtractionStrategy
        schema = {
            "name": "Links",
            "baseSelector": "a",
            "fields": [
                {"name": "href", "selector": "", "type": "attribute", "attribute": "href"},
                {"name": "text", "selector": "", "type": "text"}
            ]
        }
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                extraction_strategy=JsonCssExtractionStrategy(schema=schema)
            )
            result = await crawler.arun(TEST_URL, config=config)
            assert result.extracted_content is not None

    @pytest.mark.asyncio
    async def test_existing_deep_crawl_still_works(self):
        """Deep crawl without prefetch still does full processing."""
        from crawl4ai import BFSDeepCrawlStrategy
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=1,
                    max_pages=2
                )
                # No prefetch - should do full processing
            )
            results = await self._collect(
                await crawler.arun(TEST_URL, config=config)
            )
            # Each result should have full processing
            for result in results:
                assert result.cleaned_html is not None
            assert len(results) >= 1

    @pytest.mark.asyncio
    async def test_raw_url_scheme_still_works(self):
        """raw: URL scheme works for processing stored HTML."""
        sample_html = """
        <html>
        <head><title>Test Page</title></head>
        <body>
        <h1>Hello World</h1>
        <p>This is a test paragraph.</p>
        <a href="/link1">Link 1</a>
        </body>
        </html>
        """
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()
            result = await crawler.arun(f"raw:{sample_html}", config=config)
            assert result.success is True
            assert result.html is not None
            assert "Hello World" in result.html
            assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_screenshot_still_works(self):
        """Screenshot option still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(screenshot=True)
            result = await crawler.arun(TEST_URL, config=config)
            assert result.success is True
            # Screenshot data should be present on one of the known fields.
            # (Fixed: use getattr so a missing `screenshot_data` attribute
            # fails the assertion instead of raising AttributeError.)
            assert (
                result.screenshot is not None
                or getattr(result, "screenshot_data", None) is not None
            )

    @pytest.mark.asyncio
    async def test_js_execution_still_works(self):
        """JavaScript execution still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                js_code="document.querySelector('h1')?.textContent"
            )
            result = await crawler.arun(TEST_URL, config=config)
            assert result.success is True
            assert result.html is not None
class TestPrefetchDoesNotAffectOtherModes:
    """Test that prefetch doesn't interfere with other configurations."""

    @pytest.mark.asyncio
    async def test_prefetch_with_other_options_ignored(self):
        """Heavy-processing options are silently ignored while prefetch is on."""
        cfg = CrawlerRunConfig(
            prefetch=True,
            # These should be ignored in prefetch mode
            screenshot=True,
            pdf=True,
            only_text=True,
            word_count_threshold=100
        )
        async with AsyncWebCrawler() as crawler:
            res = await crawler.arun(TEST_URL, config=cfg)
        # Raw HTML and links are still returned...
        assert res.html is not None
        assert res.links is not None
        # ...but no post-processing output is produced.
        assert res.cleaned_html is None
        assert res.extracted_content is None

    @pytest.mark.asyncio
    async def test_stream_mode_still_works(self):
        """Stream mode is unaffected by the prefetch feature."""
        async with AsyncWebCrawler() as crawler:
            res = await crawler.arun(TEST_URL, config=CrawlerRunConfig(stream=True))
            assert res.success is True
            assert res.html is not None

    @pytest.mark.asyncio
    async def test_cache_mode_still_works(self):
        """Cache modes are unaffected by the prefetch feature."""
        from crawl4ai import CacheMode
        async with AsyncWebCrawler() as crawler:
            # First request - bypass cache
            first = await crawler.arun(
                TEST_URL, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
            assert first.success is True
            # Second request - should work
            second = await crawler.arun(
                TEST_URL, config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
            )
            assert second.success is True
class TestBackwardsCompatibility:
    """Test backwards compatibility with existing code patterns."""

    @pytest.mark.asyncio
    async def test_config_without_prefetch_works(self):
        """Configs created without the prefetch parameter keep working."""
        # Simulating old code that doesn't know about prefetch
        config = CrawlerRunConfig(
            word_count_threshold=50,
            css_selector="body"
        )
        # Should default to prefetch=False (identity check per PEP 8 / E712)
        assert config.prefetch is False
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(TEST_URL, config=config)
            assert result.success is True
            assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_from_kwargs_without_prefetch(self):
        """CrawlerRunConfig.from_kwargs defaults prefetch to False."""
        config = CrawlerRunConfig.from_kwargs({
            "word_count_threshold": 50,
            "verbose": False
        })
        assert config.prefetch is False