diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index f289fc28..749ae717 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1291,6 +1291,7 @@ class CrawlerRunConfig():
# Connection Parameters
method: str = "GET",
stream: bool = False,
+ prefetch: bool = False, # When True, return only HTML + links (skip heavy processing)
url: str = None,
base_url: str = None, # Base URL for markdown link resolution (used with raw: HTML)
check_robots_txt: bool = False,
@@ -1422,6 +1423,7 @@ class CrawlerRunConfig():
# Connection Parameters
self.stream = stream
+ self.prefetch = prefetch # Prefetch mode: return only HTML + links
self.method = method
# Robots.txt Handling Parameters
@@ -1694,6 +1696,7 @@ class CrawlerRunConfig():
# Connection Parameters
method=kwargs.get("method", "GET"),
stream=kwargs.get("stream", False),
+ prefetch=kwargs.get("prefetch", False),
check_robots_txt=kwargs.get("check_robots_txt", False),
user_agent=kwargs.get("user_agent"),
user_agent_mode=kwargs.get("user_agent_mode"),
@@ -1799,6 +1802,7 @@ class CrawlerRunConfig():
"capture_console_messages": self.capture_console_messages,
"method": self.method,
"stream": self.stream,
+ "prefetch": self.prefetch,
"check_robots_txt": self.check_robots_txt,
"user_agent": self.user_agent,
"user_agent_mode": self.user_agent_mode,
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 94aec156..95468a28 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -514,6 +514,27 @@ class AsyncWebCrawler:
Returns:
CrawlResult: Processed result containing extracted and formatted content
"""
+ # === PREFETCH MODE SHORT-CIRCUIT ===
+ if getattr(config, 'prefetch', False):
+ from .utils import quick_extract_links
+
+ # Use base_url from config (for raw: URLs), redirected_url, or original url
+ effective_url = getattr(config, 'base_url', None) or kwargs.get('redirected_url') or url
+ links = quick_extract_links(html, effective_url)
+
+ return CrawlResult(
+ url=url,
+ html=html,
+ success=True,
+ links=links,
+ status_code=kwargs.get('status_code'),
+ response_headers=kwargs.get('response_headers'),
+ redirected_url=kwargs.get('redirected_url'),
+ ssl_certificate=kwargs.get('ssl_certificate'),
+ # All other fields default to None
+ )
+ # === END PREFETCH SHORT-CIRCUIT ===
+
cleaned_html = ""
try:
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 7f88ae18..ed12892e 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2461,6 +2461,54 @@ def normalize_url_tmp(href, base_url):
return href.strip()
+def quick_extract_links(html: str, base_url: str) -> Dict[str, List[Dict[str, str]]]:
+ """
+ Fast link extraction for prefetch mode.
+ Only extracts tags - no media, no cleaning, no heavy processing.
+
+ Args:
+ html: Raw HTML string
+ base_url: Base URL for resolving relative links
+
+ Returns:
+ {"internal": [{"href": "...", "text": "..."}], "external": [...]}
+ """
+ from lxml.html import document_fromstring
+
+ try:
+ doc = document_fromstring(html)
+ except Exception:
+ return {"internal": [], "external": []}
+
+ base_domain = get_base_domain(base_url)
+ internal: List[Dict[str, str]] = []
+ external: List[Dict[str, str]] = []
+ seen: Set[str] = set()
+
+ for a in doc.xpath("//a[@href]"):
+ href = a.get("href", "").strip()
+ if not href or href.startswith(("#", "javascript:", "mailto:", "tel:")):
+ continue
+
+ # Normalize URL
+ normalized = normalize_url_for_deep_crawl(href, base_url)
+ if not normalized or normalized in seen:
+ continue
+ seen.add(normalized)
+
+ # Extract text (truncated for memory efficiency)
+ text = (a.text_content() or "").strip()[:200]
+
+ link_data = {"href": normalized, "text": text}
+
+ if is_external_url(normalized, base_domain):
+ external.append(link_data)
+ else:
+ internal.append(link_data)
+
+ return {"internal": internal, "external": external}
+
+
def get_base_domain(url: str) -> str:
"""
Extract the base domain from a given URL, handling common edge cases.
diff --git a/tests/test_prefetch_integration.py b/tests/test_prefetch_integration.py
new file mode 100644
index 00000000..77ed9428
--- /dev/null
+++ b/tests/test_prefetch_integration.py
@@ -0,0 +1,236 @@
+"""Integration tests for prefetch mode with the crawler."""
+
+import pytest
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
+
+# Use crawl4ai docs as test domain
+TEST_DOMAIN = "https://docs.crawl4ai.com"
+
+
+class TestPrefetchModeIntegration:
+ """Integration tests for prefetch mode."""
+
+ @pytest.mark.asyncio
+ async def test_prefetch_returns_html_and_links(self):
+ """Test that prefetch mode returns HTML and links only."""
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(prefetch=True)
+ result = await crawler.arun(TEST_DOMAIN, config=config)
+
+ # Should have HTML
+ assert result.html is not None
+ assert len(result.html) > 0
+            assert "<a" in result.html
+
+            # Should have extracted links
+            assert result.links is not None
+            assert len(result.links["internal"]) >= 1
+
+ @pytest.mark.asyncio
+ async def test_prefetch_then_process_with_raw(self):
+ """Test the full two-phase workflow: prefetch then process."""
+ async with AsyncWebCrawler() as crawler:
+ # Phase 1: Prefetch
+ prefetch_config = CrawlerRunConfig(prefetch=True)
+ prefetch_result = await crawler.arun(TEST_DOMAIN, config=prefetch_config)
+
+ stored_html = prefetch_result.html
+
+ assert stored_html is not None
+ assert len(stored_html) > 0
+
+ # Phase 2: Process with raw: URL
+ process_config = CrawlerRunConfig(
+ # No prefetch - full processing
+ base_url=TEST_DOMAIN # Provide base URL for link resolution
+ )
+ processed_result = await crawler.arun(
+ f"raw:{stored_html}",
+ config=process_config
+ )
+
+ # Should now have full processing
+ assert processed_result.html is not None
+ assert processed_result.success is True
+ # Note: cleaned_html and markdown depend on the content
+
+ @pytest.mark.asyncio
+ async def test_prefetch_links_structure(self):
+ """Test that links have the expected structure."""
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(prefetch=True)
+ result = await crawler.arun(TEST_DOMAIN, config=config)
+
+ assert result.links is not None
+
+ # Check internal links structure
+ if result.links["internal"]:
+ link = result.links["internal"][0]
+ assert "href" in link
+ assert "text" in link
+ assert link["href"].startswith("http")
+
+ # Check external links structure (if any)
+ if result.links["external"]:
+ link = result.links["external"][0]
+ assert "href" in link
+ assert "text" in link
+ assert link["href"].startswith("http")
+
+ @pytest.mark.asyncio
+ async def test_prefetch_config_clone(self):
+ """Test that config.clone() preserves prefetch setting."""
+ config = CrawlerRunConfig(prefetch=True)
+ cloned = config.clone()
+
+ assert cloned.prefetch == True
+
+ # Clone with override
+ cloned_false = config.clone(prefetch=False)
+ assert cloned_false.prefetch == False
+
+ @pytest.mark.asyncio
+ async def test_prefetch_to_dict(self):
+ """Test that to_dict() includes prefetch."""
+ config = CrawlerRunConfig(prefetch=True)
+ config_dict = config.to_dict()
+
+ assert "prefetch" in config_dict
+ assert config_dict["prefetch"] == True
+
+ @pytest.mark.asyncio
+ async def test_prefetch_default_false(self):
+ """Test that prefetch defaults to False."""
+ config = CrawlerRunConfig()
+ assert config.prefetch == False
+
+ @pytest.mark.asyncio
+ async def test_prefetch_explicit_false(self):
+ """Test explicit prefetch=False works like default."""
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(prefetch=False)
+ result = await crawler.arun(TEST_DOMAIN, config=config)
+
+ # Should have full processing
+ assert result.html is not None
+ # cleaned_html should be populated in normal mode
+ assert result.cleaned_html is not None
+
+
+class TestPrefetchPerformance:
+ """Performance-related tests for prefetch mode."""
+
+ @pytest.mark.asyncio
+ async def test_prefetch_returns_quickly(self):
+ """Test that prefetch mode returns results quickly."""
+ import time
+
+ async with AsyncWebCrawler() as crawler:
+ # Prefetch mode
+ start = time.time()
+ prefetch_config = CrawlerRunConfig(prefetch=True)
+ await crawler.arun(TEST_DOMAIN, config=prefetch_config)
+ prefetch_time = time.time() - start
+
+ # Full mode
+ start = time.time()
+ full_config = CrawlerRunConfig()
+ await crawler.arun(TEST_DOMAIN, config=full_config)
+ full_time = time.time() - start
+
+ # Log times for debugging
+ print(f"\nPrefetch: {prefetch_time:.3f}s, Full: {full_time:.3f}s")
+
+ # Prefetch should not be significantly slower
+ # (may be same or slightly faster depending on content)
+ # This is a soft check - mostly for logging
+
+
+class TestPrefetchWithRawHTML:
+ """Test prefetch mode with raw HTML input."""
+
+ @pytest.mark.asyncio
+ async def test_prefetch_with_raw_html(self):
+ """Test prefetch mode works with raw: URL scheme."""
+ sample_html = """
+        <html>
+        <body>
+        <h1>Hello World</h1>
+        <a href="/page1">Link 1</a>
+        <a href="/page2">Link 2</a>
+        <a href="https://external.com">External</a>
+        </body></html>
+ """
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ prefetch=True,
+ base_url="https://example.com"
+ )
+ result = await crawler.arun(f"raw:{sample_html}", config=config)
+
+ assert result.success is True
+ assert result.html is not None
+ assert result.links is not None
+
+ # Should have extracted links
+ assert len(result.links["internal"]) >= 2
+ assert len(result.links["external"]) >= 1
diff --git a/tests/test_prefetch_mode.py b/tests/test_prefetch_mode.py
new file mode 100644
index 00000000..fdbaa963
--- /dev/null
+++ b/tests/test_prefetch_mode.py
@@ -0,0 +1,275 @@
+"""Unit tests for the quick_extract_links function used in prefetch mode."""
+
+import pytest
+from crawl4ai.utils import quick_extract_links
+
+
+class TestQuickExtractLinks:
+ """Unit tests for the quick_extract_links function."""
+
+ def test_basic_internal_links(self):
+ """Test extraction of internal links."""
+ html = '''
+
+
Just some text content.
+ + + ''' + result = quick_extract_links(html, "https://example.com") + + assert result["internal"] == [] + assert result["external"] == [] + + def test_links_in_nested_elements(self): + """Test links nested in various elements.""" + html = ''' + + + +Check out our products.
+This is a test paragraph.
+ Link 1 + + + """ + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig() + result = await crawler.arun(f"raw:{sample_html}", config=config) + + assert result.success is True + assert result.html is not None + assert "Hello World" in result.html + assert result.cleaned_html is not None + + @pytest.mark.asyncio + async def test_screenshot_still_works(self): + """Test that screenshot option still works in normal mode.""" + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig(screenshot=True) + result = await crawler.arun(TEST_URL, config=config) + + assert result.success is True + # Screenshot data should be present + assert result.screenshot is not None or result.screenshot_data is not None + + @pytest.mark.asyncio + async def test_js_execution_still_works(self): + """Test that JavaScript execution still works in normal mode.""" + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + js_code="document.querySelector('h1')?.textContent" + ) + result = await crawler.arun(TEST_URL, config=config) + + assert result.success is True + assert result.html is not None + + +class TestPrefetchDoesNotAffectOtherModes: + """Test that prefetch doesn't interfere with other configurations.""" + + @pytest.mark.asyncio + async def test_prefetch_with_other_options_ignored(self): + """Test that other options are properly ignored in prefetch mode.""" + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + prefetch=True, + # These should be ignored in prefetch mode + screenshot=True, + pdf=True, + only_text=True, + word_count_threshold=100 + ) + result = await crawler.arun(TEST_URL, config=config) + + # Should still return HTML and links + assert result.html is not None + assert result.links is not None + + # But should NOT have processed content + assert result.cleaned_html is None + assert result.extracted_content is None + + @pytest.mark.asyncio + async def test_stream_mode_still_works(self): + """Test that stream 
mode still works normally.""" + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig(stream=True) + result = await crawler.arun(TEST_URL, config=config) + + assert result.success is True + assert result.html is not None + + @pytest.mark.asyncio + async def test_cache_mode_still_works(self): + """Test that cache mode still works normally.""" + from crawl4ai import CacheMode + + async with AsyncWebCrawler() as crawler: + # First request - bypass cache + config1 = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + result1 = await crawler.arun(TEST_URL, config=config1) + assert result1.success is True + + # Second request - should work + config2 = CrawlerRunConfig(cache_mode=CacheMode.ENABLED) + result2 = await crawler.arun(TEST_URL, config=config2) + assert result2.success is True + + +class TestBackwardsCompatibility: + """Test backwards compatibility with existing code patterns.""" + + @pytest.mark.asyncio + async def test_config_without_prefetch_works(self): + """Test that configs created without prefetch parameter work.""" + # Simulating old code that doesn't know about prefetch + config = CrawlerRunConfig( + word_count_threshold=50, + css_selector="body" + ) + + # Should default to prefetch=False + assert config.prefetch == False + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(TEST_URL, config=config) + assert result.success is True + assert result.cleaned_html is not None + + @pytest.mark.asyncio + async def test_from_kwargs_without_prefetch(self): + """Test CrawlerRunConfig.from_kwargs works without prefetch.""" + config = CrawlerRunConfig.from_kwargs({ + "word_count_threshold": 50, + "verbose": False + }) + + assert config.prefetch == False