diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index f289fc28..749ae717 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1291,6 +1291,7 @@ class CrawlerRunConfig():
         # Connection Parameters
         method: str = "GET",
         stream: bool = False,
+        prefetch: bool = False,  # When True, return only HTML + links (skip heavy processing)
         url: str = None,
         base_url: str = None,  # Base URL for markdown link resolution (used with raw: HTML)
         check_robots_txt: bool = False,
@@ -1422,6 +1423,7 @@ class CrawlerRunConfig():
         # Connection Parameters
         self.stream = stream
+        self.prefetch = prefetch  # Prefetch mode: return only HTML + links
         self.method = method

         # Robots.txt Handling Parameters
@@ -1694,6 +1696,7 @@ class CrawlerRunConfig():
             # Connection Parameters
             method=kwargs.get("method", "GET"),
             stream=kwargs.get("stream", False),
+            prefetch=kwargs.get("prefetch", False),
             check_robots_txt=kwargs.get("check_robots_txt", False),
             user_agent=kwargs.get("user_agent"),
             user_agent_mode=kwargs.get("user_agent_mode"),
@@ -1799,6 +1802,7 @@ class CrawlerRunConfig():
             "capture_console_messages": self.capture_console_messages,
             "method": self.method,
             "stream": self.stream,
+            "prefetch": self.prefetch,
             "check_robots_txt": self.check_robots_txt,
             "user_agent": self.user_agent,
             "user_agent_mode": self.user_agent_mode,
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 94aec156..95468a28 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -514,6 +514,27 @@ class AsyncWebCrawler:
         Returns:
             CrawlResult: Processed result containing extracted and formatted content
         """
+        # === PREFETCH MODE SHORT-CIRCUIT ===
+        if getattr(config, 'prefetch', False):
+            from .utils import quick_extract_links
+
+            # Use base_url from config (for raw: URLs), redirected_url, or original url
+            effective_url = getattr(config, 'base_url', None) or kwargs.get('redirected_url') or url
+            links = quick_extract_links(html, effective_url)
+
+            return CrawlResult(
+                url=url,
+                html=html,
+                success=True,
+                links=links,
+                status_code=kwargs.get('status_code'),
+                response_headers=kwargs.get('response_headers'),
+                redirected_url=kwargs.get('redirected_url'),
+                ssl_certificate=kwargs.get('ssl_certificate'),
+                # All other fields default to None
+            )
+        # === END PREFETCH SHORT-CIRCUIT ===
+
         cleaned_html = ""
         try:
             _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 7f88ae18..ed12892e 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2461,6 +2461,54 @@ def normalize_url_tmp(href, base_url):
     return href.strip()


+def quick_extract_links(html: str, base_url: str) -> Dict[str, List[Dict[str, str]]]:
+    """
+    Fast link extraction for prefetch mode.
+    Only extracts <a> tags - no media, no cleaning, no heavy processing.
+
+    Args:
+        html: Raw HTML string
+        base_url: Base URL for resolving relative links
+
+    Returns:
+        {"internal": [{"href": "...", "text": "..."}], "external": [...]}
+    """
+    from lxml.html import document_fromstring
+
+    try:
+        doc = document_fromstring(html)
+    except Exception:
+        return {"internal": [], "external": []}
+
+    base_domain = get_base_domain(base_url)
+    internal: List[Dict[str, str]] = []
+    external: List[Dict[str, str]] = []
+    seen: Set[str] = set()
+
+    for a in doc.xpath("//a[@href]"):
+        href = a.get("href", "").strip()
+        if not href or href.startswith(("#", "javascript:", "mailto:", "tel:")):
+            continue
+
+        # Normalize URL
+        normalized = normalize_url_for_deep_crawl(href, base_url)
+        if not normalized or normalized in seen:
+            continue
+        seen.add(normalized)
+
+        # Extract text (truncated for memory efficiency)
+        text = (a.text_content() or "").strip()[:200]
+
+        link_data = {"href": normalized, "text": text}
+
+        if is_external_url(normalized, base_domain):
+            external.append(link_data)
+        else:
+            internal.append(link_data)
+
+    return {"internal": internal, "external": external}
+
+
 def get_base_domain(url: str) -> str:
     """
     Extract the base domain from a given URL, handling common edge cases.
diff --git a/tests/test_prefetch_integration.py b/tests/test_prefetch_integration.py
new file mode 100644
index 00000000..77ed9428
--- /dev/null
+++ b/tests/test_prefetch_integration.py
@@ -0,0 +1,236 @@
+"""Integration tests for prefetch mode with the crawler."""
+
+import pytest
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
+
+# Use crawl4ai docs as test domain
+TEST_DOMAIN = "https://docs.crawl4ai.com"
+
+
+class TestPrefetchModeIntegration:
+    """Integration tests for prefetch mode."""
+
+    @pytest.mark.asyncio
+    async def test_prefetch_returns_html_and_links(self):
+        """Test that prefetch mode returns HTML and links only."""
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig(prefetch=True)
+            result = await crawler.arun(TEST_DOMAIN, config=config)
+
+            # Should have HTML
+            assert result.html is not None
+            assert len(result.html) > 0
+            assert "<html" in result.html.lower()
+
+            # Should have links
+            assert result.links is not None
+            assert len(result.links["internal"]) + len(result.links["external"]) >= 1
+
+    @pytest.mark.asyncio
+    async def test_prefetch_then_process_with_raw(self):
+        """Test the full two-phase workflow: prefetch then process."""
+        async with AsyncWebCrawler() as crawler:
+            # Phase 1: Prefetch
+            prefetch_config = CrawlerRunConfig(prefetch=True)
+            prefetch_result = await crawler.arun(TEST_DOMAIN, config=prefetch_config)
+
+            stored_html = prefetch_result.html
+
+            assert stored_html is not None
+            assert len(stored_html) > 0
+
+            # Phase 2: Process with raw: URL
+            process_config = CrawlerRunConfig(
+                # No prefetch - full processing
+                base_url=TEST_DOMAIN  # Provide base URL for link resolution
+            )
+            processed_result = await crawler.arun(
+                f"raw:{stored_html}",
+                config=process_config
+            )
+
+            # Should now have full processing
+            assert processed_result.html is not None
+            assert processed_result.success is True
+            # Note: cleaned_html and markdown depend on the content
+
+    @pytest.mark.asyncio
+    async def test_prefetch_links_structure(self):
+        """Test that links have the expected structure."""
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig(prefetch=True)
+            result = await crawler.arun(TEST_DOMAIN, config=config)
+
+            assert result.links is not None
+
+            # Check internal links structure
+            if result.links["internal"]:
+                link = result.links["internal"][0]
+                assert "href" in link
+                assert "text" in link
+                assert link["href"].startswith("http")
+
+            # Check external links structure (if any)
+            if result.links["external"]:
+                link = result.links["external"][0]
+                assert "href" in link
+                assert "text" in link
+                assert link["href"].startswith("http")
+
+    @pytest.mark.asyncio
+    async def test_prefetch_config_clone(self):
+        """Test that config.clone() preserves prefetch setting."""
+        config = CrawlerRunConfig(prefetch=True)
+        cloned = config.clone()
+
+        assert cloned.prefetch == True
+
+        # Clone with override
+        cloned_false = config.clone(prefetch=False)
+        assert cloned_false.prefetch == False
+
+    @pytest.mark.asyncio
+    async def test_prefetch_to_dict(self):
+        """Test that to_dict() includes prefetch."""
+        config = CrawlerRunConfig(prefetch=True)
+        config_dict = config.to_dict()
+
+        assert "prefetch" in config_dict
+        assert config_dict["prefetch"] == True
+
+    @pytest.mark.asyncio
+    async def test_prefetch_default_false(self):
+        """Test that prefetch defaults to False."""
+        config = CrawlerRunConfig()
+        assert config.prefetch == False
+
+    @pytest.mark.asyncio
+    async def test_prefetch_explicit_false(self):
+        """Test explicit prefetch=False works like default."""
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig(prefetch=False)
+            result = await crawler.arun(TEST_DOMAIN, config=config)
+
+            # Should have full processing
+            assert result.html is not None
+            # cleaned_html should be populated in normal mode
+            assert result.cleaned_html is not None
+
+
+class TestPrefetchPerformance:
+    """Performance-related tests for prefetch mode."""
+
+    @pytest.mark.asyncio
+    async def test_prefetch_returns_quickly(self):
+        """Test that prefetch mode returns results quickly."""
+        import time
+
+        async with AsyncWebCrawler() as crawler:
+            # Prefetch mode
+            start = time.time()
+            prefetch_config = CrawlerRunConfig(prefetch=True)
+            await crawler.arun(TEST_DOMAIN, config=prefetch_config)
+            prefetch_time = time.time() - start
+
+            # Full mode
+            start = time.time()
+            full_config = CrawlerRunConfig()
+            await crawler.arun(TEST_DOMAIN, config=full_config)
+            full_time = time.time() - start
+
+            # Log times for debugging
+            print(f"\nPrefetch: {prefetch_time:.3f}s, Full: {full_time:.3f}s")
+
+            # Prefetch should not be significantly slower
+            # (may be same or slightly faster depending on content)
+            # This is a soft check - mostly for logging
+
+
+class TestPrefetchWithRawHTML:
+    """Test prefetch mode with raw HTML input."""
+
+    @pytest.mark.asyncio
+    async def test_prefetch_with_raw_html(self):
+        """Test prefetch mode works with raw: URL scheme."""
+        sample_html = """
+        <html>
+        <head><title>Test Page</title></head>
+        <body>
+            <h1>Hello World</h1>
+            <a href="/page1">Link 1</a>
+            <a href="/page2">Link 2</a>
+            <a href="https://external.com">External</a>
+        </body>
+        </html>
+        """
+
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig(
+                prefetch=True,
+                base_url="https://example.com"
+            )
+            result = await crawler.arun(f"raw:{sample_html}", config=config)
+
+            assert result.success is True
+            assert result.html is not None
+            assert result.links is not None
+
+            # Should have extracted links
+            assert len(result.links["internal"]) >= 2
+            assert len(result.links["external"]) >= 1
diff --git a/tests/test_prefetch_mode.py b/tests/test_prefetch_mode.py
new file mode 100644
index 00000000..fdbaa963
--- /dev/null
+++ b/tests/test_prefetch_mode.py
@@ -0,0 +1,275 @@
+"""Unit tests for the quick_extract_links function used in prefetch mode."""
+
+import pytest
+from crawl4ai.utils import quick_extract_links
+
+
+class TestQuickExtractLinks:
+    """Unit tests for the quick_extract_links function."""
+
+    def test_basic_internal_links(self):
+        """Test extraction of internal links."""
+        html = '''
+        <html>
+        <body>
+            <a href="/page1">Page 1</a>
+            <a href="/page2">Page 2</a>
+            <a href="/page3">Page 3</a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        assert len(result["internal"]) == 3
+        assert result["internal"][0]["href"] == "https://example.com/page1"
+        assert result["internal"][0]["text"] == "Page 1"
+
+    def test_external_links(self):
+        """Test extraction and classification of external links."""
+        html = '''
+        <html>
+        <body>
+            <a href="https://other.com/page">External</a>
+            <a href="/internal">Internal</a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        assert len(result["internal"]) == 1
+        assert len(result["external"]) == 1
+        assert result["external"][0]["href"] == "https://other.com/page"
+
+    def test_ignores_javascript_and_mailto(self):
+        """Test that javascript: and mailto: links are ignored."""
+        html = '''
+        <html>
+        <body>
+            <a href="javascript:void(0)">Click</a>
+            <a href="mailto:test@example.com">Email</a>
+            <a href="tel:+1234567890">Call</a>
+            <a href="/valid">Valid</a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        assert len(result["internal"]) == 1
+        assert result["internal"][0]["href"] == "https://example.com/valid"
+
+    def test_ignores_anchor_only_links(self):
+        """Test that anchor-only links (#section) are ignored."""
+        html = '''
+        <html>
+        <body>
+            <a href="#section1">Section 1</a>
+            <a href="#section2">Section 2</a>
+            <a href="/page#anchor">Page with anchor</a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        # Only the page link should be included, anchor-only links are skipped
+        assert len(result["internal"]) == 1
+        assert "/page" in result["internal"][0]["href"]
+
+    def test_deduplication(self):
+        """Test that duplicate URLs are deduplicated."""
+        html = '''
+        <html>
+        <body>
+            <a href="/page">Link 1</a>
+            <a href="/page">Link 2</a>
+            <a href="/page">Link 3</a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        assert len(result["internal"]) == 1
+
+    def test_handles_malformed_html(self):
+        """Test graceful handling of malformed HTML."""
+        html = "not valid html at all <><><"
+        result = quick_extract_links(html, "https://example.com")
+
+        # Should not raise, should return empty
+        assert result["internal"] == []
+        assert result["external"] == []
+
+    def test_empty_html(self):
+        """Test handling of empty HTML."""
+        result = quick_extract_links("", "https://example.com")
+        assert result == {"internal": [], "external": []}
+
+    def test_relative_url_resolution(self):
+        """Test that relative URLs are resolved correctly."""
+        html = '''
+        <html>
+        <body>
+            <a href="page.html">Relative</a>
+            <a href="./other.html">Dot Relative</a>
+            <a href="../parent.html">Parent Relative</a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com/docs/")
+
+        assert len(result["internal"]) >= 1
+        # All should be internal and properly resolved
+        for link in result["internal"]:
+            assert link["href"].startswith("https://example.com")
+
+    def test_text_truncation(self):
+        """Test that long link text is truncated to 200 chars."""
+        long_text = "A" * 300
+        html = f'''
+        <html>
+        <body>
+            <a href="/page">{long_text}</a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        assert len(result["internal"]) == 1
+        assert len(result["internal"][0]["text"]) == 200
+
+    def test_empty_href_ignored(self):
+        """Test that empty href attributes are ignored."""
+        html = '''
+        <html>
+        <body>
+            <a href="">Empty</a>
+            <a href="   ">Whitespace</a>
+            <a href="/valid">Valid</a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        assert len(result["internal"]) == 1
+        assert result["internal"][0]["href"] == "https://example.com/valid"
+
+    def test_mixed_internal_external(self):
+        """Test correct classification of mixed internal and external links."""
+        html = '''
+        <html>
+        <body>
+            <a href="/page1">Internal 1</a>
+            <a href="/page2">Internal 2</a>
+            <a href="https://google.com">Google</a>
+            <a href="https://github.com">GitHub</a>
+            <a href="/page3">Internal 3</a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        assert len(result["internal"]) == 3
+        assert len(result["external"]) == 2
+
+    def test_subdomain_handling(self):
+        """Test that subdomains are handled correctly."""
+        html = '''
+        <html>
+        <body>
+            <a href="https://docs.example.com/">Docs subdomain</a>
+            <a href="https://api.example.com/">API subdomain</a>
+            <a href="https://example.com/page">Main domain</a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        # All should be internal (same base domain)
+        total_links = len(result["internal"]) + len(result["external"])
+        assert total_links == 3
+
+
+class TestQuickExtractLinksEdgeCases:
+    """Edge case tests for quick_extract_links."""
+
+    def test_no_links_in_page(self):
+        """Test page with no links."""
+        html = '''
+        <html>
+        <body>
+            <h1>No Links Here</h1>
+            <p>Just some text content.</p>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        assert result["internal"] == []
+        assert result["external"] == []
+
+    def test_links_in_nested_elements(self):
+        """Test links nested in various elements."""
+        html = '''
+        <html>
+        <body>
+            <nav>
+                <a href="/home">Home</a>
+                <a href="/about">About</a>
+            </nav>
+            <div>
+                <p>Check out <a href="/products">our products</a>.</p>
+            </div>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        assert len(result["internal"]) == 3
+
+    def test_link_with_nested_elements(self):
+        """Test links containing nested elements."""
+        html = '''
+        <html>
+        <body>
+            <a href="/page"><strong>Nested</strong> <em>Text</em></a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        assert len(result["internal"]) == 1
+        assert "Nested" in result["internal"][0]["text"]
+        assert "Text" in result["internal"][0]["text"]
+
+    def test_protocol_relative_urls(self):
+        """Test handling of protocol-relative URLs (//example.com)."""
+        html = '''
+        <html>
+        <body>
+            <a href="//cdn.example.com/resource">CDN Link</a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        # Should be resolved with https:
+        total = len(result["internal"]) + len(result["external"])
+        assert total >= 1
+
+    def test_whitespace_in_href(self):
+        """Test handling of whitespace around href values."""
+        html = '''
+        <html>
+        <body>
+            <a href="   /padded   ">Padded</a>
+            <a href="
+            /multiline
+            ">Multiline</a>
+        </body>
+        </html>
+        '''
+        result = quick_extract_links(html, "https://example.com")
+
+        # Both should be extracted and normalized
+        assert len(result["internal"]) >= 1
diff --git a/tests/test_prefetch_regression.py b/tests/test_prefetch_regression.py
new file mode 100644
index 00000000..515e90cd
--- /dev/null
+++ b/tests/test_prefetch_regression.py
@@ -0,0 +1,232 @@
+"""Regression tests to ensure prefetch mode doesn't break existing functionality."""
+
+import pytest
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+TEST_URL = "https://docs.crawl4ai.com"
+
+
+class TestNoRegressions:
+    """Ensure prefetch mode doesn't break existing functionality."""
+
+    @pytest.mark.asyncio
+    async def test_default_mode_unchanged(self):
+        """Test that default mode (prefetch=False) works exactly as before."""
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig()  # Default config
+            result = await crawler.arun(TEST_URL, config=config)
+
+            # All standard fields should be populated
+            assert result.html is not None
+            assert result.cleaned_html is not None
+            assert result.links is not None
+            assert result.success is True
+
+    @pytest.mark.asyncio
+    async def test_explicit_prefetch_false(self):
+        """Test explicit prefetch=False works like default."""
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig(prefetch=False)
+            result = await crawler.arun(TEST_URL, config=config)
+
+            assert result.cleaned_html is not None
+
+    @pytest.mark.asyncio
+    async def test_config_clone_preserves_prefetch(self):
+        """Test that config.clone() preserves prefetch setting."""
+        config = CrawlerRunConfig(prefetch=True)
+        cloned = config.clone()
+
+        assert cloned.prefetch == True
+
+        # Clone with override
+        cloned_false = config.clone(prefetch=False)
+        assert cloned_false.prefetch == False
+
+    @pytest.mark.asyncio
+    async def test_config_to_dict_includes_prefetch(self):
+        """Test that to_dict() includes prefetch."""
+        config_true = CrawlerRunConfig(prefetch=True)
+        config_false = CrawlerRunConfig(prefetch=False)
+
+        assert config_true.to_dict()["prefetch"] == True
+        assert config_false.to_dict()["prefetch"] == False
+
+    @pytest.mark.asyncio
+    async def test_existing_extraction_still_works(self):
+        """Test that extraction strategies still work in normal mode."""
+        from crawl4ai import JsonCssExtractionStrategy
+
+        schema = {
+            "name": "Links",
+            "baseSelector": "a",
+            "fields": [
+                {"name": "href", "selector": "", "type": "attribute", "attribute": "href"},
+                {"name": "text", "selector": "", "type": "text"}
+            ]
+        }
+
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig(
+                extraction_strategy=JsonCssExtractionStrategy(schema=schema)
+            )
+            result = await crawler.arun(TEST_URL, config=config)
+
+            assert result.extracted_content is not None
+
+    @pytest.mark.asyncio
+    async def test_existing_deep_crawl_still_works(self):
+        """Test that deep crawl without prefetch still does full processing."""
+        from crawl4ai import BFSDeepCrawlStrategy
+
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig(
+                deep_crawl_strategy=BFSDeepCrawlStrategy(
+                    max_depth=1,
+                    max_pages=2
+                )
+                # No prefetch - should do full processing
+            )
+
+            result_container = await crawler.arun(TEST_URL, config=config)
+
+            # Handle both list and iterator results
+            if hasattr(result_container, '__aiter__'):
+                results = [r async for r in result_container]
+            else:
+                results = list(result_container) if hasattr(result_container, '__iter__') else [result_container]
+
+            # Each result should have full processing
+            for result in results:
+                assert result.cleaned_html is not None
+
+            assert len(results) >= 1
+
+    @pytest.mark.asyncio
+    async def test_raw_url_scheme_still_works(self):
+        """Test that raw: URL scheme works for processing stored HTML."""
+        sample_html = """
+        <html>
+        <head><title>Test Page</title></head>
+        <body>
+            <h1>Hello World</h1>
+            <p>This is a test paragraph.</p>
+            <a href="/link1">Link 1</a>
+        </body>
+        </html>
+        """
+
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig()
+            result = await crawler.arun(f"raw:{sample_html}", config=config)
+
+            assert result.success is True
+            assert result.html is not None
+            assert "Hello World" in result.html
+            assert result.cleaned_html is not None
+
+    @pytest.mark.asyncio
+    async def test_screenshot_still_works(self):
+        """Test that screenshot option still works in normal mode."""
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig(screenshot=True)
+            result = await crawler.arun(TEST_URL, config=config)
+
+            assert result.success is True
+            # Screenshot data should be present
+            assert result.screenshot is not None or result.screenshot_data is not None
+
+    @pytest.mark.asyncio
+    async def test_js_execution_still_works(self):
+        """Test that JavaScript execution still works in normal mode."""
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig(
+                js_code="document.querySelector('h1')?.textContent"
+            )
+            result = await crawler.arun(TEST_URL, config=config)
+
+            assert result.success is True
+            assert result.html is not None
+
+
+class TestPrefetchDoesNotAffectOtherModes:
+    """Test that prefetch doesn't interfere with other configurations."""
+
+    @pytest.mark.asyncio
+    async def test_prefetch_with_other_options_ignored(self):
+        """Test that other options are properly ignored in prefetch mode."""
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig(
+                prefetch=True,
+                # These should be ignored in prefetch mode
+                screenshot=True,
+                pdf=True,
+                only_text=True,
+                word_count_threshold=100
+            )
+            result = await crawler.arun(TEST_URL, config=config)
+
+            # Should still return HTML and links
+            assert result.html is not None
+            assert result.links is not None
+
+            # But should NOT have processed content
+            assert result.cleaned_html is None
+            assert result.extracted_content is None
+
+    @pytest.mark.asyncio
+    async def test_stream_mode_still_works(self):
+        """Test that stream mode still works normally."""
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig(stream=True)
+            result = await crawler.arun(TEST_URL, config=config)
+
+            assert result.success is True
+            assert result.html is not None
+
+    @pytest.mark.asyncio
+    async def test_cache_mode_still_works(self):
+        """Test that cache mode still works normally."""
+        from crawl4ai import CacheMode
+
+        async with AsyncWebCrawler() as crawler:
+            # First request - bypass cache
+            config1 = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            result1 = await crawler.arun(TEST_URL, config=config1)
+            assert result1.success is True
+
+            # Second request - should work
+            config2 = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
+            result2 = await crawler.arun(TEST_URL, config=config2)
+            assert result2.success is True
+
+
+class TestBackwardsCompatibility:
+    """Test backwards compatibility with existing code patterns."""
+
+    @pytest.mark.asyncio
+    async def test_config_without_prefetch_works(self):
+        """Test that configs created without prefetch parameter work."""
+        # Simulating old code that doesn't know about prefetch
+        config = CrawlerRunConfig(
+            word_count_threshold=50,
+            css_selector="body"
+        )
+
+        # Should default to prefetch=False
+        assert config.prefetch == False
+
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun(TEST_URL, config=config)
+            assert result.success is True
+            assert result.cleaned_html is not None
+
+    @pytest.mark.asyncio
+    async def test_from_kwargs_without_prefetch(self):
+        """Test CrawlerRunConfig.from_kwargs works without prefetch."""
+        config = CrawlerRunConfig.from_kwargs({
+            "word_count_threshold": 50,
+            "verbose": False
+        })
+
+        assert config.prefetch == False
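
For reviewers, a minimal sketch of the two-phase workflow this patch enables, mirroring test_prefetch_then_process_with_raw above. The target URL and script scaffolding are illustrative assumptions, not part of the patch; only prefetch, base_url, the raw: scheme, and the result fields shown come from the diff.

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    async with AsyncWebCrawler() as crawler:
        # Phase 1: prefetch - fetch HTML and extract links only, skipping
        # cleaning, markdown generation, and extraction strategies.
        fast = await crawler.arun(
            "https://example.com",  # hypothetical target URL
            config=CrawlerRunConfig(prefetch=True),
        )
        print(f"{len(fast.links['internal'])} internal, "
              f"{len(fast.links['external'])} external links")

        # Phase 2: run full processing later on the stored HTML via the
        # raw: scheme; base_url restores relative-link resolution.
        full = await crawler.arun(
            f"raw:{fast.html}",
            config=CrawlerRunConfig(base_url="https://example.com"),
        )
        print(full.cleaned_html is not None)  # full pipeline ran

asyncio.run(main())

The point of the split is cost control: a deep crawl can prefetch a frontier cheaply, inspect the extracted links, and pay for cleaning and extraction only on the pages that matter.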