Add prefetch mode for two-phase deep crawling
- Add `prefetch` parameter to CrawlerRunConfig
- Add `quick_extract_links()` function for fast link extraction
- Add short-circuit in aprocess_html() for prefetch mode
- Add 42 tests (unit, integration, regression)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
236
tests/test_prefetch_integration.py
Normal file
236
tests/test_prefetch_integration.py
Normal file
@@ -0,0 +1,236 @@
|
||||
"""Integration tests for prefetch mode with the crawler."""
|
||||
|
||||
import pytest
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
|
||||
|
||||
# Use crawl4ai docs as test domain
|
||||
TEST_DOMAIN = "https://docs.crawl4ai.com"
|
||||
|
||||
|
||||
class TestPrefetchModeIntegration:
    """Integration tests for prefetch mode (fetch HTML + links, skip processing)."""

    @pytest.mark.asyncio
    async def test_prefetch_returns_html_and_links(self):
        """Test that prefetch mode returns HTML and links only."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=True)
            result = await crawler.arun(TEST_DOMAIN, config=config)

            # Should have raw HTML
            assert result.html is not None
            assert len(result.html) > 0
            assert "<html" in result.html.lower() or "<!doctype" in result.html.lower()

            # Should have a links dict with both buckets
            assert result.links is not None
            assert "internal" in result.links
            assert "external" in result.links

            # Should NOT have processed content: markdown is absent or empty
            assert result.markdown is None or (
                hasattr(result.markdown, 'raw_markdown') and
                result.markdown.raw_markdown is None
            )
            assert result.cleaned_html is None
            assert result.extracted_content is None

    @pytest.mark.asyncio
    async def test_prefetch_preserves_metadata(self):
        """Test that prefetch mode preserves essential metadata."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=True)
            result = await crawler.arun(TEST_DOMAIN, config=config)

            # Should have success flag
            assert result.success is True

            # Should have URL
            assert result.url is not None

            # Status code should be present.  (The original
            # `is not None or == 200` made the second clause dead code.)
            assert result.status_code is not None

    @pytest.mark.asyncio
    async def test_prefetch_with_deep_crawl(self):
        """Test prefetch mode combined with a BFS deep-crawl strategy."""
        from crawl4ai import BFSDeepCrawlStrategy

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                prefetch=True,
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=1,
                    max_pages=3
                )
            )

            result_container = await crawler.arun(TEST_DOMAIN, config=config)

            # Handle async-iterator, sync-iterable, and single-result returns
            if hasattr(result_container, '__aiter__'):
                results = [r async for r in result_container]
            else:
                results = list(result_container) if hasattr(result_container, '__iter__') else [result_container]

            # Each crawled page should carry raw HTML and extracted links
            for result in results:
                assert result.html is not None
                assert result.links is not None

            # Should have crawled at least one page
            assert len(results) >= 1

    @pytest.mark.asyncio
    async def test_prefetch_then_process_with_raw(self):
        """Test the full two-phase workflow: prefetch, then process via raw: URL."""
        async with AsyncWebCrawler() as crawler:
            # Phase 1: fast fetch, keep the raw HTML
            prefetch_config = CrawlerRunConfig(prefetch=True)
            prefetch_result = await crawler.arun(TEST_DOMAIN, config=prefetch_config)

            stored_html = prefetch_result.html
            assert stored_html is not None
            assert len(stored_html) > 0

            # Phase 2: full processing of the stored HTML via the raw: scheme
            process_config = CrawlerRunConfig(
                # No prefetch - full processing
                base_url=TEST_DOMAIN  # Provide base URL for link resolution
            )
            processed_result = await crawler.arun(
                f"raw:{stored_html}",
                config=process_config
            )

            # Should now have full processing
            assert processed_result.html is not None
            assert processed_result.success is True
            # Note: cleaned_html and markdown depend on the content

    @pytest.mark.asyncio
    async def test_prefetch_links_structure(self):
        """Test that extracted links carry href/text keys and absolute URLs."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=True)
            result = await crawler.arun(TEST_DOMAIN, config=config)

            assert result.links is not None

            # Same structural contract for both buckets (checked when non-empty)
            for bucket in ("internal", "external"):
                if result.links[bucket]:
                    link = result.links[bucket][0]
                    assert "href" in link
                    assert "text" in link
                    assert link["href"].startswith("http")

    def test_prefetch_config_clone(self):
        """Test that config.clone() preserves and can override prefetch.

        Pure config test — no event loop needed, so this is a plain sync test.
        """
        config = CrawlerRunConfig(prefetch=True)
        assert config.clone().prefetch is True

        # Clone with override
        assert config.clone(prefetch=False).prefetch is False

    def test_prefetch_to_dict(self):
        """Test that to_dict() includes prefetch (sync — no crawler involved)."""
        config_dict = CrawlerRunConfig(prefetch=True).to_dict()

        assert "prefetch" in config_dict
        assert config_dict["prefetch"] is True

    def test_prefetch_default_false(self):
        """Test that prefetch defaults to False (sync — no crawler involved)."""
        assert CrawlerRunConfig().prefetch is False

    @pytest.mark.asyncio
    async def test_prefetch_explicit_false(self):
        """Test explicit prefetch=False behaves like the default (full processing)."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=False)
            result = await crawler.arun(TEST_DOMAIN, config=config)

            # Should have full processing
            assert result.html is not None
            # cleaned_html is only populated in normal (non-prefetch) mode
            assert result.cleaned_html is not None
|
||||
class TestPrefetchPerformance:
    """Performance-related tests for prefetch mode."""

    @pytest.mark.asyncio
    async def test_prefetch_returns_quickly(self):
        """Prefetch should not be slower than full processing (soft check).

        Timings depend on network and page content, so nothing hard is
        asserted — the measurements are printed for debugging only.
        """
        import time

        async with AsyncWebCrawler() as crawler:
            # Prefetch mode.  perf_counter() is monotonic, unlike time.time(),
            # which can jump backwards under clock adjustments.
            start = time.perf_counter()
            prefetch_config = CrawlerRunConfig(prefetch=True)
            await crawler.arun(TEST_DOMAIN, config=prefetch_config)
            prefetch_time = time.perf_counter() - start

            # Full mode
            start = time.perf_counter()
            full_config = CrawlerRunConfig()
            await crawler.arun(TEST_DOMAIN, config=full_config)
            full_time = time.perf_counter() - start

            # Log times for debugging
            print(f"\nPrefetch: {prefetch_time:.3f}s, Full: {full_time:.3f}s")

            # Prefetch should not be significantly slower
            # (may be same or slightly faster depending on content)
            # This is a soft check - mostly for logging
||||
class TestPrefetchWithRawHTML:
    """Prefetch mode driven by the raw: URL scheme instead of a live fetch."""

    @pytest.mark.asyncio
    async def test_prefetch_with_raw_html(self):
        """Prefetch over inline HTML still yields the document plus classified links."""
        sample_html = """
        <html>
        <head><title>Test Page</title></head>
        <body>
            <h1>Hello World</h1>
            <a href="/link1">Link 1</a>
            <a href="/link2">Link 2</a>
            <a href="https://external.com/page">External</a>
        </body>
        </html>
        """

        async with AsyncWebCrawler() as crawler:
            run_config = CrawlerRunConfig(
                prefetch=True,
                base_url="https://example.com"
            )
            outcome = await crawler.arun(f"raw:{sample_html}", config=run_config)

            assert outcome.success is True
            assert outcome.html is not None
            assert outcome.links is not None

            # Two relative links resolve against base_url (internal); the
            # absolute external.com link lands in the external bucket.
            assert len(outcome.links["internal"]) >= 2
            assert len(outcome.links["external"]) >= 1
275
tests/test_prefetch_mode.py
Normal file
275
tests/test_prefetch_mode.py
Normal file
@@ -0,0 +1,275 @@
|
||||
"""Unit tests for the quick_extract_links function used in prefetch mode."""
|
||||
|
||||
import pytest
|
||||
from crawl4ai.utils import quick_extract_links
|
||||
|
||||
|
||||
class TestQuickExtractLinks:
    """Unit tests for the quick_extract_links helper."""

    BASE = "https://example.com"

    def test_basic_internal_links(self):
        """Relative and same-domain absolute links all land in the internal bucket."""
        markup = '''
        <html>
        <body>
            <a href="/page1">Page 1</a>
            <a href="/page2">Page 2</a>
            <a href="https://example.com/page3">Page 3</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 3
        first = links["internal"][0]
        assert first["href"] == "https://example.com/page1"
        assert first["text"] == "Page 1"

    def test_external_links(self):
        """Links on a different domain are classified as external."""
        markup = '''
        <html>
        <body>
            <a href="https://other.com/page">External</a>
            <a href="/internal">Internal</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 1
        assert len(links["external"]) == 1
        assert links["external"][0]["href"] == "https://other.com/page"

    def test_ignores_javascript_and_mailto(self):
        """Non-HTTP schemes (javascript:, mailto:, tel:) are skipped entirely."""
        markup = '''
        <html>
        <body>
            <a href="javascript:void(0)">Click</a>
            <a href="mailto:test@example.com">Email</a>
            <a href="tel:+1234567890">Call</a>
            <a href="/valid">Valid</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 1
        assert links["internal"][0]["href"] == "https://example.com/valid"

    def test_ignores_anchor_only_links(self):
        """Pure fragment links (#section) are dropped; path + fragment survives."""
        markup = '''
        <html>
        <body>
            <a href="#section1">Section 1</a>
            <a href="#section2">Section 2</a>
            <a href="/page#section">Page with anchor</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        # Only the link with a real path is kept
        assert len(links["internal"]) == 1
        assert "/page" in links["internal"][0]["href"]

    def test_deduplication(self):
        """Repeated URLs collapse to a single entry."""
        markup = '''
        <html>
        <body>
            <a href="/page">Link 1</a>
            <a href="/page">Link 2</a>
            <a href="/page">Link 3</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 1

    def test_handles_malformed_html(self):
        """Malformed markup produces empty buckets rather than raising."""
        links = quick_extract_links("not valid html at all <><><", self.BASE)

        assert links["internal"] == []
        assert links["external"] == []

    def test_empty_html(self):
        """An empty document yields the canonical empty result."""
        assert quick_extract_links("", self.BASE) == {"internal": [], "external": []}

    def test_relative_url_resolution(self):
        """Plain, dot, and parent relative paths resolve against the base URL."""
        markup = '''
        <html>
        <body>
            <a href="page1.html">Relative</a>
            <a href="./page2.html">Dot Relative</a>
            <a href="../page3.html">Parent Relative</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, "https://example.com/docs/")

        assert len(links["internal"]) >= 1
        # Every resolved URL stays on the base domain
        for entry in links["internal"]:
            assert entry["href"].startswith("https://example.com")

    def test_text_truncation(self):
        """Link text longer than 200 characters is cut to exactly 200."""
        padding = "A" * 300
        markup = f'''
        <html>
        <body>
            <a href="/page">{padding}</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 1
        assert len(links["internal"][0]["text"]) == 200

    def test_empty_href_ignored(self):
        """Empty or whitespace-only href attributes are discarded."""
        markup = '''
        <html>
        <body>
            <a href="">Empty</a>
            <a href=" ">Whitespace</a>
            <a href="/valid">Valid</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 1
        assert links["internal"][0]["href"] == "https://example.com/valid"

    def test_mixed_internal_external(self):
        """Internal and external links are partitioned correctly in one page."""
        markup = '''
        <html>
        <body>
            <a href="/internal1">Internal 1</a>
            <a href="https://example.com/internal2">Internal 2</a>
            <a href="https://google.com">Google</a>
            <a href="https://github.com/repo">GitHub</a>
            <a href="/internal3">Internal 3</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        assert len(links["internal"]) == 3
        assert len(links["external"]) == 2

    def test_subdomain_handling(self):
        """Subdomain links are classified without being lost."""
        markup = '''
        <html>
        <body>
            <a href="https://docs.example.com/page">Docs subdomain</a>
            <a href="https://api.example.com/v1">API subdomain</a>
            <a href="https://example.com/main">Main domain</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, self.BASE)

        # Every link is accounted for in one bucket or the other
        assert len(links["internal"]) + len(links["external"]) == 3
||||
class TestQuickExtractLinksEdgeCases:
    """Edge-case coverage for quick_extract_links."""

    def test_no_links_in_page(self):
        """A page without anchors yields two empty buckets."""
        markup = '''
        <html>
        <body>
            <h1>No Links Here</h1>
            <p>Just some text content.</p>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, "https://example.com")

        assert links["internal"] == []
        assert links["external"] == []

    def test_links_in_nested_elements(self):
        """Anchors are found regardless of how deeply they are nested."""
        markup = '''
        <html>
        <body>
            <nav>
                <ul>
                    <li><a href="/home">Home</a></li>
                    <li><a href="/about">About</a></li>
                </ul>
            </nav>
            <div class="content">
                <p>Check out <a href="/products">our products</a>.</p>
            </div>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, "https://example.com")

        assert len(links["internal"]) == 3

    def test_link_with_nested_elements(self):
        """Text inside child elements of an anchor is included in its text."""
        markup = '''
        <html>
        <body>
            <a href="/page"><span>Nested</span> <strong>Text</strong></a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, "https://example.com")

        assert len(links["internal"]) == 1
        anchor_text = links["internal"][0]["text"]
        assert "Nested" in anchor_text
        assert "Text" in anchor_text

    def test_protocol_relative_urls(self):
        """Protocol-relative URLs (//host/path) are resolved and kept."""
        markup = '''
        <html>
        <body>
            <a href="//cdn.example.com/asset">CDN Link</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, "https://example.com")

        # Resolved with the base scheme; classified in one bucket or the other
        assert len(links["internal"]) + len(links["external"]) >= 1

    def test_whitespace_in_href(self):
        """Whitespace (including newlines) around href values is tolerated."""
        markup = '''
        <html>
        <body>
            <a href=" /page1 ">Padded</a>
            <a href="
            /page2
            ">Multiline</a>
        </body>
        </html>
        '''
        links = quick_extract_links(markup, "https://example.com")

        # Both should be extracted and normalized
        assert len(links["internal"]) >= 1
232
tests/test_prefetch_regression.py
Normal file
232
tests/test_prefetch_regression.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""Regression tests to ensure prefetch mode doesn't break existing functionality."""
|
||||
|
||||
import pytest
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
TEST_URL = "https://docs.crawl4ai.com"
|
||||
|
||||
|
||||
class TestNoRegressions:
    """Ensure prefetch mode doesn't break existing functionality."""

    @pytest.mark.asyncio
    async def test_default_mode_unchanged(self):
        """Test that default mode (prefetch=False) works exactly as before."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()  # Default config
            result = await crawler.arun(TEST_URL, config=config)

            # All standard fields should be populated
            assert result.html is not None
            assert result.cleaned_html is not None
            assert result.links is not None
            assert result.success is True

    @pytest.mark.asyncio
    async def test_explicit_prefetch_false(self):
        """Test explicit prefetch=False works like default."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(prefetch=False)
            result = await crawler.arun(TEST_URL, config=config)

            assert result.cleaned_html is not None

    def test_config_clone_preserves_prefetch(self):
        """Test that config.clone() preserves prefetch.

        Pure config test — no event loop needed, so this is a plain sync test.
        """
        config = CrawlerRunConfig(prefetch=True)
        assert config.clone().prefetch is True

        # Clone with override
        assert config.clone(prefetch=False).prefetch is False

    def test_config_to_dict_includes_prefetch(self):
        """Test that to_dict() round-trips both prefetch values (sync test)."""
        config_true = CrawlerRunConfig(prefetch=True)
        config_false = CrawlerRunConfig(prefetch=False)

        assert config_true.to_dict()["prefetch"] is True
        assert config_false.to_dict()["prefetch"] is False

    @pytest.mark.asyncio
    async def test_existing_extraction_still_works(self):
        """Test that extraction strategies still work in normal mode."""
        from crawl4ai import JsonCssExtractionStrategy

        schema = {
            "name": "Links",
            "baseSelector": "a",
            "fields": [
                {"name": "href", "selector": "", "type": "attribute", "attribute": "href"},
                {"name": "text", "selector": "", "type": "text"}
            ]
        }

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                extraction_strategy=JsonCssExtractionStrategy(schema=schema)
            )
            result = await crawler.arun(TEST_URL, config=config)

            assert result.extracted_content is not None

    @pytest.mark.asyncio
    async def test_existing_deep_crawl_still_works(self):
        """Test that deep crawl without prefetch still does full processing."""
        from crawl4ai import BFSDeepCrawlStrategy

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=1,
                    max_pages=2
                )
                # No prefetch - should do full processing
            )

            result_container = await crawler.arun(TEST_URL, config=config)

            # Handle async-iterator, sync-iterable, and single-result returns
            if hasattr(result_container, '__aiter__'):
                results = [r async for r in result_container]
            else:
                results = list(result_container) if hasattr(result_container, '__iter__') else [result_container]

            # Each result should have full processing
            for result in results:
                assert result.cleaned_html is not None

            assert len(results) >= 1

    @pytest.mark.asyncio
    async def test_raw_url_scheme_still_works(self):
        """Test that raw: URL scheme works for processing stored HTML."""
        sample_html = """
        <html>
        <head><title>Test Page</title></head>
        <body>
            <h1>Hello World</h1>
            <p>This is a test paragraph.</p>
            <a href="/link1">Link 1</a>
        </body>
        </html>
        """

        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig()
            result = await crawler.arun(f"raw:{sample_html}", config=config)

            assert result.success is True
            assert result.html is not None
            assert "Hello World" in result.html
            assert result.cleaned_html is not None

    @pytest.mark.asyncio
    async def test_screenshot_still_works(self):
        """Test that screenshot option still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(screenshot=True)
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True
            # Screenshot data should be present (field name varies by version)
            assert result.screenshot is not None or result.screenshot_data is not None

    @pytest.mark.asyncio
    async def test_js_execution_still_works(self):
        """Test that JavaScript execution still works in normal mode."""
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                js_code="document.querySelector('h1')?.textContent"
            )
            result = await crawler.arun(TEST_URL, config=config)

            assert result.success is True
            assert result.html is not None
|
||||
class TestPrefetchDoesNotAffectOtherModes:
    """Prefetch must not interfere with unrelated configuration options."""

    @pytest.mark.asyncio
    async def test_prefetch_with_other_options_ignored(self):
        """Options that only matter during processing are ignored under prefetch."""
        async with AsyncWebCrawler() as crawler:
            cfg = CrawlerRunConfig(
                prefetch=True,
                # None of these should take effect while prefetching
                screenshot=True,
                pdf=True,
                only_text=True,
                word_count_threshold=100
            )
            res = await crawler.arun(TEST_URL, config=cfg)

            # Raw HTML and links still come back...
            assert res.html is not None
            assert res.links is not None

            # ...but nothing downstream of processing is populated
            assert res.cleaned_html is None
            assert res.extracted_content is None

    @pytest.mark.asyncio
    async def test_stream_mode_still_works(self):
        """Streaming configuration is unaffected by the prefetch feature."""
        async with AsyncWebCrawler() as crawler:
            res = await crawler.arun(TEST_URL, config=CrawlerRunConfig(stream=True))

            assert res.success is True
            assert res.html is not None

    @pytest.mark.asyncio
    async def test_cache_mode_still_works(self):
        """Cache bypass followed by cache-enabled crawling both succeed."""
        from crawl4ai import CacheMode

        async with AsyncWebCrawler() as crawler:
            # First request skips the cache entirely
            first = await crawler.arun(
                TEST_URL, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
            assert first.success is True

            # Second request may be served from cache and must still succeed
            second = await crawler.arun(
                TEST_URL, config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
            )
            assert second.success is True
||||
class TestBackwardsCompatibility:
    """Test backwards compatibility with existing code patterns."""

    @pytest.mark.asyncio
    async def test_config_without_prefetch_works(self):
        """Test that configs created without the prefetch parameter work."""
        # Simulating old code that doesn't know about prefetch
        config = CrawlerRunConfig(
            word_count_threshold=50,
            css_selector="body"
        )

        # Should default to prefetch=False
        assert config.prefetch is False

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(TEST_URL, config=config)
            assert result.success is True
            assert result.cleaned_html is not None

    def test_from_kwargs_without_prefetch(self):
        """Test CrawlerRunConfig.from_kwargs works without prefetch.

        Pure config test — no event loop needed, so this is a plain sync test.
        """
        config = CrawlerRunConfig.from_kwargs({
            "word_count_threshold": 50,
            "verbose": False
        })

        assert config.prefetch is False
Reference in New Issue
Block a user