"""Unit tests for the quick_extract_links function used in prefetch mode."""

import pytest

from crawl4ai.utils import quick_extract_links

# NOTE(review): the anchor markup in the HTML fixtures below was restored so
# that it matches what each test's docstring and assertions require (hrefs,
# link text, link counts). Confirm the exact hrefs against the intended
# fixtures if they matter beyond what the assertions check.


class TestQuickExtractLinks:
    """Unit tests for the quick_extract_links function.

    Every test calls ``quick_extract_links(html, base_url)`` and inspects the
    returned mapping, which the assertions treat as
    ``{"internal": [...], "external": [...]}`` where each entry is a dict
    with ``"href"`` and ``"text"`` keys.
    """

    def test_basic_internal_links(self):
        """Test extraction of internal links."""
        html = '''
        <html>
        <body>
            <a href="/page1">Page 1</a>
            <a href="/page2">Page 2</a>
            <a href="https://example.com/page3">Page 3</a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        assert len(result["internal"]) == 3
        assert result["internal"][0]["href"] == "https://example.com/page1"
        assert result["internal"][0]["text"] == "Page 1"

    def test_external_links(self):
        """Test extraction and classification of external links."""
        html = '''
        <html>
        <body>
            <a href="https://other.com/page">External</a>
            <a href="/internal">Internal</a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        assert len(result["internal"]) == 1
        assert len(result["external"]) == 1
        assert result["external"][0]["href"] == "https://other.com/page"

    def test_ignores_javascript_and_mailto(self):
        """Test that javascript: and mailto: links are ignored."""
        html = '''
        <html>
        <body>
            <a href="javascript:void(0)">Click</a>
            <a href="mailto:test@example.com">Email</a>
            <a href="tel:+1234567890">Call</a>
            <a href="/valid">Valid</a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        assert len(result["internal"]) == 1
        assert result["internal"][0]["href"] == "https://example.com/valid"

    def test_ignores_anchor_only_links(self):
        """Test that anchor-only links (#section) are ignored."""
        html = '''
        <html>
        <body>
            <a href="#section1">Section 1</a>
            <a href="#section2">Section 2</a>
            <a href="/page#section">Page with anchor</a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        # Only the page link should be included, anchor-only links are skipped
        assert len(result["internal"]) == 1
        assert "/page" in result["internal"][0]["href"]

    def test_deduplication(self):
        """Test that duplicate URLs are deduplicated."""
        html = '''
        <html>
        <body>
            <a href="/page">Link 1</a>
            <a href="/page">Link 2</a>
            <a href="/page">Link 3</a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        assert len(result["internal"]) == 1

    def test_handles_malformed_html(self):
        """Test graceful handling of malformed HTML."""
        html = "not valid html at all <><><"
        result = quick_extract_links(html, "https://example.com")

        # Should not raise, should return empty
        assert result["internal"] == []
        assert result["external"] == []

    def test_empty_html(self):
        """Test handling of empty HTML."""
        result = quick_extract_links("", "https://example.com")

        assert result == {"internal": [], "external": []}

    def test_relative_url_resolution(self):
        """Test that relative URLs are resolved correctly."""
        html = '''
        <html>
        <body>
            <a href="page1">Relative</a>
            <a href="./page2">Dot Relative</a>
            <a href="../page3">Parent Relative</a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com/docs/")

        assert len(result["internal"]) >= 1
        # All should be internal and properly resolved
        for link in result["internal"]:
            assert link["href"].startswith("https://example.com")

    def test_text_truncation(self):
        """Test that long link text is truncated to 200 chars."""
        long_text = "A" * 300
        html = f'''
        <html>
        <body>
            <a href="/page">{long_text}</a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        assert len(result["internal"]) == 1
        assert len(result["internal"][0]["text"]) == 200

    def test_empty_href_ignored(self):
        """Test that empty href attributes are ignored."""
        html = '''
        <html>
        <body>
            <a href="">Empty</a>
            <a href="   ">Whitespace</a>
            <a href="/valid">Valid</a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        assert len(result["internal"]) == 1
        assert result["internal"][0]["href"] == "https://example.com/valid"

    def test_mixed_internal_external(self):
        """Test correct classification of mixed internal and external links."""
        html = '''
        <html>
        <body>
            <a href="/internal1">Internal 1</a>
            <a href="/internal2">Internal 2</a>
            <a href="https://google.com">Google</a>
            <a href="https://github.com/repo">GitHub</a>
            <a href="/internal3">Internal 3</a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        assert len(result["internal"]) == 3
        assert len(result["external"]) == 2

    def test_subdomain_handling(self):
        """Test that subdomains are handled correctly."""
        html = '''
        <html>
        <body>
            <a href="https://docs.example.com/page">Docs subdomain</a>
            <a href="https://api.example.com/v1">API subdomain</a>
            <a href="https://example.com/main">Main domain</a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        # All should be internal (same base domain); the assertion itself only
        # pins the total so it holds whichever bucket subdomains land in.
        total_links = len(result["internal"]) + len(result["external"])
        assert total_links == 3


class TestQuickExtractLinksEdgeCases:
    """Edge case tests for quick_extract_links."""

    def test_no_links_in_page(self):
        """Test page with no links."""
        html = '''
        <html>
        <body>
            <h1>No Links Here</h1>
            <p>Just some text content.</p>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        assert result["internal"] == []
        assert result["external"] == []

    def test_links_in_nested_elements(self):
        """Test links nested in various elements."""
        html = '''
        <html>
        <body>
            <nav>
                <ul>
                    <li><a href="/nav1">Nav 1</a></li>
                    <li><a href="/nav2">Nav 2</a></li>
                </ul>
            </nav>
            <div class="content">
                <p>Check out <a href="/products">our products</a>.</p>
            </div>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        assert len(result["internal"]) == 3

    def test_link_with_nested_elements(self):
        """Test links containing nested elements."""
        html = '''
        <html>
        <body>
            <a href="/page"><span>Nested</span> <strong>Text</strong></a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        assert len(result["internal"]) == 1
        assert "Nested" in result["internal"][0]["text"]
        assert "Text" in result["internal"][0]["text"]

    def test_protocol_relative_urls(self):
        """Test handling of protocol-relative URLs (//example.com)."""
        html = '''
        <html>
        <body>
            <a href="//cdn.example.com/asset">CDN Link</a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        # Should be resolved with https:
        total = len(result["internal"]) + len(result["external"])
        assert total >= 1

    def test_whitespace_in_href(self):
        """Test handling of whitespace around href values."""
        html = '''
        <html>
        <body>
            <a href="  /page1  ">Padded</a>
            <a href="
                /page2
            ">Multiline</a>
        </body>
        </html>
        '''
        result = quick_extract_links(html, "https://example.com")

        # Both should be extracted and normalized
        assert len(result["internal"]) >= 1