diff --git a/CHANGELOG.md b/CHANGELOG.md index 9788caf2..ce63516f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added +- **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag + - Maintains HTTPS scheme for internal links even when servers redirect to HTTP + - Prevents security downgrades during deep crawling + - Useful for security-conscious crawling and sites supporting both protocols + - Fully backward compatible with opt-in flag (default: `False`) + - Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP + ## [0.7.3] - 2025-08-09 ### Added diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 0c843b2b..0cf0b9ab 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1124,6 +1124,7 @@ class CrawlerRunConfig(): exclude_domains: list = None, exclude_internal_links: bool = False, score_links: bool = False, + preserve_https_for_internal_links: bool = False, # Debugging and Logging Parameters verbose: bool = True, log_console: bool = False, @@ -1247,6 +1248,7 @@ class CrawlerRunConfig(): self.exclude_domains = exclude_domains or [] self.exclude_internal_links = exclude_internal_links self.score_links = score_links + self.preserve_https_for_internal_links = preserve_https_for_internal_links # Debugging and Logging Parameters self.verbose = verbose @@ -1520,6 +1522,7 @@ class CrawlerRunConfig(): exclude_domains=kwargs.get("exclude_domains", []), exclude_internal_links=kwargs.get("exclude_internal_links", False), score_links=kwargs.get("score_links", False), + preserve_https_for_internal_links=kwargs.get("preserve_https_for_internal_links", False), # Debugging and Logging Parameters 
verbose=kwargs.get("verbose", True), log_console=kwargs.get("log_console", False), @@ -1626,6 +1629,7 @@ class CrawlerRunConfig(): "exclude_domains": self.exclude_domains, "exclude_internal_links": self.exclude_internal_links, "score_links": self.score_links, + "preserve_https_for_internal_links": self.preserve_https_for_internal_links, "verbose": self.verbose, "log_console": self.log_console, "capture_network_requests": self.capture_network_requests, diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ebd2859d..f12fc488 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -354,6 +354,7 @@ class AsyncWebCrawler: ############################################################### # Process the HTML content, Call CrawlerStrategy.process_html # ############################################################### + from urllib.parse import urlparse crawl_result: CrawlResult = await self.aprocess_html( url=url, html=html, @@ -364,6 +365,7 @@ class AsyncWebCrawler: verbose=config.verbose, is_raw_html=True if url.startswith("raw:") else False, redirected_url=async_response.redirected_url, + original_scheme=urlparse(url).scheme, **kwargs, ) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 9ef0e616..d9095e49 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -258,7 +258,11 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy): continue try: - normalized_href = normalize_url(href, url) + normalized_href = normalize_url( + href, url, + preserve_https=kwargs.get('preserve_https_for_internal_links', False), + original_scheme=kwargs.get('original_scheme') + ) link_data = { "href": normalized_href, "text": link.text_content().strip(), diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 09e6e4b7..a871bc91 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -2146,7 +2146,9 @@ def normalize_url( drop_query_tracking=True, 
sort_query=True, keep_fragment=False, - extra_drop_params=None + extra_drop_params=None, + preserve_https=False, + original_scheme=None ): """ Extended URL normalizer @@ -2176,6 +2178,17 @@ def normalize_url( # Resolve relative paths first full_url = urljoin(base_url, href.strip()) + + # Preserve HTTPS if requested and original scheme was HTTPS + if preserve_https and original_scheme == 'https': + parsed_full = urlparse(full_url) + parsed_base = urlparse(base_url) + # Only preserve HTTPS for same-domain links (not protocol-relative URLs) + # Protocol-relative URLs (//example.com) should follow the base URL's scheme + if (parsed_full.scheme == 'http' and + parsed_full.netloc == parsed_base.netloc and + not href.strip().startswith('//')): + full_url = full_url.replace('http://', 'https://', 1) # Parse once, edit parts, then rebuild parsed = urlparse(full_url) @@ -2227,7 +2240,7 @@ def normalize_url( return normalized -def normalize_url_for_deep_crawl(href, base_url): +def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None): """Normalize URLs to ensure consistent format""" from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode @@ -2238,6 +2251,17 @@ def normalize_url_for_deep_crawl(href, base_url): # Use urljoin to handle relative URLs full_url = urljoin(base_url, href.strip()) + # Preserve HTTPS if requested and original scheme was HTTPS + if preserve_https and original_scheme == 'https': + parsed_full = urlparse(full_url) + parsed_base = urlparse(base_url) + # Only preserve HTTPS for same-domain links (not protocol-relative URLs) + # Protocol-relative URLs (//example.com) should follow the base URL's scheme + if (parsed_full.scheme == 'http' and + parsed_full.netloc == parsed_base.netloc and + not href.strip().startswith('//')): + full_url = full_url.replace('http://', 'https://', 1) + # Parse the URL for normalization parsed = urlparse(full_url) @@ -2275,7 +2299,7 @@ def normalize_url_for_deep_crawl(href, 
base_url): return normalized @lru_cache(maxsize=10000) -def efficient_normalize_url_for_deep_crawl(href, base_url): +def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None): """Efficient URL normalization with proper parsing""" from urllib.parse import urljoin @@ -2285,6 +2309,17 @@ def efficient_normalize_url_for_deep_crawl(href, base_url): # Resolve relative URLs full_url = urljoin(base_url, href.strip()) + # Preserve HTTPS if requested and original scheme was HTTPS + if preserve_https and original_scheme == 'https': + parsed_full = urlparse(full_url) + parsed_base = urlparse(base_url) + # Only preserve HTTPS for same-domain links (not protocol-relative URLs) + # Protocol-relative URLs (//example.com) should follow the base URL's scheme + if (parsed_full.scheme == 'http' and + parsed_full.netloc == parsed_base.netloc and + not href.strip().startswith('//')): + full_url = full_url.replace('http://', 'https://', 1) + # Use proper URL parsing parsed = urlparse(full_url) diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index ba526fb7..47f719c8 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -155,6 +155,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i | **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. | | **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). | | **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). | +| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. | Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains). 
diff --git a/docs/md_v2/core/deep-crawling.md b/docs/md_v2/core/deep-crawling.md index 00834787..93760f23 100644 --- a/docs/md_v2/core/deep-crawling.md +++ b/docs/md_v2/core/deep-crawling.md @@ -472,6 +472,17 @@ Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pag 5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling. +6.**Preserve HTTPS for security.** If crawling HTTPS sites that redirect to HTTP, use `preserve_https_for_internal_links=True` to maintain secure connections: + +```python +config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2), + preserve_https_for_internal_links=True # Keep HTTPS even if server redirects to HTTP +) +``` + +This is especially useful for security-conscious crawling or when dealing with sites that support both protocols. + --- ## 10. Summary & Next Steps diff --git a/tests/test_preserve_https_for_internal_links.py b/tests/test_preserve_https_for_internal_links.py new file mode 100644 index 00000000..8988f1c9 --- /dev/null +++ b/tests/test_preserve_https_for_internal_links.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +""" +Final test and demo for HTTPS preservation feature (Issue #1410) + +This demonstrates how the preserve_https_for_internal_links flag +prevents HTTPS downgrade when servers redirect to HTTP. 
+""" + +import sys +import os +from urllib.parse import urljoin, urlparse + +def demonstrate_issue(): + """Show the problem: HTTPS -> HTTP redirect causes HTTP links""" + + print("=" * 60) + print("DEMONSTRATING THE ISSUE") + print("=" * 60) + + # Simulate what happens during crawling + original_url = "https://quotes.toscrape.com/tag/deep-thoughts" + redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/" # Server redirects to HTTP + + # Extract a relative link + relative_link = "/author/Albert-Einstein" + + # Standard URL joining uses the redirected (HTTP) base + resolved_url = urljoin(redirected_url, relative_link) + + print(f"Original URL: {original_url}") + print(f"Redirected to: {redirected_url}") + print(f"Relative link: {relative_link}") + print(f"Resolved link: {resolved_url}") + print(f"\nโŒ Problem: Link is now HTTP instead of HTTPS!") + + return resolved_url + +def demonstrate_solution(): + """Show the solution: preserve HTTPS for internal links""" + + print("\n" + "=" * 60) + print("DEMONSTRATING THE SOLUTION") + print("=" * 60) + + # Our normalize_url with HTTPS preservation + def normalize_url_with_preservation(href, base_url, preserve_https=False, original_scheme=None): + """Normalize URL with optional HTTPS preservation""" + + # Standard resolution + full_url = urljoin(base_url, href.strip()) + + # Preserve HTTPS if requested + if preserve_https and original_scheme == 'https': + parsed_full = urlparse(full_url) + parsed_base = urlparse(base_url) + + # Only for same-domain links + if parsed_full.scheme == 'http' and parsed_full.netloc == parsed_base.netloc: + full_url = full_url.replace('http://', 'https://', 1) + print(f" โ†’ Preserved HTTPS for {parsed_full.netloc}") + + return full_url + + # Same scenario as before + original_url = "https://quotes.toscrape.com/tag/deep-thoughts" + redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/" + relative_link = "/author/Albert-Einstein" + + # Without preservation (current behavior) + 
resolved_without = normalize_url_with_preservation( + relative_link, redirected_url, + preserve_https=False, original_scheme='https' + ) + + print(f"\nWithout preservation:") + print(f" Result: {resolved_without}") + + # With preservation (new feature) + resolved_with = normalize_url_with_preservation( + relative_link, redirected_url, + preserve_https=True, original_scheme='https' + ) + + print(f"\nWith preservation (preserve_https_for_internal_links=True):") + print(f" Result: {resolved_with}") + print(f"\nโœ… Solution: Internal link stays HTTPS!") + + return resolved_with + +def test_edge_cases(): + """Test important edge cases""" + + print("\n" + "=" * 60) + print("EDGE CASES") + print("=" * 60) + + from urllib.parse import urljoin, urlparse + + def preserve_https(href, base_url, original_scheme): + """Helper to test preservation logic""" + full_url = urljoin(base_url, href) + + if original_scheme == 'https': + parsed_full = urlparse(full_url) + parsed_base = urlparse(base_url) + # Fixed: check for protocol-relative URLs + if (parsed_full.scheme == 'http' and + parsed_full.netloc == parsed_base.netloc and + not href.strip().startswith('//')): + full_url = full_url.replace('http://', 'https://', 1) + + return full_url + + test_cases = [ + # (description, href, base_url, original_scheme, should_be_https) + ("External link", "http://other.com/page", "http://example.com", "https", False), + ("Already HTTPS", "/page", "https://example.com", "https", True), + ("No original HTTPS", "/page", "http://example.com", "http", False), + ("Subdomain", "/page", "http://sub.example.com", "https", True), + ("Protocol-relative", "//example.com/page", "http://example.com", "https", False), + ] + + for desc, href, base_url, orig_scheme, should_be_https in test_cases: + result = preserve_https(href, base_url, orig_scheme) + is_https = result.startswith('https://') + status = "โœ…" if is_https == should_be_https else "โŒ" + + print(f"\n{status} {desc}:") + print(f" Input: {href} + 
{base_url}") + print(f" Result: {result}") + print(f" Expected HTTPS: {should_be_https}, Got: {is_https}") + +def usage_example(): + """Show how to use the feature in crawl4ai""" + + print("\n" + "=" * 60) + print("USAGE IN CRAWL4AI") + print("=" * 60) + + print(""" +To enable HTTPS preservation in your crawl4ai code: + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + preserve_https_for_internal_links=True # Enable HTTPS preservation + ) + + result = await crawler.arun( + url="https://example.com", + config=config + ) + + # All internal links will maintain HTTPS even if + # the server redirects to HTTP +``` + +This is especially useful for: +- Sites that redirect HTTPS to HTTP but still support HTTPS +- Security-conscious crawling where you want to stay on HTTPS +- Avoiding mixed content issues in downstream processing +""") + +if __name__ == "__main__": + # Run all demonstrations + demonstrate_issue() + demonstrate_solution() + test_edge_cases() + usage_example() + + print("\n" + "=" * 60) + print("โœ… All tests complete!") + print("=" * 60) \ No newline at end of file