Compare commits

..

10 Commits

Author SHA1 Message Date
ntohidi
bdacf61ca9 feat: update documentation for preserve_https_for_internal_links. ref #1410 2025-08-28 17:48:12 +08:00
ntohidi
f566c5a376 feat: add preserve_https_for_internal_links flag to maintain HTTPS during crawling. Ref #1410
Added a new `preserve_https_for_internal_links` configuration flag that preserves the original HTTPS scheme for same-domain links even when the server redirects to HTTP.
2025-08-28 17:38:40 +08:00
Nasrin
ef174a4c7a Merge pull request #1104 from emmanuel-ferdman/main
fix(docker-api): migrate to modern datetime library API
2025-08-20 10:57:39 +08:00
Nasrin
f4206d6ba1 Merge pull request #1369 from NezarAli/main
Fix examples in README.md
2025-08-18 14:22:54 +08:00
Nasrin
dad7c51481 Merge pull request #1398 from unclecode/fix/update-url-seeding-docs
Update URL seeding examples to use proper async context managers
2025-08-18 13:00:26 +08:00
ntohidi
f4a432829e fix(crawler): Removed the incorrect reference in browser_config variable #1310 2025-08-18 10:59:14 +08:00
Soham Kukreti
ecbe5ffb84 docs: Update URL seeding examples to use proper async context managers
- Wrap all AsyncUrlSeeder usage with async context managers
- Update URL seeding adventure example to use "sitemap+cc" source, focus on course posts, and add stream=True parameter to fix runtime error
2025-08-13 18:16:46 +05:30
Nezar Ali
7a8190ecb6 Fix examples in README.md 2025-08-06 11:58:29 +03:00
Emmanuel Ferdman
8e3c411a3e Merge branch 'main' into main 2025-07-29 14:05:35 +03:00
Emmanuel Ferdman
1e1c887a2f fix(docker-api): migrate to modern datetime library API
Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
2025-05-13 00:04:58 -07:00
12 changed files with 284 additions and 40 deletions

View File

@@ -5,6 +5,16 @@ All notable changes to Crawl4AI will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
- Maintains HTTPS scheme for internal links even when servers redirect to HTTP
- Prevents security downgrades during deep crawling
- Useful for security-conscious crawling and sites supporting both protocols
- Fully backward compatible with opt-in flag (default: `False`)
- Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP
## [0.7.3] - 2025-08-09 ## [0.7.3] - 2025-08-09
### Added ### Added

View File

@@ -373,7 +373,7 @@ async def main():
async with AsyncWebCrawler(config=browser_config) as crawler: async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun( result = await crawler.arun(
url="https://docs.micronaut.io/4.7.6/guide/", url="https://docs.micronaut.io/4.9.9/guide/",
config=run_config config=run_config
) )
print(len(result.markdown.raw_markdown)) print(len(result.markdown.raw_markdown))
@@ -425,7 +425,7 @@ async def main():
"type": "attribute", "type": "attribute",
"attribute": "src" "attribute": "src"
} }
} ]
} }
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

View File

@@ -1121,6 +1121,7 @@ class CrawlerRunConfig():
exclude_domains: list = None, exclude_domains: list = None,
exclude_internal_links: bool = False, exclude_internal_links: bool = False,
score_links: bool = False, score_links: bool = False,
preserve_https_for_internal_links: bool = False,
# Debugging and Logging Parameters # Debugging and Logging Parameters
verbose: bool = True, verbose: bool = True,
log_console: bool = False, log_console: bool = False,
@@ -1244,6 +1245,7 @@ class CrawlerRunConfig():
self.exclude_domains = exclude_domains or [] self.exclude_domains = exclude_domains or []
self.exclude_internal_links = exclude_internal_links self.exclude_internal_links = exclude_internal_links
self.score_links = score_links self.score_links = score_links
self.preserve_https_for_internal_links = preserve_https_for_internal_links
# Debugging and Logging Parameters # Debugging and Logging Parameters
self.verbose = verbose self.verbose = verbose
@@ -1517,6 +1519,7 @@ class CrawlerRunConfig():
exclude_domains=kwargs.get("exclude_domains", []), exclude_domains=kwargs.get("exclude_domains", []),
exclude_internal_links=kwargs.get("exclude_internal_links", False), exclude_internal_links=kwargs.get("exclude_internal_links", False),
score_links=kwargs.get("score_links", False), score_links=kwargs.get("score_links", False),
preserve_https_for_internal_links=kwargs.get("preserve_https_for_internal_links", False),
# Debugging and Logging Parameters # Debugging and Logging Parameters
verbose=kwargs.get("verbose", True), verbose=kwargs.get("verbose", True),
log_console=kwargs.get("log_console", False), log_console=kwargs.get("log_console", False),
@@ -1623,6 +1626,7 @@ class CrawlerRunConfig():
"exclude_domains": self.exclude_domains, "exclude_domains": self.exclude_domains,
"exclude_internal_links": self.exclude_internal_links, "exclude_internal_links": self.exclude_internal_links,
"score_links": self.score_links, "score_links": self.score_links,
"preserve_https_for_internal_links": self.preserve_https_for_internal_links,
"verbose": self.verbose, "verbose": self.verbose,
"log_console": self.log_console, "log_console": self.log_console,
"capture_network_requests": self.capture_network_requests, "capture_network_requests": self.capture_network_requests,

View File

@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
except Error: except Error:
visibility_info = await self.check_visibility(page) visibility_info = await self.check_visibility(page)
if self.browser_config.config.verbose: if self.browser_config.verbose:
self.logger.debug( self.logger.debug(
message="Body visibility info: {info}", message="Body visibility info: {info}",
tag="DEBUG", tag="DEBUG",

View File

@@ -354,6 +354,7 @@ class AsyncWebCrawler:
############################################################### ###############################################################
# Process the HTML content, Call CrawlerStrategy.process_html # # Process the HTML content, Call CrawlerStrategy.process_html #
############################################################### ###############################################################
from urllib.parse import urlparse
crawl_result: CrawlResult = await self.aprocess_html( crawl_result: CrawlResult = await self.aprocess_html(
url=url, url=url,
html=html, html=html,
@@ -364,6 +365,7 @@ class AsyncWebCrawler:
verbose=config.verbose, verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False, is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url, redirected_url=async_response.redirected_url,
original_scheme=urlparse(url).scheme,
**kwargs, **kwargs,
) )

View File

@@ -258,7 +258,11 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
continue continue
try: try:
normalized_href = normalize_url(href, url) normalized_href = normalize_url(
href, url,
preserve_https=kwargs.get('preserve_https_for_internal_links', False),
original_scheme=kwargs.get('original_scheme')
)
link_data = { link_data = {
"href": normalized_href, "href": normalized_href,
"text": link.text_content().strip(), "text": link.text_content().strip(),

View File

@@ -2146,7 +2146,9 @@ def normalize_url(
drop_query_tracking=True, drop_query_tracking=True,
sort_query=True, sort_query=True,
keep_fragment=False, keep_fragment=False,
extra_drop_params=None extra_drop_params=None,
preserve_https=False,
original_scheme=None
): ):
""" """
Extended URL normalizer Extended URL normalizer
@@ -2176,6 +2178,17 @@ def normalize_url(
# Resolve relative paths first # Resolve relative paths first
full_url = urljoin(base_url, href.strip()) full_url = urljoin(base_url, href.strip())
# Preserve HTTPS if requested and original scheme was HTTPS
if preserve_https and original_scheme == 'https':
parsed_full = urlparse(full_url)
parsed_base = urlparse(base_url)
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
if (parsed_full.scheme == 'http' and
parsed_full.netloc == parsed_base.netloc and
not href.strip().startswith('//')):
full_url = full_url.replace('http://', 'https://', 1)
# Parse once, edit parts, then rebuild # Parse once, edit parts, then rebuild
parsed = urlparse(full_url) parsed = urlparse(full_url)
@@ -2225,7 +2238,7 @@ def normalize_url(
return normalized return normalized
def normalize_url_for_deep_crawl(href, base_url): def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
"""Normalize URLs to ensure consistent format""" """Normalize URLs to ensure consistent format"""
from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
@@ -2236,6 +2249,17 @@ def normalize_url_for_deep_crawl(href, base_url):
# Use urljoin to handle relative URLs # Use urljoin to handle relative URLs
full_url = urljoin(base_url, href.strip()) full_url = urljoin(base_url, href.strip())
# Preserve HTTPS if requested and original scheme was HTTPS
if preserve_https and original_scheme == 'https':
parsed_full = urlparse(full_url)
parsed_base = urlparse(base_url)
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
if (parsed_full.scheme == 'http' and
parsed_full.netloc == parsed_base.netloc and
not href.strip().startswith('//')):
full_url = full_url.replace('http://', 'https://', 1)
# Parse the URL for normalization # Parse the URL for normalization
parsed = urlparse(full_url) parsed = urlparse(full_url)
@@ -2273,7 +2297,7 @@ def normalize_url_for_deep_crawl(href, base_url):
return normalized return normalized
@lru_cache(maxsize=10000) @lru_cache(maxsize=10000)
def efficient_normalize_url_for_deep_crawl(href, base_url): def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
"""Efficient URL normalization with proper parsing""" """Efficient URL normalization with proper parsing"""
from urllib.parse import urljoin from urllib.parse import urljoin
@@ -2283,6 +2307,17 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
# Resolve relative URLs # Resolve relative URLs
full_url = urljoin(base_url, href.strip()) full_url = urljoin(base_url, href.strip())
# Preserve HTTPS if requested and original scheme was HTTPS
if preserve_https and original_scheme == 'https':
parsed_full = urlparse(full_url)
parsed_base = urlparse(base_url)
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
if (parsed_full.scheme == 'http' and
parsed_full.netloc == parsed_base.netloc and
not href.strip().startswith('//')):
full_url = full_url.replace('http://', 'https://', 1)
# Use proper URL parsing # Use proper URL parsing
parsed = urlparse(full_url) parsed = urlparse(full_url)

View File

@@ -4,7 +4,7 @@ import asyncio
from typing import List, Tuple, Dict from typing import List, Tuple, Dict
from functools import partial from functools import partial
from uuid import uuid4 from uuid import uuid4
from datetime import datetime from datetime import datetime, timezone
from base64 import b64encode from base64 import b64encode
import logging import logging
@@ -576,7 +576,7 @@ async def handle_crawl_job(
task_id = f"crawl_{uuid4().hex[:8]}" task_id = f"crawl_{uuid4().hex[:8]}"
await redis.hset(f"task:{task_id}", mapping={ await redis.hset(f"task:{task_id}", mapping={
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent "status": TaskStatus.PROCESSING, # <-- keep enum values consistent
"created_at": datetime.utcnow().isoformat(), "created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
"url": json.dumps(urls), # store list as JSON string "url": json.dumps(urls), # store list as JSON string
"result": "", "result": "",
"error": "", "error": "",

View File

@@ -155,6 +155,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. | | **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). | | **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). | | **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains). Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).

View File

@@ -472,6 +472,17 @@ Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pag
5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling. 5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
6.**Preserve HTTPS for security.** If crawling HTTPS sites that redirect to HTTP, use `preserve_https_for_internal_links=True` to maintain secure connections:
```python
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2),
preserve_https_for_internal_links=True # Keep HTTPS even if server redirects to HTTP
)
```
This is especially useful for security-conscious crawling or when dealing with sites that support both protocols.
--- ---
## 10. Summary & Next Steps ## 10. Summary & Next Steps

View File

@@ -102,16 +102,16 @@ async def smart_blog_crawler():
# Step 2: Configure discovery - let's find all blog posts # Step 2: Configure discovery - let's find all blog posts
config = SeedingConfig( config = SeedingConfig(
source="sitemap", # Use the website's sitemap source="sitemap+cc", # Use the website's sitemap+cc
pattern="*/blog/*.html", # Only blog posts pattern="*/courses/*", # Only courses related posts
extract_head=True, # Get page metadata extract_head=True, # Get page metadata
max_urls=100 # Limit for this example max_urls=100 # Limit for this example
) )
# Step 3: Discover URLs from the Python blog # Step 3: Discover URLs from the Python blog
print("🔍 Discovering blog posts...") print("🔍 Discovering course posts...")
urls = await seeder.urls("realpython.com", config) urls = await seeder.urls("realpython.com", config)
print(f"✅ Found {len(urls)} blog posts") print(f"✅ Found {len(urls)} course posts")
# Step 4: Filter for Python tutorials (using metadata!) # Step 4: Filter for Python tutorials (using metadata!)
tutorials = [ tutorials = [
@@ -134,7 +134,8 @@ async def smart_blog_crawler():
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig( config = CrawlerRunConfig(
only_text=True, only_text=True,
word_count_threshold=300 # Only substantial articles word_count_threshold=300, # Only substantial articles
stream=True
) )
# Extract URLs and crawl them # Extract URLs and crawl them
@@ -155,7 +156,7 @@ asyncio.run(smart_blog_crawler())
**What just happened?** **What just happened?**
1. We discovered all blog URLs from the sitemap 1. We discovered all blog URLs from the sitemap+cc
2. We filtered using metadata (no crawling needed!) 2. We filtered using metadata (no crawling needed!)
3. We crawled only the relevant tutorials 3. We crawled only the relevant tutorials
4. We saved tons of time and bandwidth 4. We saved tons of time and bandwidth
@@ -282,8 +283,8 @@ config = SeedingConfig(
live_check=True, # Verify each URL is accessible live_check=True, # Verify each URL is accessible
concurrency=20 # Check 20 URLs in parallel concurrency=20 # Check 20 URLs in parallel
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config) urls = await seeder.urls("example.com", config)
# Now you can filter by status # Now you can filter by status
live_urls = [u for u in urls if u["status"] == "valid"] live_urls = [u for u in urls if u["status"] == "valid"]
@@ -311,8 +312,8 @@ This is where URL seeding gets really powerful. Instead of crawling entire pages
config = SeedingConfig( config = SeedingConfig(
extract_head=True # Extract metadata from <head> section extract_head=True # Extract metadata from <head> section
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config) urls = await seeder.urls("example.com", config)
# Now each URL has rich metadata # Now each URL has rich metadata
for url in urls[:3]: for url in urls[:3]:
@@ -387,8 +388,8 @@ config = SeedingConfig(
scoring_method="bm25", scoring_method="bm25",
score_threshold=0.3 score_threshold=0.3
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config) urls = await seeder.urls("example.com", config)
# URLs are scored based on: # URLs are scored based on:
# 1. Domain parts matching (e.g., 'python' in python.example.com) # 1. Domain parts matching (e.g., 'python' in python.example.com)
@@ -429,8 +430,8 @@ config = SeedingConfig(
extract_head=True, extract_head=True,
live_check=True live_check=True
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("blog.example.com", config) urls = await seeder.urls("blog.example.com", config)
# Analyze the results # Analyze the results
for url in urls[:5]: for url in urls[:5]:
@@ -488,8 +489,8 @@ config = SeedingConfig(
scoring_method="bm25", # Use BM25 algorithm scoring_method="bm25", # Use BM25 algorithm
score_threshold=0.3 # Minimum relevance score score_threshold=0.3 # Minimum relevance score
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("realpython.com", config) urls = await seeder.urls("realpython.com", config)
# Results are automatically sorted by relevance! # Results are automatically sorted by relevance!
for url in urls[:5]: for url in urls[:5]:
@@ -511,8 +512,8 @@ config = SeedingConfig(
score_threshold=0.5, score_threshold=0.5,
max_urls=20 max_urls=20
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("docs.example.com", config) urls = await seeder.urls("docs.example.com", config)
# The highest scoring URLs will be API docs! # The highest scoring URLs will be API docs!
``` ```
@@ -529,8 +530,8 @@ config = SeedingConfig(
score_threshold=0.4, score_threshold=0.4,
pattern="*/product/*" # Combine with pattern matching pattern="*/product/*" # Combine with pattern matching
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("shop.example.com", config) urls = await seeder.urls("shop.example.com", config)
# Filter further by price (from metadata) # Filter further by price (from metadata)
affordable = [ affordable = [
@@ -550,8 +551,8 @@ config = SeedingConfig(
scoring_method="bm25", scoring_method="bm25",
score_threshold=0.35 score_threshold=0.35
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("technews.com", config) urls = await seeder.urls("technews.com", config)
# Filter by date # Filter by date
from datetime import datetime, timedelta from datetime import datetime, timedelta
@@ -591,8 +592,8 @@ for query in queries:
score_threshold=0.4, score_threshold=0.4,
max_urls=10 # Top 10 per topic max_urls=10 # Top 10 per topic
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("learning-platform.com", config) urls = await seeder.urls("learning-platform.com", config)
all_tutorials.extend(urls) all_tutorials.extend(urls)
# Remove duplicates while preserving order # Remove duplicates while preserving order
@@ -625,7 +626,8 @@ config = SeedingConfig(
) )
# Returns a dictionary: {domain: [urls]} # Returns a dictionary: {domain: [urls]}
results = await seeder.many_urls(domains, config) async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(domains, config)
# Process results # Process results
for domain, urls in results.items(): for domain, urls in results.items():
@@ -654,8 +656,8 @@ config = SeedingConfig(
pattern="*/blog/*", pattern="*/blog/*",
max_urls=100 max_urls=100
) )
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(competitors, config) results = await seeder.many_urls(competitors, config)
# Analyze content types # Analyze content types
for domain, urls in results.items(): for domain, urls in results.items():
@@ -690,8 +692,8 @@ config = SeedingConfig(
score_threshold=0.3, score_threshold=0.3,
max_urls=20 # Per site max_urls=20 # Per site
) )
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(educational_sites, config) results = await seeder.many_urls(educational_sites, config)
# Find the best beginner tutorials # Find the best beginner tutorials
all_tutorials = [] all_tutorials = []
@@ -731,8 +733,8 @@ config = SeedingConfig(
score_threshold=0.5, # High threshold for relevance score_threshold=0.5, # High threshold for relevance
max_urls=10 max_urls=10
) )
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(news_sites, config) results = await seeder.many_urls(news_sites, config)
# Collect all mentions # Collect all mentions
mentions = [] mentions = []

View File

@@ -0,0 +1,175 @@
#!/usr/bin/env python3
"""
Final test and demo for HTTPS preservation feature (Issue #1410)
This demonstrates how the preserve_https_for_internal_links flag
prevents HTTPS downgrade when servers redirect to HTTP.
"""
import sys
import os
from urllib.parse import urljoin, urlparse
def demonstrate_issue():
    """Show the problem: HTTPS -> HTTP redirect causes HTTP links"""
    banner = "=" * 60
    print(banner)
    print("DEMONSTRATING THE ISSUE")
    print(banner)

    # Simulate what happens during crawling: the crawler starts on HTTPS,
    # but the server answers with a redirect to the HTTP variant.
    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"  # Server redirects to HTTP
    relative_link = "/author/Albert-Einstein"

    # Standard URL joining resolves against the redirected (HTTP) base,
    # silently downgrading every internal link.
    resolved_url = urljoin(redirected_url, relative_link)

    for label, value in (
        ("Original URL", original_url),
        ("Redirected to", redirected_url),
        ("Relative link", relative_link),
        ("Resolved link", resolved_url),
    ):
        print(f"{label}: {value}")
    print("\n❌ Problem: Link is now HTTP instead of HTTPS!")
    return resolved_url
def demonstrate_solution():
    """Show the solution: preserve HTTPS for internal links"""
    banner = "=" * 60
    print("\n" + banner)
    print("DEMONSTRATING THE SOLUTION")
    print(banner)

    def normalize_url_with_preservation(href, base_url, preserve_https=False, original_scheme=None):
        """Normalize URL with optional HTTPS preservation"""
        # Standard resolution against the (possibly redirected) base.
        joined = urljoin(base_url, href.strip())
        # Upgrade back to HTTPS only for same-domain links when the
        # original request was made over HTTPS.
        if preserve_https and original_scheme == 'https':
            target = urlparse(joined)
            source = urlparse(base_url)
            if target.scheme == 'http' and target.netloc == source.netloc:
                joined = joined.replace('http://', 'https://', 1)
                print(f" → Preserved HTTPS for {target.netloc}")
        return joined

    # Same scenario as demonstrate_issue: HTTPS page redirected to HTTP.
    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"
    relative_link = "/author/Albert-Einstein"

    # Without preservation (current behavior) — link stays downgraded.
    resolved_without = normalize_url_with_preservation(
        relative_link, redirected_url,
        preserve_https=False, original_scheme='https'
    )
    print("\nWithout preservation:")
    print(f" Result: {resolved_without}")

    # With preservation (new feature) — link is upgraded back to HTTPS.
    resolved_with = normalize_url_with_preservation(
        relative_link, redirected_url,
        preserve_https=True, original_scheme='https'
    )
    print("\nWith preservation (preserve_https_for_internal_links=True):")
    print(f" Result: {resolved_with}")
    print("\n✅ Solution: Internal link stays HTTPS!")
    return resolved_with
def test_edge_cases():
    """Exercise important edge cases of the HTTPS-preservation logic.

    Covers: external links, links that are already HTTPS, pages whose
    original scheme was plain HTTP, subdomains, and protocol-relative
    URLs (which must follow the base URL's scheme, not be upgraded).

    Returns:
        int: number of failing cases (0 means every case behaved as expected).
    """
    print("\n" + "=" * 60)
    print("EDGE CASES")
    print("=" * 60)

    def preserve_https(href, base_url, original_scheme):
        """Helper mirroring the preservation logic under test."""
        full_url = urljoin(base_url, href)
        if original_scheme == 'https':
            parsed_full = urlparse(full_url)
            parsed_base = urlparse(base_url)
            # Protocol-relative URLs (//host/...) deliberately follow the
            # base URL's scheme, so they are excluded from the upgrade.
            if (parsed_full.scheme == 'http' and
                    parsed_full.netloc == parsed_base.netloc and
                    not href.strip().startswith('//')):
                full_url = full_url.replace('http://', 'https://', 1)
        return full_url

    test_cases = [
        # (description, href, base_url, original_scheme, should_be_https)
        ("External link", "http://other.com/page", "http://example.com", "https", False),
        ("Already HTTPS", "/page", "https://example.com", "https", True),
        ("No original HTTPS", "/page", "http://example.com", "http", False),
        ("Subdomain", "/page", "http://sub.example.com", "https", True),
        ("Protocol-relative", "//example.com/page", "http://example.com", "https", False),
    ]

    failures = 0
    for desc, href, base_url, orig_scheme, should_be_https in test_cases:
        result = preserve_https(href, base_url, orig_scheme)
        is_https = result.startswith('https://')
        passed = is_https == should_be_https
        if not passed:
            failures += 1
        # Fixed: both branches of this conditional were empty strings,
        # making pass/fail invisible in the output.
        status = "PASS" if passed else "FAIL"
        print(f"\n{status} {desc}:")
        print(f" Input: {href} + {base_url}")
        print(f" Result: {result}")
        print(f" Expected HTTPS: {should_be_https}, Got: {is_https}")
    return failures
def usage_example():
    """Show how to use the feature in crawl4ai"""
    header = "=" * 60
    print("\n" + header)
    print("USAGE IN CRAWL4AI")
    print(header)
    # The guide text is a single fixed string; keep it in one literal so
    # it renders exactly as written.
    guide = """
To enable HTTPS preservation in your crawl4ai code:
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
async with AsyncWebCrawler() as crawler:
    config = CrawlerRunConfig(
        preserve_https_for_internal_links=True  # Enable HTTPS preservation
    )
    result = await crawler.arun(
        url="https://example.com",
        config=config
    )
    # All internal links will maintain HTTPS even if
    # the server redirects to HTTP
```
This is especially useful for:
- Sites that redirect HTTPS to HTTP but still support HTTPS
- Security-conscious crawling where you want to stay on HTTPS
- Avoiding mixed content issues in downstream processing
"""
    print(guide)
if __name__ == "__main__":
    # Run every demonstration in order, then print a closing banner.
    for step in (demonstrate_issue, demonstrate_solution, test_edge_cases, usage_example):
        step()
    closing = "=" * 60
    print("\n" + closing)
    print("✅ All tests complete!")
    print(closing)