Compare commits
2 Commits
fix/docker
...
fix/https-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bdacf61ca9 | ||
|
|
f566c5a376 |
10
CHANGELOG.md
10
CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to Crawl4AI will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
|
||||
- Maintains HTTPS scheme for internal links even when servers redirect to HTTP
|
||||
- Prevents security downgrades during deep crawling
|
||||
- Useful for security-conscious crawling and sites supporting both protocols
|
||||
- Fully backward compatible with opt-in flag (default: `False`)
|
||||
- Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP
|
||||
|
||||
## [0.7.3] - 2025-08-09
|
||||
|
||||
### Added
|
||||
|
||||
@@ -1121,6 +1121,7 @@ class CrawlerRunConfig():
|
||||
exclude_domains: list = None,
|
||||
exclude_internal_links: bool = False,
|
||||
score_links: bool = False,
|
||||
preserve_https_for_internal_links: bool = False,
|
||||
# Debugging and Logging Parameters
|
||||
verbose: bool = True,
|
||||
log_console: bool = False,
|
||||
@@ -1244,6 +1245,7 @@ class CrawlerRunConfig():
|
||||
self.exclude_domains = exclude_domains or []
|
||||
self.exclude_internal_links = exclude_internal_links
|
||||
self.score_links = score_links
|
||||
self.preserve_https_for_internal_links = preserve_https_for_internal_links
|
||||
|
||||
# Debugging and Logging Parameters
|
||||
self.verbose = verbose
|
||||
@@ -1517,6 +1519,7 @@ class CrawlerRunConfig():
|
||||
exclude_domains=kwargs.get("exclude_domains", []),
|
||||
exclude_internal_links=kwargs.get("exclude_internal_links", False),
|
||||
score_links=kwargs.get("score_links", False),
|
||||
preserve_https_for_internal_links=kwargs.get("preserve_https_for_internal_links", False),
|
||||
# Debugging and Logging Parameters
|
||||
verbose=kwargs.get("verbose", True),
|
||||
log_console=kwargs.get("log_console", False),
|
||||
@@ -1623,6 +1626,7 @@ class CrawlerRunConfig():
|
||||
"exclude_domains": self.exclude_domains,
|
||||
"exclude_internal_links": self.exclude_internal_links,
|
||||
"score_links": self.score_links,
|
||||
"preserve_https_for_internal_links": self.preserve_https_for_internal_links,
|
||||
"verbose": self.verbose,
|
||||
"log_console": self.log_console,
|
||||
"capture_network_requests": self.capture_network_requests,
|
||||
|
||||
@@ -354,6 +354,7 @@ class AsyncWebCrawler:
|
||||
###############################################################
|
||||
# Process the HTML content, Call CrawlerStrategy.process_html #
|
||||
###############################################################
|
||||
from urllib.parse import urlparse
|
||||
crawl_result: CrawlResult = await self.aprocess_html(
|
||||
url=url,
|
||||
html=html,
|
||||
@@ -364,6 +365,7 @@ class AsyncWebCrawler:
|
||||
verbose=config.verbose,
|
||||
is_raw_html=True if url.startswith("raw:") else False,
|
||||
redirected_url=async_response.redirected_url,
|
||||
original_scheme=urlparse(url).scheme,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -258,7 +258,11 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
||||
continue
|
||||
|
||||
try:
|
||||
normalized_href = normalize_url(href, url)
|
||||
normalized_href = normalize_url(
|
||||
href, url,
|
||||
preserve_https=kwargs.get('preserve_https_for_internal_links', False),
|
||||
original_scheme=kwargs.get('original_scheme')
|
||||
)
|
||||
link_data = {
|
||||
"href": normalized_href,
|
||||
"text": link.text_content().strip(),
|
||||
|
||||
@@ -2146,7 +2146,9 @@ def normalize_url(
|
||||
drop_query_tracking=True,
|
||||
sort_query=True,
|
||||
keep_fragment=False,
|
||||
extra_drop_params=None
|
||||
extra_drop_params=None,
|
||||
preserve_https=False,
|
||||
original_scheme=None
|
||||
):
|
||||
"""
|
||||
Extended URL normalizer
|
||||
@@ -2176,6 +2178,17 @@ def normalize_url(
|
||||
|
||||
# Resolve relative paths first
|
||||
full_url = urljoin(base_url, href.strip())
|
||||
|
||||
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||
if preserve_https and original_scheme == 'https':
|
||||
parsed_full = urlparse(full_url)
|
||||
parsed_base = urlparse(base_url)
|
||||
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||
if (parsed_full.scheme == 'http' and
|
||||
parsed_full.netloc == parsed_base.netloc and
|
||||
not href.strip().startswith('//')):
|
||||
full_url = full_url.replace('http://', 'https://', 1)
|
||||
|
||||
# Parse once, edit parts, then rebuild
|
||||
parsed = urlparse(full_url)
|
||||
@@ -2225,7 +2238,7 @@ def normalize_url(
|
||||
return normalized
|
||||
|
||||
|
||||
def normalize_url_for_deep_crawl(href, base_url):
|
||||
def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
|
||||
"""Normalize URLs to ensure consistent format"""
|
||||
from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
|
||||
|
||||
@@ -2236,6 +2249,17 @@ def normalize_url_for_deep_crawl(href, base_url):
|
||||
# Use urljoin to handle relative URLs
|
||||
full_url = urljoin(base_url, href.strip())
|
||||
|
||||
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||
if preserve_https and original_scheme == 'https':
|
||||
parsed_full = urlparse(full_url)
|
||||
parsed_base = urlparse(base_url)
|
||||
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||
if (parsed_full.scheme == 'http' and
|
||||
parsed_full.netloc == parsed_base.netloc and
|
||||
not href.strip().startswith('//')):
|
||||
full_url = full_url.replace('http://', 'https://', 1)
|
||||
|
||||
# Parse the URL for normalization
|
||||
parsed = urlparse(full_url)
|
||||
|
||||
@@ -2273,7 +2297,7 @@ def normalize_url_for_deep_crawl(href, base_url):
|
||||
return normalized
|
||||
|
||||
@lru_cache(maxsize=10000)
|
||||
def efficient_normalize_url_for_deep_crawl(href, base_url):
|
||||
def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
|
||||
"""Efficient URL normalization with proper parsing"""
|
||||
from urllib.parse import urljoin
|
||||
|
||||
@@ -2283,6 +2307,17 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
|
||||
# Resolve relative URLs
|
||||
full_url = urljoin(base_url, href.strip())
|
||||
|
||||
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||
if preserve_https and original_scheme == 'https':
|
||||
parsed_full = urlparse(full_url)
|
||||
parsed_base = urlparse(base_url)
|
||||
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||
if (parsed_full.scheme == 'http' and
|
||||
parsed_full.netloc == parsed_base.netloc and
|
||||
not href.strip().startswith('//')):
|
||||
full_url = full_url.replace('http://', 'https://', 1)
|
||||
|
||||
# Use proper URL parsing
|
||||
parsed = urlparse(full_url)
|
||||
|
||||
|
||||
@@ -692,7 +692,8 @@ app:
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||
# api_key: sk-... # If you pass the API key directly (not recommended)
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
# Redis Configuration (Used by internal Redis server managed by supervisord)
|
||||
redis:
|
||||
|
||||
@@ -96,7 +96,7 @@ async def handle_llm_qa(
|
||||
response = perform_completion_with_backoff(
|
||||
provider=config["llm"]["provider"],
|
||||
prompt_with_variables=prompt,
|
||||
api_token=get_llm_api_key(config) # Returns None to let litellm handle it
|
||||
api_token=get_llm_api_key(config)
|
||||
)
|
||||
|
||||
return response.choices[0].message.content
|
||||
@@ -127,7 +127,7 @@ async def process_llm_extraction(
|
||||
"error": error_msg
|
||||
})
|
||||
return
|
||||
api_key = get_llm_api_key(config, provider) # Returns None to let litellm handle it
|
||||
api_key = get_llm_api_key(config, provider)
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
llm_config=LLMConfig(
|
||||
provider=provider or config["llm"]["provider"],
|
||||
@@ -203,7 +203,7 @@ async def handle_markdown_request(
|
||||
FilterType.LLM: LLMContentFilter(
|
||||
llm_config=LLMConfig(
|
||||
provider=provider or config["llm"]["provider"],
|
||||
api_token=get_llm_api_key(config, provider), # Returns None to let litellm handle it
|
||||
api_token=get_llm_api_key(config, provider),
|
||||
),
|
||||
instruction=query or "Extract main content"
|
||||
)
|
||||
|
||||
@@ -11,7 +11,8 @@ app:
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini"
|
||||
# api_key: sk-... # If you pass the API key directly (not recommended)
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
# Redis Configuration
|
||||
redis:
|
||||
|
||||
@@ -71,7 +71,7 @@ def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
|
||||
|
||||
|
||||
|
||||
def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> Optional[str]:
|
||||
def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> str:
|
||||
"""Get the appropriate API key based on the LLM provider.
|
||||
|
||||
Args:
|
||||
@@ -79,14 +79,19 @@ def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> Optional[st
|
||||
provider: Optional provider override (e.g., "openai/gpt-4")
|
||||
|
||||
Returns:
|
||||
The API key if directly configured, otherwise None to let litellm handle it
|
||||
The API key for the provider, or empty string if not found
|
||||
"""
|
||||
# Check if direct API key is configured (for backward compatibility)
|
||||
|
||||
# Use provided provider or fall back to config
|
||||
if not provider:
|
||||
provider = config["llm"]["provider"]
|
||||
|
||||
# Check if direct API key is configured
|
||||
if "api_key" in config["llm"]:
|
||||
return config["llm"]["api_key"]
|
||||
|
||||
# Return None - litellm will automatically find the right environment variable
|
||||
return None
|
||||
# Fall back to the configured api_key_env if no match
|
||||
return os.environ.get(config["llm"].get("api_key_env", ""), "")
|
||||
|
||||
|
||||
def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple[bool, str]:
|
||||
@@ -99,12 +104,16 @@ def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple
|
||||
Returns:
|
||||
Tuple of (is_valid, error_message)
|
||||
"""
|
||||
# If a direct API key is configured, validation passes
|
||||
if "api_key" in config["llm"]:
|
||||
return True, ""
|
||||
# Use provided provider or fall back to config
|
||||
if not provider:
|
||||
provider = config["llm"]["provider"]
|
||||
|
||||
# Get the API key for this provider
|
||||
api_key = get_llm_api_key(config, provider)
|
||||
|
||||
if not api_key:
|
||||
return False, f"No API key found for provider '{provider}'. Please set the appropriate environment variable."
|
||||
|
||||
# Otherwise, trust that litellm will find the appropriate environment variable
|
||||
# We can't easily validate this without reimplementing litellm's logic
|
||||
return True, ""
|
||||
|
||||
|
||||
|
||||
@@ -155,6 +155,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
|
||||
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
|
||||
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
|
||||
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
|
||||
| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |
|
||||
|
||||
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
|
||||
|
||||
|
||||
@@ -472,6 +472,17 @@ Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pag
|
||||
|
||||
5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
|
||||
|
||||
6.**Preserve HTTPS for security.** If crawling HTTPS sites that redirect to HTTP, use `preserve_https_for_internal_links=True` to maintain secure connections:
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2),
|
||||
preserve_https_for_internal_links=True # Keep HTTPS even if server redirects to HTTP
|
||||
)
|
||||
```
|
||||
|
||||
This is especially useful for security-conscious crawling or when dealing with sites that support both protocols.
|
||||
|
||||
---
|
||||
|
||||
## 10. Summary & Next Steps
|
||||
|
||||
@@ -176,7 +176,7 @@ The Docker setup now supports flexible LLM provider configuration through three
|
||||
|
||||
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
||||
|
||||
The system automatically selects the appropriate API key based on the provider. LiteLLM handles finding the correct environment variable for each provider (e.g., OPENAI_API_KEY for OpenAI, GEMINI_API_TOKEN for Google Gemini, etc.).
|
||||
The system automatically selects the appropriate API key based on the configured `api_key_env` in the config file.
|
||||
|
||||
#### 3. Build and Run with Compose
|
||||
|
||||
@@ -693,7 +693,8 @@ app:
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||
# api_key: sk-... # If you pass the API key directly (not recommended)
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
# Redis Configuration (Used by internal Redis server managed by supervisord)
|
||||
redis:
|
||||
|
||||
175
tests/test_preserve_https_for_internal_links.py
Normal file
175
tests/test_preserve_https_for_internal_links.py
Normal file
@@ -0,0 +1,175 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Final test and demo for HTTPS preservation feature (Issue #1410)
|
||||
|
||||
This demonstrates how the preserve_https_for_internal_links flag
|
||||
prevents HTTPS downgrade when servers redirect to HTTP.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
def demonstrate_issue():
|
||||
"""Show the problem: HTTPS -> HTTP redirect causes HTTP links"""
|
||||
|
||||
print("=" * 60)
|
||||
print("DEMONSTRATING THE ISSUE")
|
||||
print("=" * 60)
|
||||
|
||||
# Simulate what happens during crawling
|
||||
original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
|
||||
redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/" # Server redirects to HTTP
|
||||
|
||||
# Extract a relative link
|
||||
relative_link = "/author/Albert-Einstein"
|
||||
|
||||
# Standard URL joining uses the redirected (HTTP) base
|
||||
resolved_url = urljoin(redirected_url, relative_link)
|
||||
|
||||
print(f"Original URL: {original_url}")
|
||||
print(f"Redirected to: {redirected_url}")
|
||||
print(f"Relative link: {relative_link}")
|
||||
print(f"Resolved link: {resolved_url}")
|
||||
print(f"\n❌ Problem: Link is now HTTP instead of HTTPS!")
|
||||
|
||||
return resolved_url
|
||||
|
||||
def demonstrate_solution():
|
||||
"""Show the solution: preserve HTTPS for internal links"""
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("DEMONSTRATING THE SOLUTION")
|
||||
print("=" * 60)
|
||||
|
||||
# Our normalize_url with HTTPS preservation
|
||||
def normalize_url_with_preservation(href, base_url, preserve_https=False, original_scheme=None):
|
||||
"""Normalize URL with optional HTTPS preservation"""
|
||||
|
||||
# Standard resolution
|
||||
full_url = urljoin(base_url, href.strip())
|
||||
|
||||
# Preserve HTTPS if requested
|
||||
if preserve_https and original_scheme == 'https':
|
||||
parsed_full = urlparse(full_url)
|
||||
parsed_base = urlparse(base_url)
|
||||
|
||||
# Only for same-domain links
|
||||
if parsed_full.scheme == 'http' and parsed_full.netloc == parsed_base.netloc:
|
||||
full_url = full_url.replace('http://', 'https://', 1)
|
||||
print(f" → Preserved HTTPS for {parsed_full.netloc}")
|
||||
|
||||
return full_url
|
||||
|
||||
# Same scenario as before
|
||||
original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
|
||||
redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"
|
||||
relative_link = "/author/Albert-Einstein"
|
||||
|
||||
# Without preservation (current behavior)
|
||||
resolved_without = normalize_url_with_preservation(
|
||||
relative_link, redirected_url,
|
||||
preserve_https=False, original_scheme='https'
|
||||
)
|
||||
|
||||
print(f"\nWithout preservation:")
|
||||
print(f" Result: {resolved_without}")
|
||||
|
||||
# With preservation (new feature)
|
||||
resolved_with = normalize_url_with_preservation(
|
||||
relative_link, redirected_url,
|
||||
preserve_https=True, original_scheme='https'
|
||||
)
|
||||
|
||||
print(f"\nWith preservation (preserve_https_for_internal_links=True):")
|
||||
print(f" Result: {resolved_with}")
|
||||
print(f"\n✅ Solution: Internal link stays HTTPS!")
|
||||
|
||||
return resolved_with
|
||||
|
||||
def test_edge_cases():
|
||||
"""Test important edge cases"""
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("EDGE CASES")
|
||||
print("=" * 60)
|
||||
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
def preserve_https(href, base_url, original_scheme):
|
||||
"""Helper to test preservation logic"""
|
||||
full_url = urljoin(base_url, href)
|
||||
|
||||
if original_scheme == 'https':
|
||||
parsed_full = urlparse(full_url)
|
||||
parsed_base = urlparse(base_url)
|
||||
# Fixed: check for protocol-relative URLs
|
||||
if (parsed_full.scheme == 'http' and
|
||||
parsed_full.netloc == parsed_base.netloc and
|
||||
not href.strip().startswith('//')):
|
||||
full_url = full_url.replace('http://', 'https://', 1)
|
||||
|
||||
return full_url
|
||||
|
||||
test_cases = [
|
||||
# (description, href, base_url, original_scheme, should_be_https)
|
||||
("External link", "http://other.com/page", "http://example.com", "https", False),
|
||||
("Already HTTPS", "/page", "https://example.com", "https", True),
|
||||
("No original HTTPS", "/page", "http://example.com", "http", False),
|
||||
("Subdomain", "/page", "http://sub.example.com", "https", True),
|
||||
("Protocol-relative", "//example.com/page", "http://example.com", "https", False),
|
||||
]
|
||||
|
||||
for desc, href, base_url, orig_scheme, should_be_https in test_cases:
|
||||
result = preserve_https(href, base_url, orig_scheme)
|
||||
is_https = result.startswith('https://')
|
||||
status = "✅" if is_https == should_be_https else "❌"
|
||||
|
||||
print(f"\n{status} {desc}:")
|
||||
print(f" Input: {href} + {base_url}")
|
||||
print(f" Result: {result}")
|
||||
print(f" Expected HTTPS: {should_be_https}, Got: {is_https}")
|
||||
|
||||
def usage_example():
|
||||
"""Show how to use the feature in crawl4ai"""
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("USAGE IN CRAWL4AI")
|
||||
print("=" * 60)
|
||||
|
||||
print("""
|
||||
To enable HTTPS preservation in your crawl4ai code:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
preserve_https_for_internal_links=True # Enable HTTPS preservation
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
# All internal links will maintain HTTPS even if
|
||||
# the server redirects to HTTP
|
||||
```
|
||||
|
||||
This is especially useful for:
|
||||
- Sites that redirect HTTPS to HTTP but still support HTTPS
|
||||
- Security-conscious crawling where you want to stay on HTTPS
|
||||
- Avoiding mixed content issues in downstream processing
|
||||
""")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Run all demonstrations
|
||||
demonstrate_issue()
|
||||
demonstrate_solution()
|
||||
test_edge_cases()
|
||||
usage_example()
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("✅ All tests complete!")
|
||||
print("=" * 60)
|
||||
Reference in New Issue
Block a user