Compare commits

..

2 Commits

Author SHA1 Message Date
ntohidi
bdacf61ca9 feat: update documentation for preserve_https_for_internal_links. ref #1410 2025-08-28 17:48:12 +08:00
ntohidi
f566c5a376 feat: add preserve_https_for_internal_links flag to maintain HTTPS during crawling. Ref #1410
Added a new `preserve_https_for_internal_links` configuration flag that preserves the original HTTPS scheme for same-domain links even when the server redirects to HTTP.
2025-08-28 17:38:40 +08:00
13 changed files with 255 additions and 247 deletions

View File

@@ -5,6 +5,16 @@ All notable changes to Crawl4AI will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
- Maintains HTTPS scheme for internal links even when servers redirect to HTTP
- Prevents security downgrades during deep crawling
- Useful for security-conscious crawling and sites supporting both protocols
- Fully backward compatible with opt-in flag (default: `False`)
- Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP
## [0.7.3] - 2025-08-09
### Added

View File

@@ -97,16 +97,13 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
if value != param.default and not ignore_default_value:
current_values[name] = to_serializable_dict(value)
# Don't serialize private __slots__ - they're internal implementation details
# not constructor parameters. This was causing URLPatternFilter to fail
# because _simple_suffixes was being serialized as 'simple_suffixes'
# if hasattr(obj, '__slots__'):
# for slot in obj.__slots__:
# if slot.startswith('_'): # Handle private slots
# attr_name = slot[1:] # Remove leading '_'
# value = getattr(obj, slot, None)
# if value is not None:
# current_values[attr_name] = to_serializable_dict(value)
if hasattr(obj, '__slots__'):
for slot in obj.__slots__:
if slot.startswith('_'): # Handle private slots
attr_name = slot[1:] # Remove leading '_'
value = getattr(obj, slot, None)
if value is not None:
current_values[attr_name] = to_serializable_dict(value)
@@ -1124,6 +1121,7 @@ class CrawlerRunConfig():
exclude_domains: list = None,
exclude_internal_links: bool = False,
score_links: bool = False,
preserve_https_for_internal_links: bool = False,
# Debugging and Logging Parameters
verbose: bool = True,
log_console: bool = False,
@@ -1247,6 +1245,7 @@ class CrawlerRunConfig():
self.exclude_domains = exclude_domains or []
self.exclude_internal_links = exclude_internal_links
self.score_links = score_links
self.preserve_https_for_internal_links = preserve_https_for_internal_links
# Debugging and Logging Parameters
self.verbose = verbose
@@ -1520,6 +1519,7 @@ class CrawlerRunConfig():
exclude_domains=kwargs.get("exclude_domains", []),
exclude_internal_links=kwargs.get("exclude_internal_links", False),
score_links=kwargs.get("score_links", False),
preserve_https_for_internal_links=kwargs.get("preserve_https_for_internal_links", False),
# Debugging and Logging Parameters
verbose=kwargs.get("verbose", True),
log_console=kwargs.get("log_console", False),
@@ -1626,6 +1626,7 @@ class CrawlerRunConfig():
"exclude_domains": self.exclude_domains,
"exclude_internal_links": self.exclude_internal_links,
"score_links": self.score_links,
"preserve_https_for_internal_links": self.preserve_https_for_internal_links,
"verbose": self.verbose,
"log_console": self.log_console,
"capture_network_requests": self.capture_network_requests,

View File

@@ -354,6 +354,7 @@ class AsyncWebCrawler:
###############################################################
# Process the HTML content, Call CrawlerStrategy.process_html #
###############################################################
from urllib.parse import urlparse
crawl_result: CrawlResult = await self.aprocess_html(
url=url,
html=html,
@@ -364,6 +365,7 @@ class AsyncWebCrawler:
verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
original_scheme=urlparse(url).scheme,
**kwargs,
)

View File

@@ -258,7 +258,11 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
continue
try:
normalized_href = normalize_url(href, url)
normalized_href = normalize_url(
href, url,
preserve_https=kwargs.get('preserve_https_for_internal_links', False),
original_scheme=kwargs.get('original_scheme')
)
link_data = {
"href": normalized_href,
"text": link.text_content().strip(),

View File

@@ -47,13 +47,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
self.url_scorer = url_scorer
self.include_external = include_external
self.max_pages = max_pages
# self.logger = logger or logging.getLogger(__name__)
# Ensure logger is always a Logger instance, not a dict from serialization
if isinstance(logger, logging.Logger):
self.logger = logger
else:
# Create a new logger if logger is None, dict, or any other non-Logger type
self.logger = logging.getLogger(__name__)
self.logger = logger or logging.getLogger(__name__)
self.stats = TraversalStats(start_time=datetime.now())
self._cancel_event = asyncio.Event()
self._pages_crawled = 0

View File

@@ -38,13 +38,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
self.include_external = include_external
self.score_threshold = score_threshold
self.max_pages = max_pages
# self.logger = logger or logging.getLogger(__name__)
# Ensure logger is always a Logger instance, not a dict from serialization
if isinstance(logger, logging.Logger):
self.logger = logger
else:
# Create a new logger if logger is None, dict, or any other non-Logger type
self.logger = logging.getLogger(__name__)
self.logger = logger or logging.getLogger(__name__)
self.stats = TraversalStats(start_time=datetime.now())
self._cancel_event = asyncio.Event()
self._pages_crawled = 0

View File

@@ -120,9 +120,6 @@ class URLPatternFilter(URLFilter):
"""Pattern filter balancing speed and completeness"""
__slots__ = (
"patterns", # Store original patterns for serialization
"use_glob", # Store original use_glob for serialization
"reverse", # Store original reverse for serialization
"_simple_suffixes",
"_simple_prefixes",
"_domain_patterns",
@@ -145,11 +142,6 @@ class URLPatternFilter(URLFilter):
reverse: bool = False,
):
super().__init__()
# Store original constructor params for serialization
self.patterns = patterns
self.use_glob = use_glob
self.reverse = reverse
self._reverse = reverse
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns

View File

@@ -253,16 +253,6 @@ class CrawlResult(BaseModel):
requirements change, this is where you would update the logic.
"""
result = super().model_dump(*args, **kwargs)
# Remove any property descriptors that might have been included
# These deprecated properties should not be in the serialized output
for key in ['fit_html', 'fit_markdown', 'markdown_v2']:
if key in result and isinstance(result[key], property):
# del result[key]
# Nasrin: I decided to convert it to string instead of removing it.
result[key] = str(result[key])
# Add the markdown field properly
if self._markdown is not None:
result["markdown"] = self._markdown.model_dump()
return result

View File

@@ -2146,7 +2146,9 @@ def normalize_url(
drop_query_tracking=True,
sort_query=True,
keep_fragment=False,
extra_drop_params=None
extra_drop_params=None,
preserve_https=False,
original_scheme=None
):
"""
Extended URL normalizer
@@ -2176,6 +2178,17 @@ def normalize_url(
# Resolve relative paths first
full_url = urljoin(base_url, href.strip())
# Preserve HTTPS if requested and original scheme was HTTPS
if preserve_https and original_scheme == 'https':
parsed_full = urlparse(full_url)
parsed_base = urlparse(base_url)
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
if (parsed_full.scheme == 'http' and
parsed_full.netloc == parsed_base.netloc and
not href.strip().startswith('//')):
full_url = full_url.replace('http://', 'https://', 1)
# Parse once, edit parts, then rebuild
parsed = urlparse(full_url)
@@ -2225,7 +2238,7 @@ def normalize_url(
return normalized
def normalize_url_for_deep_crawl(href, base_url):
def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
"""Normalize URLs to ensure consistent format"""
from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
@@ -2236,6 +2249,17 @@ def normalize_url_for_deep_crawl(href, base_url):
# Use urljoin to handle relative URLs
full_url = urljoin(base_url, href.strip())
# Preserve HTTPS if requested and original scheme was HTTPS
if preserve_https and original_scheme == 'https':
parsed_full = urlparse(full_url)
parsed_base = urlparse(base_url)
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
if (parsed_full.scheme == 'http' and
parsed_full.netloc == parsed_base.netloc and
not href.strip().startswith('//')):
full_url = full_url.replace('http://', 'https://', 1)
# Parse the URL for normalization
parsed = urlparse(full_url)
@@ -2273,7 +2297,7 @@ def normalize_url_for_deep_crawl(href, base_url):
return normalized
@lru_cache(maxsize=10000)
def efficient_normalize_url_for_deep_crawl(href, base_url):
def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
"""Efficient URL normalization with proper parsing"""
from urllib.parse import urljoin
@@ -2283,6 +2307,17 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
# Resolve relative URLs
full_url = urljoin(base_url, href.strip())
# Preserve HTTPS if requested and original scheme was HTTPS
if preserve_https and original_scheme == 'https':
parsed_full = urlparse(full_url)
parsed_base = urlparse(base_url)
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
if (parsed_full.scheme == 'http' and
parsed_full.netloc == parsed_base.netloc and
not href.strip().startswith('//')):
full_url = full_url.replace('http://', 'https://', 1)
# Use proper URL parsing
parsed = urlparse(full_url)

View File

@@ -155,6 +155,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).

View File

@@ -472,6 +472,17 @@ Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pag
5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
6.**Preserve HTTPS for security.** If crawling HTTPS sites that redirect to HTTP, use `preserve_https_for_internal_links=True` to maintain secure connections:
```python
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2),
preserve_https_for_internal_links=True # Keep HTTPS even if server redirects to HTTP
)
```
This is especially useful for security-conscious crawling or when dealing with sites that support both protocols.
---
## 10. Summary & Next Steps

View File

@@ -1,201 +0,0 @@
"""
Test the complete fix for both the filter serialization and JSON serialization issues.
"""
import asyncio
import httpx
from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
BASE_URL = "http://localhost:11234/" # Adjust port as needed
async def test_with_docker_client():
"""Test using the Docker client (same as 1419.py)."""
from crawl4ai.docker_client import Crawl4aiDockerClient
print("=" * 60)
print("Testing with Docker Client")
print("=" * 60)
try:
async with Crawl4aiDockerClient(
base_url=BASE_URL,
verbose=True,
) as client:
# Create filter chain - testing the serialization fix
filter_chain = [
URLPatternFilter(
# patterns=["*about*", "*privacy*", "*terms*"],
patterns=["*advanced*"],
reverse=True
),
]
crawler_config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=2, # Keep it shallow for testing
# max_pages=5, # Limit pages for testing
filter_chain=FilterChain(filter_chain)
),
cache_mode=CacheMode.BYPASS,
)
print("\n1. Testing crawl with filters...")
results = await client.crawl(
["https://docs.crawl4ai.com"], # Simple test page
browser_config=BrowserConfig(headless=True),
crawler_config=crawler_config,
)
if results:
print(f"✅ Crawl succeeded! Type: {type(results)}")
if hasattr(results, 'success'):
print(f"✅ Results success: {results.success}")
# Test that we can iterate results without JSON errors
if hasattr(results, '__iter__'):
for i, result in enumerate(results):
if hasattr(result, 'url'):
print(f" Result {i}: {result.url[:50]}...")
else:
print(f" Result {i}: {str(result)[:50]}...")
else:
# Handle list of results
print(f"✅ Got {len(results)} results")
for i, result in enumerate(results[:3]): # Show first 3
print(f" Result {i}: {result.url[:50]}...")
else:
print("❌ Crawl failed - no results returned")
return False
print("\n✅ Docker client test completed successfully!")
return True
except Exception as e:
print(f"❌ Docker client test failed: {e}")
import traceback
traceback.print_exc()
return False
async def test_with_rest_api():
"""Test using REST API directly."""
print("\n" + "=" * 60)
print("Testing with REST API")
print("=" * 60)
# Create filter configuration
deep_crawl_strategy_payload = {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 2,
# "max_pages": 5,
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "URLPatternFilter",
"params": {
"patterns": ["*advanced*"],
"reverse": True
}
}
]
}
}
}
}
crawl_payload = {
"urls": ["https://docs.crawl4ai.com"],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"deep_crawl_strategy": deep_crawl_strategy_payload,
"cache_mode": "bypass"
}
}
}
try:
async with httpx.AsyncClient() as client:
print("\n1. Sending crawl request to REST API...")
response = await client.post(
f"{BASE_URL}crawl",
json=crawl_payload,
timeout=30
)
if response.status_code == 200:
print(f"✅ REST API returned 200 OK")
data = response.json()
if data.get("success"):
results = data.get("results", [])
print(f"✅ Got {len(results)} results")
for i, result in enumerate(results[:3]):
print(f" Result {i}: {result.get('url', 'unknown')[:50]}...")
else:
print(f"❌ Crawl not successful: {data}")
return False
else:
print(f"❌ REST API returned {response.status_code}")
print(f" Response: {response.text[:500]}")
return False
print("\n✅ REST API test completed successfully!")
return True
except Exception as e:
print(f"❌ REST API test failed: {e}")
import traceback
traceback.print_exc()
return False
async def main():
"""Run all tests."""
print("\n🧪 TESTING COMPLETE FIX FOR DOCKER FILTER AND JSON ISSUES")
print("=" * 60)
print("Make sure the server is running with the updated code!")
print("=" * 60)
results = []
# Test 1: Docker client
docker_passed = await test_with_docker_client()
results.append(("Docker Client", docker_passed))
# Test 2: REST API
rest_passed = await test_with_rest_api()
results.append(("REST API", rest_passed))
# Summary
print("\n" + "=" * 60)
print("FINAL TEST SUMMARY")
print("=" * 60)
all_passed = True
for test_name, passed in results:
status = "✅ PASSED" if passed else "❌ FAILED"
print(f"{test_name:20} {status}")
if not passed:
all_passed = False
print("=" * 60)
if all_passed:
print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
print("\nThe fixes:")
print("1. Filter serialization: Fixed by not serializing private __slots__")
print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
else:
print("⚠️ Some tests failed. Please check the server logs for details.")
return 0 if all_passed else 1
if __name__ == "__main__":
import sys
sys.exit(asyncio.run(main()))

View File

@@ -0,0 +1,175 @@
#!/usr/bin/env python3
"""
Final test and demo for HTTPS preservation feature (Issue #1410)
This demonstrates how the preserve_https_for_internal_links flag
prevents HTTPS downgrade when servers redirect to HTTP.
"""
import sys
import os
from urllib.parse import urljoin, urlparse
def demonstrate_issue():
"""Show the problem: HTTPS -> HTTP redirect causes HTTP links"""
print("=" * 60)
print("DEMONSTRATING THE ISSUE")
print("=" * 60)
# Simulate what happens during crawling
original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/" # Server redirects to HTTP
# Extract a relative link
relative_link = "/author/Albert-Einstein"
# Standard URL joining uses the redirected (HTTP) base
resolved_url = urljoin(redirected_url, relative_link)
print(f"Original URL: {original_url}")
print(f"Redirected to: {redirected_url}")
print(f"Relative link: {relative_link}")
print(f"Resolved link: {resolved_url}")
print(f"\n❌ Problem: Link is now HTTP instead of HTTPS!")
return resolved_url
def demonstrate_solution():
"""Show the solution: preserve HTTPS for internal links"""
print("\n" + "=" * 60)
print("DEMONSTRATING THE SOLUTION")
print("=" * 60)
# Our normalize_url with HTTPS preservation
def normalize_url_with_preservation(href, base_url, preserve_https=False, original_scheme=None):
"""Normalize URL with optional HTTPS preservation"""
# Standard resolution
full_url = urljoin(base_url, href.strip())
# Preserve HTTPS if requested
if preserve_https and original_scheme == 'https':
parsed_full = urlparse(full_url)
parsed_base = urlparse(base_url)
# Only for same-domain links
if parsed_full.scheme == 'http' and parsed_full.netloc == parsed_base.netloc:
full_url = full_url.replace('http://', 'https://', 1)
print(f" → Preserved HTTPS for {parsed_full.netloc}")
return full_url
# Same scenario as before
original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"
relative_link = "/author/Albert-Einstein"
# Without preservation (current behavior)
resolved_without = normalize_url_with_preservation(
relative_link, redirected_url,
preserve_https=False, original_scheme='https'
)
print(f"\nWithout preservation:")
print(f" Result: {resolved_without}")
# With preservation (new feature)
resolved_with = normalize_url_with_preservation(
relative_link, redirected_url,
preserve_https=True, original_scheme='https'
)
print(f"\nWith preservation (preserve_https_for_internal_links=True):")
print(f" Result: {resolved_with}")
print(f"\n✅ Solution: Internal link stays HTTPS!")
return resolved_with
def test_edge_cases():
"""Test important edge cases"""
print("\n" + "=" * 60)
print("EDGE CASES")
print("=" * 60)
from urllib.parse import urljoin, urlparse
def preserve_https(href, base_url, original_scheme):
"""Helper to test preservation logic"""
full_url = urljoin(base_url, href)
if original_scheme == 'https':
parsed_full = urlparse(full_url)
parsed_base = urlparse(base_url)
# Fixed: check for protocol-relative URLs
if (parsed_full.scheme == 'http' and
parsed_full.netloc == parsed_base.netloc and
not href.strip().startswith('//')):
full_url = full_url.replace('http://', 'https://', 1)
return full_url
test_cases = [
# (description, href, base_url, original_scheme, should_be_https)
("External link", "http://other.com/page", "http://example.com", "https", False),
("Already HTTPS", "/page", "https://example.com", "https", True),
("No original HTTPS", "/page", "http://example.com", "http", False),
("Subdomain", "/page", "http://sub.example.com", "https", True),
("Protocol-relative", "//example.com/page", "http://example.com", "https", False),
]
for desc, href, base_url, orig_scheme, should_be_https in test_cases:
result = preserve_https(href, base_url, orig_scheme)
is_https = result.startswith('https://')
status = "" if is_https == should_be_https else ""
print(f"\n{status} {desc}:")
print(f" Input: {href} + {base_url}")
print(f" Result: {result}")
print(f" Expected HTTPS: {should_be_https}, Got: {is_https}")
def usage_example():
"""Show how to use the feature in crawl4ai"""
print("\n" + "=" * 60)
print("USAGE IN CRAWL4AI")
print("=" * 60)
print("""
To enable HTTPS preservation in your crawl4ai code:
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
preserve_https_for_internal_links=True # Enable HTTPS preservation
)
result = await crawler.arun(
url="https://example.com",
config=config
)
# All internal links will maintain HTTPS even if
# the server redirects to HTTP
```
This is especially useful for:
- Sites that redirect HTTPS to HTTP but still support HTTPS
- Security-conscious crawling where you want to stay on HTTPS
- Avoiding mixed content issues in downstream processing
""")
if __name__ == "__main__":
# Run all demonstrations
demonstrate_issue()
demonstrate_solution()
test_edge_cases()
usage_example()
print("\n" + "=" * 60)
print("✅ All tests complete!")
print("=" * 60)