Compare commits
10 Commits
v0.7.4
...
fix/https-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bdacf61ca9 | ||
|
|
f566c5a376 | ||
|
|
ef174a4c7a | ||
|
|
f4206d6ba1 | ||
|
|
dad7c51481 | ||
|
|
f4a432829e | ||
|
|
ecbe5ffb84 | ||
|
|
7a8190ecb6 | ||
|
|
8e3c411a3e | ||
|
|
1e1c887a2f |
10
CHANGELOG.md
10
CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to Crawl4AI will be documented in this file.
|
|||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
|
||||||
|
- Maintains HTTPS scheme for internal links even when servers redirect to HTTP
|
||||||
|
- Prevents security downgrades during deep crawling
|
||||||
|
- Useful for security-conscious crawling and sites supporting both protocols
|
||||||
|
- Fully backward compatible with opt-in flag (default: `False`)
|
||||||
|
- Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP
|
||||||
|
|
||||||
## [0.7.3] - 2025-08-09
|
## [0.7.3] - 2025-08-09
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|||||||
@@ -373,7 +373,7 @@ async def main():
|
|||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://docs.micronaut.io/4.7.6/guide/",
|
url="https://docs.micronaut.io/4.9.9/guide/",
|
||||||
config=run_config
|
config=run_config
|
||||||
)
|
)
|
||||||
print(len(result.markdown.raw_markdown))
|
print(len(result.markdown.raw_markdown))
|
||||||
@@ -425,7 +425,7 @@ async def main():
|
|||||||
"type": "attribute",
|
"type": "attribute",
|
||||||
"attribute": "src"
|
"attribute": "src"
|
||||||
}
|
}
|
||||||
}
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||||
|
|||||||
@@ -1121,6 +1121,7 @@ class CrawlerRunConfig():
|
|||||||
exclude_domains: list = None,
|
exclude_domains: list = None,
|
||||||
exclude_internal_links: bool = False,
|
exclude_internal_links: bool = False,
|
||||||
score_links: bool = False,
|
score_links: bool = False,
|
||||||
|
preserve_https_for_internal_links: bool = False,
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
verbose: bool = True,
|
verbose: bool = True,
|
||||||
log_console: bool = False,
|
log_console: bool = False,
|
||||||
@@ -1244,6 +1245,7 @@ class CrawlerRunConfig():
|
|||||||
self.exclude_domains = exclude_domains or []
|
self.exclude_domains = exclude_domains or []
|
||||||
self.exclude_internal_links = exclude_internal_links
|
self.exclude_internal_links = exclude_internal_links
|
||||||
self.score_links = score_links
|
self.score_links = score_links
|
||||||
|
self.preserve_https_for_internal_links = preserve_https_for_internal_links
|
||||||
|
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
@@ -1517,6 +1519,7 @@ class CrawlerRunConfig():
|
|||||||
exclude_domains=kwargs.get("exclude_domains", []),
|
exclude_domains=kwargs.get("exclude_domains", []),
|
||||||
exclude_internal_links=kwargs.get("exclude_internal_links", False),
|
exclude_internal_links=kwargs.get("exclude_internal_links", False),
|
||||||
score_links=kwargs.get("score_links", False),
|
score_links=kwargs.get("score_links", False),
|
||||||
|
preserve_https_for_internal_links=kwargs.get("preserve_https_for_internal_links", False),
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
verbose=kwargs.get("verbose", True),
|
verbose=kwargs.get("verbose", True),
|
||||||
log_console=kwargs.get("log_console", False),
|
log_console=kwargs.get("log_console", False),
|
||||||
@@ -1623,6 +1626,7 @@ class CrawlerRunConfig():
|
|||||||
"exclude_domains": self.exclude_domains,
|
"exclude_domains": self.exclude_domains,
|
||||||
"exclude_internal_links": self.exclude_internal_links,
|
"exclude_internal_links": self.exclude_internal_links,
|
||||||
"score_links": self.score_links,
|
"score_links": self.score_links,
|
||||||
|
"preserve_https_for_internal_links": self.preserve_https_for_internal_links,
|
||||||
"verbose": self.verbose,
|
"verbose": self.verbose,
|
||||||
"log_console": self.log_console,
|
"log_console": self.log_console,
|
||||||
"capture_network_requests": self.capture_network_requests,
|
"capture_network_requests": self.capture_network_requests,
|
||||||
|
|||||||
@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
except Error:
|
except Error:
|
||||||
visibility_info = await self.check_visibility(page)
|
visibility_info = await self.check_visibility(page)
|
||||||
|
|
||||||
if self.browser_config.config.verbose:
|
if self.browser_config.verbose:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
message="Body visibility info: {info}",
|
message="Body visibility info: {info}",
|
||||||
tag="DEBUG",
|
tag="DEBUG",
|
||||||
|
|||||||
@@ -354,6 +354,7 @@ class AsyncWebCrawler:
|
|||||||
###############################################################
|
###############################################################
|
||||||
# Process the HTML content, Call CrawlerStrategy.process_html #
|
# Process the HTML content, Call CrawlerStrategy.process_html #
|
||||||
###############################################################
|
###############################################################
|
||||||
|
from urllib.parse import urlparse
|
||||||
crawl_result: CrawlResult = await self.aprocess_html(
|
crawl_result: CrawlResult = await self.aprocess_html(
|
||||||
url=url,
|
url=url,
|
||||||
html=html,
|
html=html,
|
||||||
@@ -364,6 +365,7 @@ class AsyncWebCrawler:
|
|||||||
verbose=config.verbose,
|
verbose=config.verbose,
|
||||||
is_raw_html=True if url.startswith("raw:") else False,
|
is_raw_html=True if url.startswith("raw:") else False,
|
||||||
redirected_url=async_response.redirected_url,
|
redirected_url=async_response.redirected_url,
|
||||||
|
original_scheme=urlparse(url).scheme,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -258,7 +258,11 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
normalized_href = normalize_url(href, url)
|
normalized_href = normalize_url(
|
||||||
|
href, url,
|
||||||
|
preserve_https=kwargs.get('preserve_https_for_internal_links', False),
|
||||||
|
original_scheme=kwargs.get('original_scheme')
|
||||||
|
)
|
||||||
link_data = {
|
link_data = {
|
||||||
"href": normalized_href,
|
"href": normalized_href,
|
||||||
"text": link.text_content().strip(),
|
"text": link.text_content().strip(),
|
||||||
|
|||||||
@@ -2146,7 +2146,9 @@ def normalize_url(
|
|||||||
drop_query_tracking=True,
|
drop_query_tracking=True,
|
||||||
sort_query=True,
|
sort_query=True,
|
||||||
keep_fragment=False,
|
keep_fragment=False,
|
||||||
extra_drop_params=None
|
extra_drop_params=None,
|
||||||
|
preserve_https=False,
|
||||||
|
original_scheme=None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Extended URL normalizer
|
Extended URL normalizer
|
||||||
@@ -2176,6 +2178,17 @@ def normalize_url(
|
|||||||
|
|
||||||
# Resolve relative paths first
|
# Resolve relative paths first
|
||||||
full_url = urljoin(base_url, href.strip())
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||||
|
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
# Parse once, edit parts, then rebuild
|
# Parse once, edit parts, then rebuild
|
||||||
parsed = urlparse(full_url)
|
parsed = urlparse(full_url)
|
||||||
@@ -2225,7 +2238,7 @@ def normalize_url(
|
|||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
|
|
||||||
def normalize_url_for_deep_crawl(href, base_url):
|
def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
|
||||||
"""Normalize URLs to ensure consistent format"""
|
"""Normalize URLs to ensure consistent format"""
|
||||||
from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
|
from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
|
||||||
|
|
||||||
@@ -2236,6 +2249,17 @@ def normalize_url_for_deep_crawl(href, base_url):
|
|||||||
# Use urljoin to handle relative URLs
|
# Use urljoin to handle relative URLs
|
||||||
full_url = urljoin(base_url, href.strip())
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||||
|
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
# Parse the URL for normalization
|
# Parse the URL for normalization
|
||||||
parsed = urlparse(full_url)
|
parsed = urlparse(full_url)
|
||||||
|
|
||||||
@@ -2273,7 +2297,7 @@ def normalize_url_for_deep_crawl(href, base_url):
|
|||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
@lru_cache(maxsize=10000)
|
@lru_cache(maxsize=10000)
|
||||||
def efficient_normalize_url_for_deep_crawl(href, base_url):
|
def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
|
||||||
"""Efficient URL normalization with proper parsing"""
|
"""Efficient URL normalization with proper parsing"""
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
@@ -2283,6 +2307,17 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
|
|||||||
# Resolve relative URLs
|
# Resolve relative URLs
|
||||||
full_url = urljoin(base_url, href.strip())
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||||
|
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
# Use proper URL parsing
|
# Use proper URL parsing
|
||||||
parsed = urlparse(full_url)
|
parsed = urlparse(full_url)
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import asyncio
|
|||||||
from typing import List, Tuple, Dict
|
from typing import List, Tuple, Dict
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
from datetime import datetime
|
from datetime import datetime, timezone
|
||||||
from base64 import b64encode
|
from base64 import b64encode
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
@@ -576,7 +576,7 @@ async def handle_crawl_job(
|
|||||||
task_id = f"crawl_{uuid4().hex[:8]}"
|
task_id = f"crawl_{uuid4().hex[:8]}"
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
|
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
|
||||||
"created_at": datetime.utcnow().isoformat(),
|
"created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
|
||||||
"url": json.dumps(urls), # store list as JSON string
|
"url": json.dumps(urls), # store list as JSON string
|
||||||
"result": "",
|
"result": "",
|
||||||
"error": "",
|
"error": "",
|
||||||
|
|||||||
@@ -155,6 +155,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
|
|||||||
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
|
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
|
||||||
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
|
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
|
||||||
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
|
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
|
||||||
|
| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |
|
||||||
|
|
||||||
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
|
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
|
||||||
|
|
||||||
|
|||||||
@@ -472,6 +472,17 @@ Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pag
|
|||||||
|
|
||||||
5. **Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
|
5. **Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
|
||||||
|
|
||||||
|
6. **Preserve HTTPS for security.** If crawling HTTPS sites that redirect to HTTP, use `preserve_https_for_internal_links=True` to maintain secure connections:
|
||||||
|
|
||||||
|
```python
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2),
|
||||||
|
preserve_https_for_internal_links=True # Keep HTTPS even if server redirects to HTTP
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
This is especially useful for security-conscious crawling or when dealing with sites that support both protocols.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 10. Summary & Next Steps
|
## 10. Summary & Next Steps
|
||||||
|
|||||||
@@ -102,16 +102,16 @@ async def smart_blog_crawler():
|
|||||||
|
|
||||||
# Step 2: Configure discovery - let's find all blog posts
|
# Step 2: Configure discovery - let's find all blog posts
|
||||||
config = SeedingConfig(
|
config = SeedingConfig(
|
||||||
source="sitemap", # Use the website's sitemap
|
source="sitemap+cc", # Use the website's sitemap+cc
|
||||||
pattern="*/blog/*.html", # Only blog posts
|
pattern="*/courses/*", # Only courses related posts
|
||||||
extract_head=True, # Get page metadata
|
extract_head=True, # Get page metadata
|
||||||
max_urls=100 # Limit for this example
|
max_urls=100 # Limit for this example
|
||||||
)
|
)
|
||||||
|
|
||||||
# Step 3: Discover URLs from the Python blog
|
# Step 3: Discover URLs from the Python blog
|
||||||
print("🔍 Discovering blog posts...")
|
print("🔍 Discovering course posts...")
|
||||||
urls = await seeder.urls("realpython.com", config)
|
urls = await seeder.urls("realpython.com", config)
|
||||||
print(f"✅ Found {len(urls)} blog posts")
|
print(f"✅ Found {len(urls)} course posts")
|
||||||
|
|
||||||
# Step 4: Filter for Python tutorials (using metadata!)
|
# Step 4: Filter for Python tutorials (using metadata!)
|
||||||
tutorials = [
|
tutorials = [
|
||||||
@@ -134,7 +134,8 @@ async def smart_blog_crawler():
|
|||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
only_text=True,
|
only_text=True,
|
||||||
word_count_threshold=300 # Only substantial articles
|
word_count_threshold=300, # Only substantial articles
|
||||||
|
stream=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract URLs and crawl them
|
# Extract URLs and crawl them
|
||||||
@@ -155,7 +156,7 @@ asyncio.run(smart_blog_crawler())
|
|||||||
|
|
||||||
**What just happened?**
|
**What just happened?**
|
||||||
|
|
||||||
1. We discovered all blog URLs from the sitemap
|
1. We discovered all blog URLs from the sitemap+cc
|
||||||
2. We filtered using metadata (no crawling needed!)
|
2. We filtered using metadata (no crawling needed!)
|
||||||
3. We crawled only the relevant tutorials
|
3. We crawled only the relevant tutorials
|
||||||
4. We saved tons of time and bandwidth
|
4. We saved tons of time and bandwidth
|
||||||
@@ -282,8 +283,8 @@ config = SeedingConfig(
|
|||||||
live_check=True, # Verify each URL is accessible
|
live_check=True, # Verify each URL is accessible
|
||||||
concurrency=20 # Check 20 URLs in parallel
|
concurrency=20 # Check 20 URLs in parallel
|
||||||
)
|
)
|
||||||
|
async with AsyncUrlSeeder() as seeder:
|
||||||
urls = await seeder.urls("example.com", config)
|
urls = await seeder.urls("example.com", config)
|
||||||
|
|
||||||
# Now you can filter by status
|
# Now you can filter by status
|
||||||
live_urls = [u for u in urls if u["status"] == "valid"]
|
live_urls = [u for u in urls if u["status"] == "valid"]
|
||||||
@@ -311,8 +312,8 @@ This is where URL seeding gets really powerful. Instead of crawling entire pages
|
|||||||
config = SeedingConfig(
|
config = SeedingConfig(
|
||||||
extract_head=True # Extract metadata from <head> section
|
extract_head=True # Extract metadata from <head> section
|
||||||
)
|
)
|
||||||
|
async with AsyncUrlSeeder() as seeder:
|
||||||
urls = await seeder.urls("example.com", config)
|
urls = await seeder.urls("example.com", config)
|
||||||
|
|
||||||
# Now each URL has rich metadata
|
# Now each URL has rich metadata
|
||||||
for url in urls[:3]:
|
for url in urls[:3]:
|
||||||
@@ -387,8 +388,8 @@ config = SeedingConfig(
|
|||||||
scoring_method="bm25",
|
scoring_method="bm25",
|
||||||
score_threshold=0.3
|
score_threshold=0.3
|
||||||
)
|
)
|
||||||
|
async with AsyncUrlSeeder() as seeder:
|
||||||
urls = await seeder.urls("example.com", config)
|
urls = await seeder.urls("example.com", config)
|
||||||
|
|
||||||
# URLs are scored based on:
|
# URLs are scored based on:
|
||||||
# 1. Domain parts matching (e.g., 'python' in python.example.com)
|
# 1. Domain parts matching (e.g., 'python' in python.example.com)
|
||||||
@@ -429,8 +430,8 @@ config = SeedingConfig(
|
|||||||
extract_head=True,
|
extract_head=True,
|
||||||
live_check=True
|
live_check=True
|
||||||
)
|
)
|
||||||
|
async with AsyncUrlSeeder() as seeder:
|
||||||
urls = await seeder.urls("blog.example.com", config)
|
urls = await seeder.urls("blog.example.com", config)
|
||||||
|
|
||||||
# Analyze the results
|
# Analyze the results
|
||||||
for url in urls[:5]:
|
for url in urls[:5]:
|
||||||
@@ -488,8 +489,8 @@ config = SeedingConfig(
|
|||||||
scoring_method="bm25", # Use BM25 algorithm
|
scoring_method="bm25", # Use BM25 algorithm
|
||||||
score_threshold=0.3 # Minimum relevance score
|
score_threshold=0.3 # Minimum relevance score
|
||||||
)
|
)
|
||||||
|
async with AsyncUrlSeeder() as seeder:
|
||||||
urls = await seeder.urls("realpython.com", config)
|
urls = await seeder.urls("realpython.com", config)
|
||||||
|
|
||||||
# Results are automatically sorted by relevance!
|
# Results are automatically sorted by relevance!
|
||||||
for url in urls[:5]:
|
for url in urls[:5]:
|
||||||
@@ -511,8 +512,8 @@ config = SeedingConfig(
|
|||||||
score_threshold=0.5,
|
score_threshold=0.5,
|
||||||
max_urls=20
|
max_urls=20
|
||||||
)
|
)
|
||||||
|
async with AsyncUrlSeeder() as seeder:
|
||||||
urls = await seeder.urls("docs.example.com", config)
|
urls = await seeder.urls("docs.example.com", config)
|
||||||
|
|
||||||
# The highest scoring URLs will be API docs!
|
# The highest scoring URLs will be API docs!
|
||||||
```
|
```
|
||||||
@@ -529,8 +530,8 @@ config = SeedingConfig(
|
|||||||
score_threshold=0.4,
|
score_threshold=0.4,
|
||||||
pattern="*/product/*" # Combine with pattern matching
|
pattern="*/product/*" # Combine with pattern matching
|
||||||
)
|
)
|
||||||
|
async with AsyncUrlSeeder() as seeder:
|
||||||
urls = await seeder.urls("shop.example.com", config)
|
urls = await seeder.urls("shop.example.com", config)
|
||||||
|
|
||||||
# Filter further by price (from metadata)
|
# Filter further by price (from metadata)
|
||||||
affordable = [
|
affordable = [
|
||||||
@@ -550,8 +551,8 @@ config = SeedingConfig(
|
|||||||
scoring_method="bm25",
|
scoring_method="bm25",
|
||||||
score_threshold=0.35
|
score_threshold=0.35
|
||||||
)
|
)
|
||||||
|
async with AsyncUrlSeeder() as seeder:
|
||||||
urls = await seeder.urls("technews.com", config)
|
urls = await seeder.urls("technews.com", config)
|
||||||
|
|
||||||
# Filter by date
|
# Filter by date
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
@@ -591,8 +592,8 @@ for query in queries:
|
|||||||
score_threshold=0.4,
|
score_threshold=0.4,
|
||||||
max_urls=10 # Top 10 per topic
|
max_urls=10 # Top 10 per topic
|
||||||
)
|
)
|
||||||
|
async with AsyncUrlSeeder() as seeder:
|
||||||
urls = await seeder.urls("learning-platform.com", config)
|
urls = await seeder.urls("learning-platform.com", config)
|
||||||
all_tutorials.extend(urls)
|
all_tutorials.extend(urls)
|
||||||
|
|
||||||
# Remove duplicates while preserving order
|
# Remove duplicates while preserving order
|
||||||
@@ -625,7 +626,8 @@ config = SeedingConfig(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Returns a dictionary: {domain: [urls]}
|
# Returns a dictionary: {domain: [urls]}
|
||||||
results = await seeder.many_urls(domains, config)
|
async with AsyncUrlSeeder() as seeder:
|
||||||
|
results = await seeder.many_urls(domains, config)
|
||||||
|
|
||||||
# Process results
|
# Process results
|
||||||
for domain, urls in results.items():
|
for domain, urls in results.items():
|
||||||
@@ -654,8 +656,8 @@ config = SeedingConfig(
|
|||||||
pattern="*/blog/*",
|
pattern="*/blog/*",
|
||||||
max_urls=100
|
max_urls=100
|
||||||
)
|
)
|
||||||
|
async with AsyncUrlSeeder() as seeder:
|
||||||
results = await seeder.many_urls(competitors, config)
|
results = await seeder.many_urls(competitors, config)
|
||||||
|
|
||||||
# Analyze content types
|
# Analyze content types
|
||||||
for domain, urls in results.items():
|
for domain, urls in results.items():
|
||||||
@@ -690,8 +692,8 @@ config = SeedingConfig(
|
|||||||
score_threshold=0.3,
|
score_threshold=0.3,
|
||||||
max_urls=20 # Per site
|
max_urls=20 # Per site
|
||||||
)
|
)
|
||||||
|
async with AsyncUrlSeeder() as seeder:
|
||||||
results = await seeder.many_urls(educational_sites, config)
|
results = await seeder.many_urls(educational_sites, config)
|
||||||
|
|
||||||
# Find the best beginner tutorials
|
# Find the best beginner tutorials
|
||||||
all_tutorials = []
|
all_tutorials = []
|
||||||
@@ -731,8 +733,8 @@ config = SeedingConfig(
|
|||||||
score_threshold=0.5, # High threshold for relevance
|
score_threshold=0.5, # High threshold for relevance
|
||||||
max_urls=10
|
max_urls=10
|
||||||
)
|
)
|
||||||
|
async with AsyncUrlSeeder() as seeder:
|
||||||
results = await seeder.many_urls(news_sites, config)
|
results = await seeder.many_urls(news_sites, config)
|
||||||
|
|
||||||
# Collect all mentions
|
# Collect all mentions
|
||||||
mentions = []
|
mentions = []
|
||||||
|
|||||||
175
tests/test_preserve_https_for_internal_links.py
Normal file
175
tests/test_preserve_https_for_internal_links.py
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Final test and demo for HTTPS preservation feature (Issue #1410)
|
||||||
|
|
||||||
|
This demonstrates how the preserve_https_for_internal_links flag
|
||||||
|
prevents HTTPS downgrade when servers redirect to HTTP.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
|
def demonstrate_issue():
    """Show the problem: HTTPS -> HTTP redirect causes HTTP links"""
    banner = "=" * 60
    print(banner)
    print("DEMONSTRATING THE ISSUE")
    print(banner)

    # A crawl starts on an HTTPS page, but the server answers with an
    # HTTP redirect, so the effective base URL for link resolution is HTTP.
    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"  # Server redirects to HTTP
    relative_link = "/author/Albert-Einstein"

    # Plain urljoin() resolves against the redirected (HTTP) base,
    # so the internal link silently loses its HTTPS scheme.
    resolved_url = urljoin(redirected_url, relative_link)

    for label, value in (
        ("Original URL", original_url),
        ("Redirected to", redirected_url),
        ("Relative link", relative_link),
        ("Resolved link", resolved_url),
    ):
        print(f"{label}: {value}")
    print("\n❌ Problem: Link is now HTTP instead of HTTPS!")

    return resolved_url
|
||||||
|
|
||||||
|
def demonstrate_solution():
    """Show the solution: preserve HTTPS for internal links"""
    banner = "=" * 60
    print("\n" + banner)
    print("DEMONSTRATING THE SOLUTION")
    print(banner)

    def _normalize(href, base_url, preserve_https=False, original_scheme=None):
        """Normalize URL with optional HTTPS preservation"""
        # Standard relative-URL resolution against the (possibly HTTP) base.
        resolved = urljoin(base_url, href.strip())

        if preserve_https and original_scheme == 'https':
            full_parts = urlparse(resolved)
            base_parts = urlparse(base_url)
            # Upgrade only same-host links that came back as plain HTTP.
            if full_parts.scheme == 'http' and full_parts.netloc == base_parts.netloc:
                resolved = resolved.replace('http://', 'https://', 1)
                print(f"   → Preserved HTTPS for {full_parts.netloc}")

        return resolved

    # Same scenario as before: HTTPS crawl, server redirects to HTTP.
    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"
    relative_link = "/author/Albert-Einstein"

    # Current behavior: the HTTP base wins and the link is downgraded.
    resolved_without = _normalize(
        relative_link, redirected_url,
        preserve_https=False, original_scheme='https'
    )
    print("\nWithout preservation:")
    print(f"  Result: {resolved_without}")

    # New feature: the original HTTPS scheme is restored for internal links.
    resolved_with = _normalize(
        relative_link, redirected_url,
        preserve_https=True, original_scheme='https'
    )
    print("\nWith preservation (preserve_https_for_internal_links=True):")
    print(f"  Result: {resolved_with}")
    print("\n✅ Solution: Internal link stays HTTPS!")

    return resolved_with
|
||||||
|
|
||||||
|
def test_edge_cases():
    """Test important edge cases of the HTTPS-preservation logic.

    Each case resolves a link exactly the way the crawler does and checks
    whether the result keeps (or correctly does not keep) HTTPS. Unlike the
    original version, which only *printed* ✅/❌ markers, failures are now
    collected and asserted at the end so pytest actually reports a broken
    case instead of silently passing.
    """

    print("\n" + "=" * 60)
    print("EDGE CASES")
    print("=" * 60)

    def preserve_https(href, base_url, original_scheme):
        """Helper replicating the preservation logic under test."""
        full_url = urljoin(base_url, href)

        if original_scheme == 'https':
            parsed_full = urlparse(full_url)
            parsed_base = urlparse(base_url)
            # Protocol-relative hrefs (//host/...) must follow the base
            # URL's scheme, so they are explicitly excluded from upgrade.
            if (parsed_full.scheme == 'http' and
                    parsed_full.netloc == parsed_base.netloc and
                    not href.strip().startswith('//')):
                full_url = full_url.replace('http://', 'https://', 1)

        return full_url

    test_cases = [
        # (description, href, base_url, original_scheme, should_be_https)
        ("External link", "http://other.com/page", "http://example.com", "https", False),
        ("Already HTTPS", "/page", "https://example.com", "https", True),
        ("No original HTTPS", "/page", "http://example.com", "http", False),
        ("Subdomain", "/page", "http://sub.example.com", "https", True),
        ("Protocol-relative", "//example.com/page", "http://example.com", "https", False),
    ]

    failures = []
    for desc, href, base_url, orig_scheme, should_be_https in test_cases:
        result = preserve_https(href, base_url, orig_scheme)
        is_https = result.startswith('https://')
        status = "✅" if is_https == should_be_https else "❌"
        if is_https != should_be_https:
            failures.append(desc)

        print(f"\n{status} {desc}:")
        print(f"  Input: {href} + {base_url}")
        print(f"  Result: {result}")
        print(f"  Expected HTTPS: {should_be_https}, Got: {is_https}")

    # Fail loudly under pytest instead of only printing ❌ markers.
    assert not failures, f"HTTPS preservation failed for: {failures}"
|
||||||
|
|
||||||
|
def usage_example():
    """Show how to use the feature in crawl4ai"""
    banner = "=" * 60
    print("\n" + banner)
    print("USAGE IN CRAWL4AI")
    print(banner)

    # One multi-line literal keeps the printed guide readable in the source.
    print("""
To enable HTTPS preservation in your crawl4ai code:

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async with AsyncWebCrawler() as crawler:
    config = CrawlerRunConfig(
        preserve_https_for_internal_links=True  # Enable HTTPS preservation
    )

    result = await crawler.arun(
        url="https://example.com",
        config=config
    )

    # All internal links will maintain HTTPS even if
    # the server redirects to HTTP
```

This is especially useful for:
- Sites that redirect HTTPS to HTTP but still support HTTPS
- Security-conscious crawling where you want to stay on HTTPS
- Avoiding mixed content issues in downstream processing
""")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the demo end-to-end: show the bug, the fix, the edge cases,
    # and how to enable the flag in crawl4ai.
    for step in (demonstrate_issue, demonstrate_solution, test_edge_cases, usage_example):
        step()

    closing = "=" * 60
    print("\n" + closing)
    print("✅ All tests complete!")
    print(closing)
|
||||||
Reference in New Issue
Block a user