feat(deep-crawling): improve URL normalization and domain filtering
Enhance URL handling in deep crawling with:

- New URL normalization functions for consistent URL formats
- Improved domain filtering with subdomain support
- Added URLPatternFilter to public API
- Better URL deduplication in BFS strategy

These changes improve crawling accuracy and reduce duplicate visits.
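
A minimal sketch of the deduplication idea. The _normalize helper below is a simplified, illustrative stand-in for the normalization added in this commit, not its actual code:

    from urllib.parse import urljoin, urlparse, urlunparse

    def _normalize(href, base_url):
        # Resolve relative links, lowercase the host, trim the trailing
        # slash, and drop the fragment.
        parsed = urlparse(urljoin(base_url, href.strip()))
        return urlunparse((parsed.scheme, parsed.netloc.lower(),
                           parsed.path.rstrip('/') or '/',
                           parsed.params, parsed.query, ''))

    # All three variants collapse to a single frontier entry.
    seen = set()
    for href in ("/docs", "/docs/", "https://Example.com/docs#intro"):
        seen.add(_normalize(href, "https://example.com"))
    assert seen == {"https://example.com/docs"}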
@@ -1,5 +1,4 @@
import time
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
import json
@@ -33,6 +32,8 @@ import hashlib

from urllib.robotparser import RobotFileParser
import aiohttp
from urllib.parse import urlparse, urlunparse
from functools import lru_cache

from packaging import version
from . import __version__
@@ -1962,6 +1963,82 @@ def normalize_url(href, base_url):
    return normalized


def normalize_url_for_deep_crawl(href, base_url):
    """Normalize URLs to ensure consistent format"""
    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode

    # Handle None or empty values
    if not href:
        return None

    # Use urljoin to handle relative URLs
    full_url = urljoin(base_url, href.strip())

    # Parse the URL for normalization
    parsed = urlparse(full_url)

    # Convert hostname to lowercase
    netloc = parsed.netloc.lower()

    # Remove fragment entirely
    fragment = ''

    # Normalize query parameters if needed
    query = parsed.query
    if query:
        # Parse query parameters
        params = parse_qs(query)

        # Remove tracking parameters (example - customize as needed)
        tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid']
        for param in tracking_params:
            if param in params:
                del params[param]

        # Rebuild query string, sorted by key for consistency
        query = urlencode(sorted(params.items()), doseq=True) if params else ''

    # Build normalized URL
    normalized = urlunparse((
        parsed.scheme,
        netloc,
        parsed.path.rstrip('/') or '/',  # Normalize trailing slash
        parsed.params,
        query,
        fragment
    ))

    return normalized
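
# Illustrative only, not part of the commit: all three calls below return
# the same string, which is what lets the crawl frontier treat them as one
# URL instead of three.
#
#   normalize_url_for_deep_crawl("https://Example.com/docs/?utm_source=x#top",
#                                "https://example.com")
#   normalize_url_for_deep_crawl("/docs", "https://example.com")
#   normalize_url_for_deep_crawl("docs/", "https://example.com/")
#   # each -> 'https://example.com/docs'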


@lru_cache(maxsize=10000)
def efficient_normalize_url_for_deep_crawl(href, base_url):
    """Efficient URL normalization with proper parsing"""
    from urllib.parse import urljoin

    if not href:
        return None

    # Resolve relative URLs
    full_url = urljoin(base_url, href.strip())

    # Use proper URL parsing (urlparse/urlunparse come from the
    # module-level urllib.parse import added in this commit)
    parsed = urlparse(full_url)

    # Only perform the most critical normalizations:
    # 1. Lowercase hostname
    # 2. Remove fragment
    normalized = urlunparse((
        parsed.scheme,
        parsed.netloc.lower(),
        parsed.path,
        parsed.params,
        parsed.query,
        ''  # Remove fragment
    ))

    return normalized
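
# Illustrative only, not part of the commit: @lru_cache memoizes on the
# (href, base_url) pair, so a link that appears on many pages of the same
# site is normalized once and then served from the cache.
#
#   efficient_normalize_url_for_deep_crawl("/a#x", "https://example.com")
#   efficient_normalize_url_for_deep_crawl("/a#x", "https://example.com")
#   efficient_normalize_url_for_deep_crawl.cache_info()
#   # -> CacheInfo(hits=1, misses=1, maxsize=10000, currsize=1)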


def normalize_url_tmp(href, base_url):
    """Normalize URLs to ensure consistent format"""
    # Extract protocol and domain from base URL