@@ -1,849 +0,0 @@
#!/usr/bin/env python3
"""
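Comprehensive test suite for the URL normalization functions in utils.py.
Tests common scenarios and edge cases for the updated normalize_url functions.
The normalize_url variants are exercised through local copies defined in this
file; get_base_domain and is_external_url are imported from crawl4ai.utils.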
"""
import sys
import os
import time
from pathlib import Path
from urllib . parse import urljoin , urlparse , urlunparse , parse_qsl , urlencode
# Add the crawl4ai package to the path
sys . path . insert ( 0 , str ( Path ( __file__ ) . parent . parent ) )
# Import only the specific functions we need to test
from crawl4ai . utils import get_base_domain , is_external_url
# ANSI Color codes for beautiful console output
class Colors:
    """ANSI escape sequences and icon glyphs used to decorate console output."""

    # --- Foreground colors ---
    RED = "\033[91m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    BLUE = "\033[94m"
    MAGENTA = "\033[95m"
    CYAN = "\033[96m"
    WHITE = "\033[97m"

    # --- Foreground colors combined with the bold attribute ---
    BRIGHT_RED = "\033[91;1m"
    BRIGHT_GREEN = "\033[92;1m"
    BRIGHT_YELLOW = "\033[93;1m"
    BRIGHT_BLUE = "\033[94;1m"
    BRIGHT_MAGENTA = "\033[95;1m"
    BRIGHT_CYAN = "\033[96;1m"
    BRIGHT_WHITE = "\033[97;1m"

    # --- Background colors ---
    BG_RED = "\033[41m"
    BG_GREEN = "\033[42m"
    BG_YELLOW = "\033[43m"
    BG_BLUE = "\033[44m"

    # --- Text attributes; RESET clears every color and attribute ---
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
    RESET = "\033[0m"

    # --- Status and decoration glyphs ---
    CHECK = "✓"
    CROSS = "✗"
    WARNING = "⚠"
    INFO = "ℹ"
    STAR = "⭐"
    FIRE = "🔥"
    ROCKET = "🚀"
    TARGET = "🎯"
def colorize(text, color):
    """Return *text* prefixed with *color* and followed by the reset sequence."""
    return "{}{}{}".format(color, text, Colors.RESET)
def print_header(title, icon=""):
    """Print *title* as an 80-column banner framed by two rule lines.

    Args:
        title: Text shown in the middle of the banner.
        icon: Optional glyph placed before the title, separated by one space.
    """
    width = 80
    banner_style = f"{Colors.BG_BLUE}{Colors.WHITE}{Colors.BOLD}"
    rule = f"{banner_style}{'=' * width}{Colors.RESET}"
    label = f"{icon} {title}" if icon else title

    print(f"\n{rule}")
    # str.center pads to exactly `width` characters. Padding both sides with
    # the same floor-divided amount would leave the row one column short
    # whenever the leftover space is odd, giving the colored background a
    # ragged right edge compared with the rule lines.
    print(f"{banner_style}{label.center(width)}{Colors.RESET}")
    print(rule)
def print_section(title, icon=""):
    """Print a section heading (optionally led by *icon*) with a dashed underline."""
    # The underline is as long as the visible heading text, icon included.
    heading = f"{icon} {title}" if icon else title
    print(f"\n{Colors.CYAN}{Colors.BOLD}{heading}{Colors.RESET}")
    print(f"{Colors.CYAN}{'-' * len(heading)}{Colors.RESET}")
def print_success(message):
    """Print *message* in green, led by the check-mark glyph."""
    line = "{}{} {}{}".format(Colors.GREEN, Colors.CHECK, message, Colors.RESET)
    print(line)
def print_error(message):
    """Print *message* in red, led by the cross glyph."""
    line = "{}{} {}{}".format(Colors.RED, Colors.CROSS, message, Colors.RESET)
    print(line)
def print_warning(message):
    """Print *message* in yellow, led by the warning glyph."""
    line = "{}{} {}{}".format(Colors.YELLOW, Colors.WARNING, message, Colors.RESET)
    print(line)
def print_info(message):
    """Print *message* in blue, led by the info glyph."""
    line = "{}{} {}{}".format(Colors.BLUE, Colors.INFO, message, Colors.RESET)
    print(line)
def print_test_result(test_name, passed, expected=None, actual=None):
    """Print a one-line pass/fail result, with details for failures.

    Args:
        test_name: Label of the check being reported.
        passed: True prints a green check line; False prints a red cross line.
        expected: Expected value, shown only for failures.
        actual: Actual value, shown only for failures.
    """
    if passed:
        print(f"{Colors.GREEN}{Colors.CHECK} {test_name}{Colors.RESET}")
        return

    print(f"{Colors.RED}{Colors.CROSS} {test_name}{Colors.RESET}")
    # Show the details as soon as EITHER value was supplied. Requiring both to
    # be non-None hid the details exactly when they matter most: a failing
    # "should be None" check (expected is None) or a function that
    # unexpectedly returned None (actual is None).
    if expected is not None or actual is not None:
        print(f"{Colors.BRIGHT_RED}Expected: {expected}{Colors.RESET}")
        print(f"{Colors.BRIGHT_RED}Actual: {actual}{Colors.RESET}")
def print_progress(current, total, test_name=""):
    """Redraw a single-line progress bar for step *current* of *total*."""
    bar_length = 40
    filled = int(bar_length * current // total)
    percentage = (current / total) * 100
    bar = "".join(('█' * filled, '░' * (bar_length - filled)))

    # The leading carriage return makes each call overwrite the previous bar.
    line = (
        f'\r{Colors.CYAN}Progress: [{bar}] {percentage:.1f}% '
        f'({current}/{total}) {test_name}{Colors.RESET}'
    )
    sys.stdout.write(line)
    sys.stdout.flush()

    if current == total:
        # Finished: move to a fresh line so later output is not overwritten.
        print()
# Copy the normalize_url functions directly to avoid import issues
def normalize_url(
    href: str,
    base_url: str,
    *,
    drop_query_tracking=True,
    sort_query=True,
    keep_fragment=False,
    extra_drop_params=None,
    preserve_https=False,
    original_scheme=None
):
    """
    Extended URL normalizer with fixes for edge cases - copied from utils.py for testing.

    Resolves ``href`` against ``base_url`` and rebuilds it in a canonical form:
    the host is lowercased, default ports (80 for http, 443 for https) are
    dropped, trailing slashes are stripped from non-root paths, query keys are
    lowercased, tracking parameters are optionally removed, the query is
    optionally sorted by key, and the fragment is dropped unless requested.

    Args:
        href: Link to normalize; may be absolute, relative or protocol-relative.
        base_url: URL that relative links are resolved against.
        drop_query_tracking: Remove well-known tracking parameters (utm_*,
            gclid, fbclid, ref, ref_src) plus ``extra_drop_params``.
        sort_query: Sort query parameters by key (stable for equal keys).
        keep_fragment: Keep the ``#fragment`` instead of dropping it.
        extra_drop_params: Additional parameter names to drop; matched
            case-insensitively.
        preserve_https: Together with ``original_scheme == 'https'``, upgrade
            same-host ``http`` links to ``https``.
        original_scheme: Scheme of the originally requested URL.

    Returns:
        The normalized URL string, or ``None`` if ``href`` is empty or
        whitespace-only.
    """
    if not href or not href.strip():
        return None
    href = href.strip()

    # Resolve relative paths first
    full_url = urljoin(base_url, href)

    # Preserve HTTPS if requested and original scheme was HTTPS
    if preserve_https and original_scheme == 'https':
        parsed_full = urlparse(full_url)
        parsed_base = urlparse(base_url)
        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
        if (parsed_full.scheme == 'http'
                and parsed_full.netloc == parsed_base.netloc
                and not href.startswith('//')):
            full_url = full_url.replace('http://', 'https://', 1)

    # Parse once, edit parts, then rebuild
    parsed = urlparse(full_url)

    # ── netloc ──
    # Only the host[:port] part is case-insensitive; any "user:password@"
    # prefix is kept verbatim so credentials are not altered.
    userinfo, at_sign, hostport = parsed.netloc.rpartition('@')
    hostport = hostport.lower()
    # Remove default ports
    host, colon, port = hostport.rpartition(':')
    if colon and (
        (parsed.scheme == 'http' and port == '80')
        or (parsed.scheme == 'https' and port == '443')
    ):
        hostport = host
    netloc = f"{userinfo}{at_sign}{hostport}"

    # ── path ──
    # Strip trailing "/" (except root). Duplicate slashes inside the path are
    # left untouched.
    # IMPORTANT: Don't use quote(unquote()) as it mangles + signs in URLs
    # The path from urlparse is already properly encoded
    path = parsed.path
    if path.endswith('/') and path != '/':
        # A path made only of slashes collapses to the root, not to an empty path.
        path = path.rstrip('/') or '/'

    # ── query ──
    query = parsed.query
    if query:
        # Explode into (key, value) pairs, preserving blank values. Keys are
        # lowercased BEFORE filtering so that tracking parameters are matched
        # case-insensitively (e.g. "UTM_Source" is dropped like "utm_source").
        params = [
            (key.lower(), value)
            for key, value in parse_qsl(query, keep_blank_values=True)
        ]

        if drop_query_tracking:
            # Default tracking parameters to remove for cleaner URLs
            tracking = {
                'utm_source', 'utm_medium', 'utm_campaign', 'utm_term',
                'utm_content', 'gclid', 'fbclid', 'ref', 'ref_src',
            }
            if extra_drop_params:
                tracking |= {p.lower() for p in extra_drop_params}
            params = [(key, value) for key, value in params if key not in tracking]

        if sort_query:
            # Stable sort by (lowercased) key; values keep their relative order
            params.sort(key=lambda kv: kv[0])

        query = urlencode(params, doseq=True) if params else ''

    # ── fragment ──
    fragment = parsed.fragment if keep_fragment else ''

    # Re-assemble
    return urlunparse((
        parsed.scheme,
        netloc,
        path,
        parsed.params,
        query,
        fragment,
    ))
def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
    """Normalize URLs for deep crawling - copied from utils.py for testing.

    Resolves *href* against *base_url*, lowercases the network location, drops
    the fragment, removes a fixed list of tracking parameters (remaining
    parameters keep their original order) and strips trailing slashes from the
    path. Returns ``None`` for a falsy *href*.
    """
    if not href:
        return None

    link = href.strip()
    # urljoin turns relative links into absolute ones.
    absolute = urljoin(base_url, link)

    # Upgrade http -> https only when asked to, only for links on the same
    # network location as the base URL, and never for protocol-relative links
    # ("//host/..."), which follow the base URL's scheme.
    if preserve_https and original_scheme == 'https':
        target = urlparse(absolute)
        same_host = target.netloc == urlparse(base_url).netloc
        if target.scheme == 'http' and same_host and not link.startswith('//'):
            absolute = absolute.replace('http://', 'https://', 1)

    parts = urlparse(absolute)

    cleaned_query = parts.query
    if cleaned_query:
        # Tracking parameters to discard (example - customize as needed).
        ignored = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid']
        kept = [pair for pair in parse_qsl(cleaned_query) if pair[0] not in ignored]
        cleaned_query = urlencode(kept, doseq=True) if kept else ''

    return urlunparse((
        parts.scheme,
        parts.netloc.lower(),    # network location is lowercased
        parts.path.rstrip('/'),  # trailing slashes are removed
        parts.params,
        cleaned_query,
        '',                      # fragment is always dropped
    ))
def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
    """Efficient URL normalization with proper parsing - copied from utils.py for testing.

    Performs only the cheapest normalizations: resolves *href* against
    *base_url*, lowercases the network location, strips trailing slashes from
    the path and drops the fragment. The query string is passed through
    unchanged. Returns ``None`` for a falsy *href*.
    """
    if not href:
        return None

    link = href.strip()
    absolute = urljoin(base_url, link)

    # Upgrade http -> https only when asked to, only for links on the same
    # network location as the base URL, and never for protocol-relative links
    # ("//host/..."), which follow the base URL's scheme.
    if preserve_https and original_scheme == 'https':
        target = urlparse(absolute)
        same_host = target.netloc == urlparse(base_url).netloc
        if target.scheme == 'http' and same_host and not link.startswith('//'):
            absolute = absolute.replace('http://', 'https://', 1)

    scheme, netloc, path, params, query, _fragment = urlparse(absolute)
    return urlunparse((scheme, netloc.lower(), path.rstrip('/'), params, query, ''))
class URLNormalizationTestSuite:
    """Comprehensive test suite for URL normalization functions.

    Each ``test_*`` method opens a named section with ``start_section`` and
    reports its checks through ``assert_equal`` / ``assert_none`` /
    ``assert_raises``. Those helpers update the overall and per-section
    counters and print one pass/fail line per check. ``run_all_tests`` runs
    every section and prints a summary.
    """

    def __init__(self):
        """Set up the base URLs used by the tests and zero all counters."""
        self.base_url = "https://example.com/path/page.html"
        self.https_base_url = "https://example.com/path/page.html"
        self.http_base_url = "http://example.com/path/page.html"
        self.tests_run = 0
        self.tests_passed = 0
        # One dict per failed check: 'name', 'expected', 'actual', 'section'.
        self.tests_failed = []
        # Set by run_all_tests() right before the first section starts.
        self.test_start_time = None
        # Section name -> {'run': int, 'passed': int, 'failed': int}.
        self.section_stats = {}
        self.current_section = None

    def start_section(self, section_name, icon=""):
        """Start a new test section and print its heading."""
        self.current_section = section_name
        if section_name not in self.section_stats:
            self.section_stats[section_name] = {'run': 0, 'passed': 0, 'failed': 0}
        print_section(section_name, icon)

    def assert_equal(self, actual, expected, test_name):
        """Record and print whether ``actual == expected``."""
        self.tests_run += 1
        if self.current_section:
            self.section_stats[self.current_section]['run'] += 1

        if actual == expected:
            self.tests_passed += 1
            if self.current_section:
                self.section_stats[self.current_section]['passed'] += 1
            print_test_result(test_name, True)
        else:
            self.tests_failed.append({
                'name': test_name,
                'expected': expected,
                'actual': actual,
                'section': self.current_section
            })
            if self.current_section:
                self.section_stats[self.current_section]['failed'] += 1
            print_test_result(test_name, False, expected, actual)

    def assert_none(self, actual, test_name):
        """Assert that actual is None."""
        self.assert_equal(actual, None, test_name)

    def assert_raises(self, exc_type, func, test_name):
        """Assert that calling ``func()`` raises ``exc_type``.

        The outcome is recorded through ``assert_equal`` so it is counted in
        the statistics like every other check. Exceptions of any other type
        propagate to the caller.
        """
        try:
            func()
        except exc_type:
            raised = True
        else:
            raised = False
        self.assert_equal(raised, True, test_name)

    def test_basic_url_resolution(self):
        """Test basic relative and absolute URL resolution."""
        self.start_section("Basic URL Resolution", Colors.TARGET)

        # Absolute URLs should remain unchanged
        self.assert_equal(
            normalize_url("https://other.com/page.html", self.base_url),
            "https://other.com/page.html",
            "Absolute URL unchanged"
        )

        # Relative URLs
        self.assert_equal(
            normalize_url("relative.html", self.base_url),
            "https://example.com/path/relative.html",
            "Relative URL resolution"
        )
        self.assert_equal(
            normalize_url("./relative.html", self.base_url),
            "https://example.com/path/relative.html",
            "Relative URL with dot"
        )
        self.assert_equal(
            normalize_url("../relative.html", self.base_url),
            "https://example.com/relative.html",
            "Parent directory resolution"
        )

        # Root-relative URLs
        self.assert_equal(
            normalize_url("/root.html", self.base_url),
            "https://example.com/root.html",
            "Root-relative URL"
        )

        # Protocol-relative URLs
        self.assert_equal(
            normalize_url("//cdn.example.com/asset.js", self.base_url),
            "https://cdn.example.com/asset.js",
            "Protocol-relative URL"
        )

    def test_query_parameter_handling(self):
        """Test query parameter sorting and tracking removal."""
        self.start_section("Query Parameter Handling", Colors.STAR)

        # Basic query parameters
        self.assert_equal(
            normalize_url("https://example.com?page=1&sort=name", self.base_url),
            "https://example.com?page=1&sort=name",
            "Basic query parameters sorted"
        )

        # Tracking parameters removal
        self.assert_equal(
            normalize_url("https://example.com?utm_source=google&utm_medium=email&page=1", self.base_url),
            "https://example.com?page=1",
            "Tracking parameters removed"
        )

        # Mixed tracking and valid parameters
        self.assert_equal(
            normalize_url("https://example.com?fbclid=123&utm_campaign=test&category=news&id=456", self.base_url),
            "https://example.com?category=news&id=456",
            "Mixed tracking and valid parameters"
        )

        # Empty query values
        self.assert_equal(
            normalize_url("https://example.com?page=&sort=name", self.base_url),
            "https://example.com?page=&sort=name",
            "Empty query values preserved"
        )

        # Disable tracking removal
        self.assert_equal(
            normalize_url("https://example.com?utm_source=google&page=1", self.base_url, drop_query_tracking=False),
            "https://example.com?page=1&utm_source=google",
            "Tracking parameters preserved when disabled"
        )

        # Disable sorting
        self.assert_equal(
            normalize_url("https://example.com?z=1&a=2", self.base_url, sort_query=False),
            "https://example.com?z=1&a=2",
            "Query parameters not sorted when disabled"
        )

    def test_fragment_handling(self):
        """Test fragment/hash handling."""
        self.start_section("Fragment Handling", Colors.FIRE)

        # Fragments removed by default
        self.assert_equal(
            normalize_url("https://example.com/page.html#section", self.base_url),
            "https://example.com/page.html",
            "Fragment removed by default"
        )

        # Fragments preserved when requested
        self.assert_equal(
            normalize_url("https://example.com/page.html#section", self.base_url, keep_fragment=True),
            "https://example.com/page.html#section",
            "Fragment preserved when requested"
        )

        # Fragments with query parameters
        self.assert_equal(
            normalize_url("https://example.com?page=1#section", self.base_url, keep_fragment=True),
            "https://example.com?page=1#section",
            "Fragment with query parameters"
        )

    def test_https_preservation(self):
        """Test HTTPS preservation logic."""
        self.start_section("HTTPS Preservation", Colors.ROCKET)

        # Same domain HTTP to HTTPS
        self.assert_equal(
            normalize_url("http://example.com/page.html", self.https_base_url, preserve_https=True, original_scheme='https'),
            "https://example.com/page.html",
            "HTTP to HTTPS for same domain"
        )

        # Different domain should not change
        self.assert_equal(
            normalize_url("http://other.com/page.html", self.https_base_url, preserve_https=True, original_scheme='https'),
            "http://other.com/page.html",
            "Different domain HTTP unchanged"
        )

        # Protocol-relative should follow base
        self.assert_equal(
            normalize_url("//example.com/page.html", self.https_base_url, preserve_https=True, original_scheme='https'),
            "https://example.com/page.html",
            "Protocol-relative follows base scheme"
        )

    def test_edge_cases(self):
        """Test edge cases and error conditions."""
        self.start_section("Edge Cases", Colors.WARNING)

        # None and empty inputs
        result = normalize_url(None, self.base_url)  # type: ignore
        self.assert_none(result, "None input")
        self.assert_none(normalize_url("", self.base_url), "Empty string input")
        self.assert_none(normalize_url("   ", self.base_url), "Whitespace only input")

        # Malformed URLs. The check goes through the suite's counters instead
        # of a bare print, so a failure shows up in the summary and in the
        # value returned by run_all_tests().
        # NOTE(review): the normalize_url copy defined in this file has no
        # explicit base-URL validation - confirm this expectation against the
        # behavior of crawl4ai.utils.normalize_url.
        self.assert_raises(
            ValueError,
            lambda: normalize_url("not-a-url", "invalid-base"),
            "ValueError raised for invalid base URL"
        )

        # Special protocols
        self.assert_equal(
            normalize_url("mailto:test@example.com", self.base_url),
            "mailto:test@example.com",
            "Mailto protocol preserved"
        )
        self.assert_equal(
            normalize_url("tel:+1234567890", self.base_url),
            "tel:+1234567890",
            "Tel protocol preserved"
        )
        self.assert_equal(
            normalize_url("javascript:void(0)", self.base_url),
            "javascript:void(0)",
            "JavaScript protocol preserved"
        )

    def test_case_sensitivity(self):
        """Test case sensitivity handling."""
        self.start_section("Case Sensitivity", Colors.INFO)

        # Domain case normalization
        self.assert_equal(
            normalize_url("https://EXAMPLE.COM/page.html", self.base_url),
            "https://example.com/page.html",
            "Domain case normalization"
        )

        # Mixed case paths
        self.assert_equal(
            normalize_url("https://example.com/PATH/Page.HTML", self.base_url),
            "https://example.com/PATH/Page.HTML",
            "Path case preserved"
        )

        # Query parameter case
        self.assert_equal(
            normalize_url("https://example.com?PARAM=value", self.base_url),
            "https://example.com?param=value",
            "Query parameter case normalization"
        )

    def test_unicode_and_special_chars(self):
        """Test Unicode and special characters."""
        self.start_section("Unicode & Special Characters", "🌍")

        # Unicode in path
        self.assert_equal(
            normalize_url("https://example.com/café.html", self.base_url),
            "https://example.com/café.html",
            "Unicode characters in path"
        )

        # Encoded characters
        self.assert_equal(
            normalize_url("https://example.com/caf%C3%A9.html", self.base_url),
            "https://example.com/caf%C3%A9.html",
            "URL-encoded characters preserved"
        )

        # Spaces in URLs
        self.assert_equal(
            normalize_url("https://example.com/page with spaces.html", self.base_url),
            "https://example.com/page with spaces.html",
            "Spaces in URLs handled"
        )

    def test_port_numbers(self):
        """Test port number handling."""
        self.start_section("Port Numbers", "🔌")

        # Default ports
        self.assert_equal(
            normalize_url("https://example.com:443/page.html", self.base_url),
            "https://example.com/page.html",
            "Default HTTPS port removed"
        )
        self.assert_equal(
            normalize_url("http://example.com:80/page.html", self.base_url),
            "http://example.com/page.html",
            "Default HTTP port removed"
        )

        # Non-default ports
        self.assert_equal(
            normalize_url("https://example.com:8443/page.html", self.base_url),
            "https://example.com:8443/page.html",
            "Non-default port preserved"
        )

    def test_trailing_slashes(self):
        """Test trailing slash normalization."""
        self.start_section("Trailing Slashes", "📁")

        # Remove trailing slash from paths
        self.assert_equal(
            normalize_url("https://example.com/path/", self.base_url),
            "https://example.com/path",
            "Trailing slash removed from path"
        )

        # Preserve root trailing slash
        self.assert_equal(
            normalize_url("https://example.com/", self.base_url),
            "https://example.com/",
            "Root trailing slash preserved"
        )

        # Multiple trailing slashes
        self.assert_equal(
            normalize_url("https://example.com/path//", self.base_url),
            "https://example.com/path",
            "Multiple trailing slashes normalized"
        )

    def test_deep_crawl_functions(self):
        """Test deep crawl specific normalization functions."""
        self.start_section("Deep Crawl Functions", "🔍")

        # Test normalize_url_for_deep_crawl
        result = normalize_url_for_deep_crawl("https://EXAMPLE.COM/path/?utm_source=test&page=1", self.base_url)
        expected = "https://example.com/path?page=1"
        self.assert_equal(result, expected, "Deep crawl normalization")

        # Test efficient version
        result = efficient_normalize_url_for_deep_crawl("https://EXAMPLE.COM/path/#fragment", self.base_url)
        expected = "https://example.com/path"
        self.assert_equal(result, expected, "Efficient deep crawl normalization")

    def test_base_domain_extraction(self):
        """Test base domain extraction."""
        self.start_section("Base Domain Extraction", "🏠")

        self.assert_equal(
            get_base_domain("https://www.example.com/path"),
            "example.com",
            "WWW prefix removed"
        )
        self.assert_equal(
            get_base_domain("https://sub.example.co.uk/path"),
            "example.co.uk",
            "Special TLD handled"
        )
        self.assert_equal(
            get_base_domain("https://example.com:8080/path"),
            "example.com",
            "Port removed"
        )

    def test_external_url_detection(self):
        """Test external URL detection."""
        self.start_section("External URL Detection", "🌐")

        self.assert_equal(
            is_external_url("https://other.com/page.html", "example.com"),
            True,
            "Different domain is external"
        )
        self.assert_equal(
            is_external_url("https://www.example.com/page.html", "example.com"),
            False,
            "Same domain with www is internal"
        )
        self.assert_equal(
            is_external_url("mailto:test@example.com", "example.com"),
            True,
            "Special protocol is external"
        )

    def run_all_tests(self):
        """Run all test sections and print the summary.

        Returns:
            True if no check failed, False otherwise.
        """
        print_header("🚀 URL Normalization Test Suite", Colors.ROCKET)
        self.test_start_time = time.time()

        # (section name, icon, bound test method). Only the name and the
        # method are used by the loop below; each method passes its own
        # name and icon to start_section().
        sections = [
            ("Basic URL Resolution", Colors.TARGET, self.test_basic_url_resolution),
            ("Query Parameter Handling", Colors.STAR, self.test_query_parameter_handling),
            ("Fragment Handling", Colors.FIRE, self.test_fragment_handling),
            ("HTTPS Preservation", Colors.ROCKET, self.test_https_preservation),
            ("Edge Cases", Colors.WARNING, self.test_edge_cases),
            ("Case Sensitivity", Colors.INFO, self.test_case_sensitivity),
            ("Unicode & Special Characters", "🌍", self.test_unicode_and_special_chars),
            ("Port Numbers", "🔌", self.test_port_numbers),
            ("Trailing Slashes", "📁", self.test_trailing_slashes),
            ("Deep Crawl Functions", "🔍", self.test_deep_crawl_functions),
            ("Base Domain Extraction", "🏠", self.test_base_domain_extraction),
            ("External URL Detection", "🌐", self.test_external_url_detection),
        ]

        total_sections = len(sections)
        for i, (section_name, icon, test_method) in enumerate(sections, 1):
            print_progress(i - 1, total_sections, f"Running {section_name}")
            test_method()
            print_progress(i, total_sections, f"Completed {section_name}")

        # Calculate execution time
        execution_time = time.time() - self.test_start_time

        # Print comprehensive statistics
        self.print_comprehensive_stats(execution_time)

        return len(self.tests_failed) == 0

    def print_comprehensive_stats(self, execution_time):
        """Print overall totals, a per-section breakdown and failure details.

        Args:
            execution_time: Wall-clock duration of the run, in seconds.
        """
        print_header("📊 Test Results Summary", "📈")

        # Overall statistics (guard against division by zero when nothing ran)
        success_rate = (self.tests_passed / self.tests_run * 100) if self.tests_run > 0 else 0
        print(f"{Colors.BOLD}Overall Statistics:{Colors.RESET}")
        print(f"Total Tests: {Colors.CYAN}{self.tests_run}{Colors.RESET}")
        print(f"Passed: {Colors.GREEN}{self.tests_passed}{Colors.RESET}")
        print(f"Failed: {Colors.RED}{len(self.tests_failed)}{Colors.RESET}")
        print(f"Success Rate: {Colors.BRIGHT_CYAN}{success_rate:.1f}%{Colors.RESET}")
        print(f"Execution Time: {Colors.YELLOW}{execution_time:.2f}s{Colors.RESET}")

        # Performance indicator
        if success_rate == 100:
            print_success("🎉 Perfect! All tests passed!")
        elif success_rate >= 90:
            print_success("✅ Excellent! Nearly perfect results!")
        elif success_rate >= 75:
            print_warning("⚠️ Good results, but some improvements needed")
        else:
            print_error("❌ Significant issues detected - review failures below")

        # Section-by-section breakdown
        if self.section_stats:
            print(f"\n{Colors.BOLD}Section Breakdown:{Colors.RESET}")
            for section_name, stats in self.section_stats.items():
                section_success_rate = (stats['passed'] / stats['run'] * 100) if stats['run'] > 0 else 0
                status_icon = Colors.CHECK if stats['failed'] == 0 else Colors.CROSS
                status_color = Colors.GREEN if stats['failed'] == 0 else Colors.RED
                print(f"{status_icon} {section_name}: {Colors.CYAN}{stats['run']}{Colors.RESET} tests, "
                      f"{status_color}{stats['passed']} passed{Colors.RESET}, "
                      f"{Colors.RED}{stats['failed']} failed{Colors.RESET} "
                      f"({Colors.BRIGHT_CYAN}{section_success_rate:.1f}%{Colors.RESET})")

        # Failed tests details
        if self.tests_failed:
            print(f"\n{Colors.BOLD}{Colors.RED}Failed Tests Details:{Colors.RESET}")
            for i, failure in enumerate(self.tests_failed, 1):
                print(f"{Colors.RED}{i}. {failure['name']}{Colors.RESET}")
                if 'section' in failure and failure['section']:
                    print(f"Section: {Colors.YELLOW}{failure['section']}{Colors.RESET}")
                print(f"Expected: {Colors.BRIGHT_RED}{failure['expected']}{Colors.RESET}")
                print(f"Actual: {Colors.BRIGHT_RED}{failure['actual']}{Colors.RESET}")
                print()

        # Recommendations
        if self.tests_failed:
            print(f"{Colors.BOLD}{Colors.YELLOW}Recommendations:{Colors.RESET}")
            print(f"• Review the {len(self.tests_failed)} failed test(s) above")
            print("• Check URL normalization logic for edge cases")
            print("• Verify query parameter handling")
            print("• Test with real-world URLs")
        else:
            print(f"\n{Colors.BOLD}{Colors.GREEN}Recommendations:{Colors.RESET}")
            print("• All tests passed! URL normalization is working correctly")
            print("• Consider adding more edge cases for future robustness")
            print("• Monitor performance with large-scale crawling")
def test_crawling_integration():
    """Print how normalize_url treats a sample of links seen while crawling.

    This is a smoke run: results are printed, nothing is asserted.
    """
    print_section("Crawling Integration Test", "🔗")

    base_url = "https://example.com/current/page.html"
    # Mix of absolute, relative, protocol-relative, non-HTTP, fragment-only,
    # empty and None inputs.
    test_urls = (
        "https://example.com/blog/post?utm_source=newsletter&utm_medium=email",
        "https://example.com/products?page=1&sort=price&ref=search",
        "/about.html",
        "../contact.html",
        "//cdn.example.com/js/main.js",
        "mailto:support@example.com",
        "#top",
        "",
        None,
    )

    print("Testing real-world URL scenarios:")
    for candidate in test_urls:
        try:
            print(f"{candidate} -> {normalize_url(candidate, base_url)}")
        except (ValueError, TypeError) as exc:
            print(f"{candidate} -> ERROR: {exc}")
if __name__ == "__main__":
    print_header("🧪 URL Normalization Comprehensive Test Suite", "🧪")
    print_info("Testing URL normalization functions with comprehensive scenarios and edge cases")
    print()

    # The suite's boolean result decides the final message and exit status.
    test_suite = URLNormalizationTestSuite()
    success = test_suite.run_all_tests()

    # The integration run only prints; it does not influence `success`.
    print()
    test_crawling_integration()

    # Closing banner and verdict
    print()
    print_header("🏁 Final Test Summary", "🏁")
    if not success:
        print_error("❌ SOME TESTS FAILED! Please review the issues above.")
        print_warning("URL normalization may have issues that need to be addressed before deployment.")
    else:
        print_success("🎉 ALL TESTS PASSED! URL normalization is working perfectly!")
        print_info("The updated URL normalization functions are ready for production use.")

    print()
    print_info("Test suite completed. Check the results above for detailed analysis.")

    # Exit status: 0 when every check passed, 1 otherwise.
    exit_code = 0 if success else 1
    sys.exit(exit_code)