diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 1dfbce84..8f6a7d83 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -15,7 +15,7 @@ from .config import (
)
from bs4 import NavigableString, Comment
from bs4 import PageElement, Tag
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
from requests.exceptions import InvalidSchema
from .utils import (
extract_metadata,
@@ -24,8 +24,7 @@ from .utils import (
get_base_domain,
extract_metadata_using_lxml,
)
-from lxml import etree
-from lxml import html as lhtml
+from lxml import etree, html as lhtml
from typing import List
from .models import ScrapingResult, MediaItem, Link, Media, Links
import copy
@@ -130,7 +129,27 @@ class WebScrapingStrategy(ContentScrapingStrategy):
ScrapingResult: A structured result containing the scraped content.
"""
actual_url = kwargs.get("redirected_url", url)
- raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
+ # raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
+ effective_base_url = actual_url
+ try:
+ soup_for_base_check = BeautifulSoup(html, "html.parser")
+ base_tag = soup_for_base_check.find("base", href=True)
+ if base_tag:
+ base_href_val = base_tag.get("href")
+ if base_href_val is not None:
+ resolved_base_href = urljoin(actual_url, base_href_val)
+ parsed_resolved_base = urlparse(resolved_base_href)
+ if parsed_resolved_base.scheme and parsed_resolved_base.netloc:
+ effective_base_url = resolved_base_href
+ except Exception as e:
+ self._log(
+ "error",
+ message="Error resolving base URL: {error}",
+ tag="SCRAPE",
+ params={"error": str(e)},
+ )
+        kwargs_for_scrap = {**kwargs, '_effective_base_url_override': effective_base_url}
+ raw_result = self._scrap(actual_url, html, is_async=False, **kwargs_for_scrap)
if raw_result is None:
return ScrapingResult(
cleaned_html="",
@@ -1487,6 +1506,27 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
doc = lhtml.document_fromstring(html)
# Match BeautifulSoup's behavior of using body or full doc
# body = doc.xpath('//body')[0] if doc.xpath('//body') else doc
+            # Resolve the effective base URL, honouring a valid <base href> tag; default to the document URL.
+            effective_base_url = url
+            base_tag_element = doc.find(".//base[@href]")
+            if base_tag_element is not None:
+                base_href_value = base_tag_element.get("href")
+                if base_href_value is not None:
+                    resolved_base_href = urljoin(url, base_href_value)
+                    parse_resolved_base_href = urlparse(resolved_base_href)
+                    if parse_resolved_base_href.scheme and parse_resolved_base_href.netloc:
+                        effective_base_url = resolved_base_href
+                        self._log(
+                            "debug",
+                            f"Using resolved effective base URL for links: {effective_base_url}",
+                            url=url,  # Log against original document URL
+                            tag="SCRAPE_BASE_URL")
+                    else:
+                        self._log(
+                            "warning",
+                            f"<base href> resolved to non-absolute URL '{resolved_base_href}'. Using document URL '{url}' as base.",
+                            url=url,  # Log against original document URL
+                            tag="SCRAPE_BASE_URL")
body = doc
base_domain = get_base_domain(url)
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index d8b366d9..c51fa254 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -15,9 +15,10 @@ from .html2text import html2text, CustomHTML2Text
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, DEFAULT_PROVIDER, PROVIDER_MODELS
import httpx
from socket import gaierror
-from pathlib import Path
+from pathlib import Path, PurePath, PurePosixPath
from typing import Dict, Any, List, Optional, Callable
from urllib.parse import urljoin
+
import requests
from requests.exceptions import InvalidSchema
import xxhash
@@ -2056,18 +2057,29 @@ def fast_format_html(html_string):
def normalize_url(href, base_url):
"""Normalize URLs to ensure consistent format"""
from urllib.parse import urljoin, urlparse
-
+ if href is None:
+ return None
+
+ href_str = str(href).strip()
+ if not href_str:
+ # Empty href, conventionally resolves to the base URL itself.
+ return base_url
# Parse base URL to get components
+ parsed_href = urlparse(href_str)
+ if parsed_href.scheme and parsed_href.scheme.lower() in ["mailto", "tel", "javascript", "data", "file"]:
+        # Non-navigational scheme (mailto:, tel:, javascript:, data:, file:) — return unchanged, do not join
+ return href_str
+
parsed_base = urlparse(base_url)
if not parsed_base.scheme or not parsed_base.netloc:
raise ValueError(f"Invalid base URL format: {base_url}")
- # Ensure base_url ends with a trailing slash if it's a directory path
- if not base_url.endswith('/'):
- base_url = base_url + '/'
+ # # Ensure base_url ends with a trailing slash if it's a directory path
+ # if not base_url.endswith('/'):
+ # base_url = base_url + '/'
# Use urljoin to handle all cases
- normalized = urljoin(base_url, href.strip())
+ normalized = urljoin(base_url, href_str)
return normalized
@@ -2080,7 +2092,7 @@ def normalize_url_for_deep_crawl(href, base_url):
return None
# Use urljoin to handle relative URLs
- full_url = urljoin(base_url, href.strip())
+ full_url = urljoin(base_url, str(href).strip())
# Parse the URL for normalization
parsed = urlparse(full_url)
@@ -2110,7 +2122,7 @@ def normalize_url_for_deep_crawl(href, base_url):
normalized = urlunparse((
parsed.scheme,
netloc,
- parsed.path.rstrip('/'), # Normalize trailing slash
+        str(PurePosixPath(parsed.path)).rstrip('/') if parsed.path else '', # Collapse duplicate slashes with POSIX rules; guard empty path (PurePosixPath('') is '.')
parsed.params,
query,
fragment
@@ -2127,7 +2139,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
return None
# Resolve relative URLs
- full_url = urljoin(base_url, href.strip())
+ full_url = urljoin(base_url, str(href).strip())
# Use proper URL parsing
parsed = urlparse(full_url)
@@ -2135,52 +2147,55 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
    # Only perform the most critical normalizations
    # 1. Lowercase hostname
    # 2. Remove fragment
+    path = parsed.path
+    if len(path) > 1 and path.endswith('/'):
+        path = path.rstrip('/')
    normalized = urlunparse((
        parsed.scheme,
        parsed.netloc.lower(),
-        parsed.path.rstrip('/'),
-        parsed.params,
-        parsed.query,
-        '' # Remove fragment
+        path,
+        parsed.params,
+        parsed.query,
+        '' # Remove fragment
    ))
return normalized
-def normalize_url_tmp(href, base_url):
- """Normalize URLs to ensure consistent format"""
- # Extract protocol and domain from base URL
- try:
- base_parts = base_url.split("/")
- protocol = base_parts[0]
- domain = base_parts[2]
- except IndexError:
- raise ValueError(f"Invalid base URL format: {base_url}")
+# def normalize_url_tmp(href, base_url):
+# """Normalize URLs to ensure consistent format"""
+# # Extract protocol and domain from base URL
+# try:
+# base_parts = base_url.split("/")
+# protocol = base_parts[0]
+# domain = base_parts[2]
+# except IndexError:
+# raise ValueError(f"Invalid base URL format: {base_url}")
- # Handle special protocols
- special_protocols = {"mailto:", "tel:", "ftp:", "file:", "data:", "javascript:"}
- if any(href.lower().startswith(proto) for proto in special_protocols):
- return href.strip()
+# # Handle special protocols
+# special_protocols = {"mailto:", "tel:", "ftp:", "file:", "data:", "javascript:"}
+# if any(href.lower().startswith(proto) for proto in special_protocols):
+# return href.strip()
- # Handle anchor links
- if href.startswith("#"):
- return f"{base_url}{href}"
+# # Handle anchor links
+# if href.startswith("#"):
+# return f"{base_url}{href}"
- # Handle protocol-relative URLs
- if href.startswith("//"):
- return f"{protocol}{href}"
+# # Handle protocol-relative URLs
+# if href.startswith("//"):
+# return f"{protocol}{href}"
- # Handle root-relative URLs
- if href.startswith("/"):
- return f"{protocol}//{domain}{href}"
+# # Handle root-relative URLs
+# if href.startswith("/"):
+# return f"{protocol}//{domain}{href}"
- # Handle relative URLs
- if not href.startswith(("http://", "https://")):
- # Remove leading './' if present
- href = href.lstrip("./")
- return f"{protocol}//{domain}/{href}"
+# # Handle relative URLs
+# if not href.startswith(("http://", "https://")):
+# # Remove leading './' if present
+# href = href.lstrip("./")
+# return f"{protocol}//{domain}/{href}"
- return href.strip()
+# return href.strip()
def get_base_domain(url: str) -> str: