#1127: Improve URL handling and normalization in scraping strategies

This commit is contained in:
AHMET YILMAZ
2025-06-10 11:57:06 +08:00
parent 74b06d4b80
commit 9442597f81
2 changed files with 96 additions and 45 deletions

View File

@@ -15,7 +15,7 @@ from .config import (
)
from bs4 import NavigableString, Comment
from bs4 import PageElement, Tag
from urllib.parse import urljoin
from urllib.parse import urljoin , urlparse
from requests.exceptions import InvalidSchema
from .utils import (
extract_metadata,
@@ -24,8 +24,7 @@ from .utils import (
get_base_domain,
extract_metadata_using_lxml,
)
from lxml import etree
from lxml import html as lhtml
from lxml import etree, html as lhtml
from typing import List
from .models import ScrapingResult, MediaItem, Link, Media, Links
import copy
@@ -130,7 +129,27 @@ class WebScrapingStrategy(ContentScrapingStrategy):
ScrapingResult: A structured result containing the scraped content.
"""
actual_url = kwargs.get("redirected_url", url)
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
# raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
effective_base_url = actual_url
try:
soup_for_base_check = BeautifulSoup(html, "html.parser")
base_tag = soup_for_base_check.find("base", href=True)
if base_tag:
base_href_val = base_tag.get("href")
if base_href_val is not None:
resolved_base_href = urljoin(actual_url, base_href_val)
parsed_resolved_base = urlparse(resolved_base_href)
if parsed_resolved_base.scheme and parsed_resolved_base.netloc:
effective_base_url = resolved_base_href
except Exception as e:
self._log(
"error",
message="Error resolving base URL: {error}",
tag="SCRAPE",
params={"error": str(e)},
)
kwargs_for_scrap = {**kwargs, '_effective_base_url_override': effective_base_url }
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs_for_scrap)
if raw_result is None:
return ScrapingResult(
cleaned_html="",
@@ -1487,6 +1506,27 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
doc = lhtml.document_fromstring(html)
# Match BeautifulSoup's behavior of using body or full doc
# body = doc.xpath('//body')[0] if doc.xpath('//body') else doc
# Determine effective base URL considering <base href="...">
base_tag_element = doc.find(".//base[@href]")
if base_tag_element is not None:
base_href_value = base_tag_element.get("href")
if base_href_value is not None:
resolved_base_href = urljoin(url, base_href_value)
parse_resolved_base_href = urlparse(resolved_base_href)
if parse_resolved_base_href.scheme and parse_resolved_base_href.netloc:
effective_base_url = resolved_base_href
self._log(
"debug",
f"Using <base href='{base_href_value}'>, resolved effective base URL for links: {effective_base_url}",
url=url, # Log against original document URL
tag="SCRAPE_BASE_URL")
else:
effective_base_url = url
self._log(
"warning",
f"<base href='{base_href_value}'> resolved to non-absolute URL '{resolved_base_href}'. Using document URL '{actual_url}' as base.",
url=url, # Log against original document URL
tag="SCRAPE_BASE_URL")
body = doc
base_domain = get_base_domain(url)

View File

@@ -15,9 +15,10 @@ from .html2text import html2text, CustomHTML2Text
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, DEFAULT_PROVIDER, PROVIDER_MODELS
import httpx
from socket import gaierror
from pathlib import Path
from pathlib import Path , PurePath
from typing import Dict, Any, List, Optional, Callable
from urllib.parse import urljoin
import requests
from requests.exceptions import InvalidSchema
import xxhash
@@ -2056,18 +2057,29 @@ def fast_format_html(html_string):
def normalize_url(href, base_url):
    """Resolve *href* against *base_url* and return a normalized absolute URL.

    Args:
        href: The link target. May be relative, absolute, a special-scheme
            URI such as ``mailto:``, ``None``, or a non-string (e.g. a bs4
            attribute value), which is coerced with ``str()``.
        base_url: The document's base URL; must be absolute (scheme + host).

    Returns:
        The resolved URL string; ``base_url`` itself for an empty href;
        ``None`` when ``href`` is ``None``.

    Raises:
        ValueError: If ``base_url`` is not an absolute URL.
    """
    from urllib.parse import urljoin, urlparse

    if href is None:
        return None

    # Coerce to str first: bs4 can hand us NavigableString-like objects.
    href_str = str(href).strip()
    if not href_str:
        # An empty href conventionally resolves to the base URL itself.
        return base_url

    # Non-hierarchical / special schemes are returned untouched; joining
    # them against an http(s) base would mangle them.
    parsed_href = urlparse(href_str)
    if parsed_href.scheme and parsed_href.scheme.lower() in {
        "mailto", "tel", "javascript", "data", "file",
    }:
        return href_str

    parsed_base = urlparse(base_url)
    if not parsed_base.scheme or not parsed_base.netloc:
        raise ValueError(f"Invalid base URL format: {base_url}")

    # urljoin handles every remaining case per RFC 3986: absolute hrefs,
    # protocol-relative ("//host/x"), root-relative ("/x"), relative
    # ("x", "../x"), and fragment-only ("#x") references.
    return urljoin(base_url, href_str)
@@ -2080,7 +2092,7 @@ def normalize_url_for_deep_crawl(href, base_url):
return None
# Use urljoin to handle relative URLs
full_url = urljoin(base_url, href.strip())
full_url = urljoin(base_url, str(href).strip())
# Parse the URL for normalization
parsed = urlparse(full_url)
@@ -2110,7 +2122,7 @@ def normalize_url_for_deep_crawl(href, base_url):
normalized = urlunparse((
parsed.scheme,
netloc,
parsed.path.rstrip('/'), # Normalize trailing slash
str(PurePath(parsed.path)).rstrip('/'), # Normalize path to remove duplicate slashes
parsed.params,
query,
fragment
@@ -2127,7 +2139,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
return None
# Resolve relative URLs
full_url = urljoin(base_url, href.strip())
full_url = urljoin(base_url, str(href).strip())
# Use proper URL parsing
parsed = urlparse(full_url)
@@ -2135,52 +2147,51 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
# Only perform the most critical normalizations
# 1. Lowercase hostname
# 2. Remove fragment
path = parsed.path
if len(path) > 1 and path.endswith('/'):
path = path.rstrip('/')
normalized = urlunparse((
parsed.scheme,
parsed.netloc.lower(),
parsed.path.rstrip('/'),
parsed.params,
parsed.query,
'' # Remove fragment
))
return normalized
def normalize_url_tmp(href, base_url):
    """Normalize URLs to ensure consistent format.

    Legacy string-splitting implementation kept for reference; prefer
    :func:`normalize_url`, which delegates resolution to ``urljoin``.

    Args:
        href: The raw link target.
        base_url: Absolute base URL of the document (e.g. "https://h/p").

    Returns:
        The normalized absolute URL, or ``href`` stripped but otherwise
        unchanged for special protocols such as ``mailto:`` / ``tel:``.

    Raises:
        ValueError: If ``base_url`` lacks a scheme or host component.
    """
    # Extract protocol ("https:") and domain ("host") from the base URL.
    try:
        base_parts = base_url.split("/")
        protocol = base_parts[0]
        domain = base_parts[2]
    except IndexError:
        raise ValueError(f"Invalid base URL format: {base_url}")

    # Non-hierarchical protocols pass through untouched.
    special_protocols = {"mailto:", "tel:", "ftp:", "file:", "data:", "javascript:"}
    if any(href.lower().startswith(proto) for proto in special_protocols):
        return href.strip()

    # Fragment-only links resolve against the full base URL.
    if href.startswith("#"):
        return f"{base_url}{href}"

    # Protocol-relative URLs inherit the base scheme.
    if href.startswith("//"):
        return f"{protocol}{href}"

    # Root-relative URLs resolve against the domain.
    if href.startswith("/"):
        return f"{protocol}//{domain}{href}"

    # Relative URLs: drop leading "./" segments only.
    # BUG FIX: the original used href.lstrip("./"), which strips ANY leading
    # run of "." and "/" characters (e.g. "../x" -> "x", ".hidden" -> "hidden").
    if not href.startswith(("http://", "https://")):
        while href.startswith("./"):
            href = href[2:]
        return f"{protocol}//{domain}/{href}"

    return href.strip()
def get_base_domain(url: str) -> str: