From 9442597f81059365abe6e4fd97845214f51ae525 Mon Sep 17 00:00:00 2001
From: AHMET YILMAZ
Date: Tue, 10 Jun 2025 11:57:06 +0800
Subject: [PATCH] #1127: Improve URL handling and normalization in scraping strategies

Resolve relative links against the document's <base href> when one is
present, and make the URL normalization helpers tolerate None, empty and
non-string hrefs.

- WebScrapingStrategy.scrap and LXMLWebScrapingStrategy._scrap compute an
  effective base URL from <base href>, falling back to the document URL
  when the tag is missing or does not resolve to an absolute URL.
- normalize_url returns None for a None href, returns base_url for an
  empty href, passes mailto:/tel:/javascript:/data:/file: hrefs through
  untouched, and no longer appends "/" to base_url before joining.
- normalize_url_for_deep_crawl collapses duplicate slashes in the path.
- efficient_normalize_url_for_deep_crawl strips trailing slashes but
  keeps a bare "/" path.
- normalize_url_tmp is removed.
---
Review notes (ignored by `git am`, not part of the commit message):

- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Context-line indentation is inferred; check it against
  the target tree before applying. The `index` hashes are those of the
  original revision of the patch.
- Fixes relative to the original revision:
  * efficient_normalize_url_for_deep_crawl passed a 2-tuple to urlunparse(),
    which needs six components, and never used the new `path` local.
  * LXMLWebScrapingStrategy._scrap formatted `{actual_url}`, a name that is
    not defined in that hunk; it now uses `url`. `effective_base_url` is
    now always bound, the "<base href>" text missing from the comment and
    log messages is restored, and the `_log` calls use the same
    message=/tag=/params= style as WebScrapingStrategy.scrap.
  * normalize_url_for_deep_crawl used PurePath, which is OS dependent
    (backslashes on Windows) and renders an empty path as "."; it now uses
    PurePosixPath and leaves an empty path empty.
  * Commented-out code is replaced by explanatory comments or deleted.
- NOTE(review): nothing in this patch reads `_effective_base_url_override`
  or the `effective_base_url` computed in the lxml strategy. Confirm that
  link resolution actually consumes them.

 crawl4ai/content_scraping_strategy.py | 50 ++++++++++++--
 crawl4ai/utils.py                     | 69 +++++-----------
 2 files changed, 70 insertions(+), 49 deletions(-)

diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 1dfbce84..8f6a7d83 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -15,7 +15,7 @@ from .config import (
 )
 from bs4 import NavigableString, Comment
 from bs4 import PageElement, Tag
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 from requests.exceptions import InvalidSchema
 from .utils import (
     extract_metadata,
@@ -24,8 +24,7 @@ from .utils import (
     get_base_domain,
     extract_metadata_using_lxml,
 )
-from lxml import etree
-from lxml import html as lhtml
+from lxml import etree, html as lhtml
 from typing import List
 from .models import ScrapingResult, MediaItem, Link, Media, Links
 import copy
@@ -130,7 +129,27 @@ class WebScrapingStrategy(ContentScrapingStrategy):
             ScrapingResult: A structured result containing the scraped content.
         """
         actual_url = kwargs.get("redirected_url", url)
-        raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
+        # Honor <base href> (if present) when resolving relative links.
+        effective_base_url = actual_url
+        try:
+            soup_for_base_check = BeautifulSoup(html, "html.parser")
+            base_tag = soup_for_base_check.find("base", href=True)
+            if base_tag is not None:
+                base_href_val = base_tag.get("href")
+                if base_href_val is not None:
+                    resolved_base_href = urljoin(actual_url, base_href_val)
+                    parsed_resolved_base = urlparse(resolved_base_href)
+                    if parsed_resolved_base.scheme and parsed_resolved_base.netloc:
+                        effective_base_url = resolved_base_href
+        except Exception as e:
+            self._log(
+                "error",
+                message="Error resolving base URL: {error}",
+                tag="SCRAPE",
+                params={"error": str(e)},
+            )
+        kwargs_for_scrap = {**kwargs, "_effective_base_url_override": effective_base_url}
+        raw_result = self._scrap(actual_url, html, is_async=False, **kwargs_for_scrap)
         if raw_result is None:
             return ScrapingResult(
                 cleaned_html="",
@@ -1487,6 +1506,29 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
             doc = lhtml.document_fromstring(html)
             # Match BeautifulSoup's behavior of using body or full doc
             # body = doc.xpath('//body')[0] if doc.xpath('//body') else doc
+            # Determine the effective base URL, honoring <base href> if present
+            effective_base_url = url
+            base_tag_element = doc.find(".//base[@href]")
+            if base_tag_element is not None:
+                base_href_value = base_tag_element.get("href")
+                if base_href_value is not None:
+                    resolved_base_href = urljoin(url, base_href_value)
+                    parsed_resolved_base = urlparse(resolved_base_href)
+                    if parsed_resolved_base.scheme and parsed_resolved_base.netloc:
+                        effective_base_url = resolved_base_href
+                        self._log(
+                            "debug",
+                            message="Using <base href>, resolved effective base URL for links: {base}",
+                            tag="SCRAPE_BASE_URL",
+                            params={"base": effective_base_url},
+                        )
+                    else:
+                        self._log(
+                            "warning",
+                            message="<base href> resolved to non-absolute URL '{resolved}'. Using document URL '{url}' as base.",
+                            tag="SCRAPE_BASE_URL",
+                            params={"resolved": resolved_base_href, "url": url},
+                        )
             body = doc
 
             base_domain = get_base_domain(url)
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index d8b366d9..c51fa254 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -15,9 +15,10 @@ from .html2text import html2text, CustomHTML2Text
 from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, DEFAULT_PROVIDER, PROVIDER_MODELS
 import httpx
 from socket import gaierror
-from pathlib import Path
+from pathlib import Path, PurePosixPath
 from typing import Dict, Any, List, Optional, Callable
 from urllib.parse import urljoin
+
 import requests
 from requests.exceptions import InvalidSchema
 import xxhash
@@ -2056,18 +2057,29 @@ def fast_format_html(html_string):
 def normalize_url(href, base_url):
     """Normalize URLs to ensure consistent format"""
     from urllib.parse import urljoin, urlparse
 
+    if href is None:
+        return None
+    href_str = str(href).strip()
+    if not href_str:
+        # An empty href conventionally resolves to the base URL itself.
+        return base_url
+
+    # These schemes are returned as-is rather than joined onto base_url.
+    if urlparse(href_str).scheme.lower() in ("mailto", "tel", "javascript", "data", "file"):
+        return href_str
+
     # Parse base URL to get components
     parsed_base = urlparse(base_url)
     if not parsed_base.scheme or not parsed_base.netloc:
         raise ValueError(f"Invalid base URL format: {base_url}")
 
-    # Ensure base_url ends with a trailing slash if it's a directory path
-    if not base_url.endswith('/'):
-        base_url = base_url + '/'
+    # base_url is deliberately NOT given a trailing slash: urljoin must be
+    # free to replace its last path segment, e.g. ".../docs/page.html" + "x"
+    # resolves to ".../docs/x" rather than ".../docs/page.html/x".
 
     # Use urljoin to handle all cases
-    normalized = urljoin(base_url, href.strip())
+    normalized = urljoin(base_url, href_str)
     return normalized
 
 
@@ -2080,7 +2092,7 @@ def normalize_url_for_deep_crawl(href, base_url):
         return None
 
     # Use urljoin to handle relative URLs
-    full_url = urljoin(base_url, href.strip())
+    full_url = urljoin(base_url, str(href).strip())
 
     # Parse the URL for normalization
     parsed = urlparse(full_url)
@@ -2110,7 +2122,7 @@ def normalize_url_for_deep_crawl(href, base_url):
     normalized = urlunparse((
         parsed.scheme,
         netloc,
-        parsed.path.rstrip('/'),  # Normalize trailing slash
+        str(PurePosixPath(parsed.path)).rstrip('/') if parsed.path else '',  # Collapse duplicate slashes
         parsed.params,
         query,
         fragment
@@ -2127,7 +2139,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
         return None
 
     # Resolve relative URLs
-    full_url = urljoin(base_url, href.strip())
+    full_url = urljoin(base_url, str(href).strip())
 
     # Use proper URL parsing
     parsed = urlparse(full_url)
@@ -2135,52 +2147,19 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
     # Only perform the most critical normalizations
     # 1. Lowercase hostname
     # 2. Remove fragment
+    path = parsed.path
+    if len(path) > 1 and path.endswith('/'):
+        path = path.rstrip('/')
     normalized = urlunparse((
         parsed.scheme,
         parsed.netloc.lower(),
-        parsed.path.rstrip('/'),
+        path,
         parsed.params,
         parsed.query,
         ''  # Remove fragment
     ))
 
     return normalized
-
-
-def normalize_url_tmp(href, base_url):
-    """Normalize URLs to ensure consistent format"""
-    # Extract protocol and domain from base URL
-    try:
-        base_parts = base_url.split("/")
-        protocol = base_parts[0]
-        domain = base_parts[2]
-    except IndexError:
-        raise ValueError(f"Invalid base URL format: {base_url}")
-
-    # Handle special protocols
-    special_protocols = {"mailto:", "tel:", "ftp:", "file:", "data:", "javascript:"}
-    if any(href.lower().startswith(proto) for proto in special_protocols):
-        return href.strip()
-
-    # Handle anchor links
-    if href.startswith("#"):
-        return f"{base_url}{href}"
-
-    # Handle protocol-relative URLs
-    if href.startswith("//"):
-        return f"{protocol}{href}"
-
-    # Handle root-relative URLs
-    if href.startswith("/"):
-        return f"{protocol}//{domain}{href}"
-
-    # Handle relative URLs
-    if not href.startswith(("http://", "https://")):
-        # Remove leading './' if present
-        href = href.lstrip("./")
-        return f"{protocol}//{domain}/{href}"
-
-    return href.strip()
 
 
 def get_base_domain(url: str) -> str: