#1127: Improve URL handling and normalization in scraping strategies

This commit is contained in:
AHMET YILMAZ
2025-06-10 11:57:06 +08:00
parent 74b06d4b80
commit 9442597f81
2 changed files with 96 additions and 45 deletions

View File

@@ -15,7 +15,7 @@ from .config import (
)
from bs4 import NavigableString, Comment
from bs4 import PageElement, Tag
from urllib.parse import urljoin
from urllib.parse import urljoin , urlparse
from requests.exceptions import InvalidSchema
from .utils import (
extract_metadata,
@@ -24,8 +24,7 @@ from .utils import (
get_base_domain,
extract_metadata_using_lxml,
)
from lxml import etree
from lxml import html as lhtml
from lxml import etree, html as lhtml
from typing import List
from .models import ScrapingResult, MediaItem, Link, Media, Links
import copy
@@ -130,7 +129,27 @@ class WebScrapingStrategy(ContentScrapingStrategy):
ScrapingResult: A structured result containing the scraped content.
"""
actual_url = kwargs.get("redirected_url", url)
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
# raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
effective_base_url = actual_url
try:
soup_for_base_check = BeautifulSoup(html, "html.parser")
base_tag = soup_for_base_check.find("base", href=True)
if base_tag:
base_href_val = base_tag.get("href")
if base_href_val is not None:
resolved_base_href = urljoin(actual_url, base_href_val)
parsed_resolved_base = urlparse(resolved_base_href)
if parsed_resolved_base.scheme and parsed_resolved_base.netloc:
effective_base_url = resolved_base_href
except Exception as e:
self._log(
"error",
message="Error resolving base URL: {error}",
tag="SCRAPE",
params={"error": str(e)},
)
kwargs_for_scrap = {**kwargs, '_effective_base_url_override': effective_base_url }
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs_for_scrap)
if raw_result is None:
return ScrapingResult(
cleaned_html="",
@@ -1487,6 +1506,27 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
doc = lhtml.document_fromstring(html)
# Match BeautifulSoup's behavior of using body or full doc
# body = doc.xpath('//body')[0] if doc.xpath('//body') else doc
# Determine effective base URL considering <base href="...">
base_tag_element = doc.find(".//base[@href]")
if base_tag_element is not None:
base_href_value = base_tag_element.get("href")
if base_href_value is not None:
resolved_base_href = urljoin(url, base_href_value)
parse_resolved_base_href = urlparse(resolved_base_href)
if parse_resolved_base_href.scheme and parse_resolved_base_href.netloc:
effective_base_url = resolved_base_href
self._log(
"debug",
f"Using <base href='{base_href_value}'>, resolved effective base URL for links: {effective_base_url}",
url=url, # Log against original document URL
tag="SCRAPE_BASE_URL")
else:
effective_base_url = url
self._log(
"warning",
f"<base href='{base_href_value}'> resolved to non-absolute URL '{resolved_base_href}'. Using document URL '{actual_url}' as base.",
url=url, # Log against original document URL
tag="SCRAPE_BASE_URL")
body = doc
base_domain = get_base_domain(url)

View File

@@ -15,9 +15,10 @@ from .html2text import html2text, CustomHTML2Text
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, DEFAULT_PROVIDER, PROVIDER_MODELS
import httpx
from socket import gaierror
from pathlib import Path
from pathlib import Path , PurePath
from typing import Dict, Any, List, Optional, Callable
from urllib.parse import urljoin
import requests
from requests.exceptions import InvalidSchema
import xxhash
@@ -2056,18 +2057,29 @@ def fast_format_html(html_string):
def normalize_url(href, base_url):
    """Resolve *href* against *base_url* and return a normalized absolute URL.

    Args:
        href: The link target. May be relative, absolute, a special-scheme
            URI such as ``mailto:``, ``None``, or a non-string (e.g. a bs4
            attribute value), which is coerced with ``str()``.
        base_url: The document's base URL; must be absolute (scheme + host).

    Returns:
        The resolved URL string; ``base_url`` itself for an empty href;
        ``None`` when ``href`` is ``None``.

    Raises:
        ValueError: If ``base_url`` is not an absolute URL.
    """
    from urllib.parse import urljoin, urlparse

    if href is None:
        return None

    # Coerce to str first: bs4 can hand us NavigableString-like objects.
    href_str = str(href).strip()
    if not href_str:
        # An empty href conventionally resolves to the base URL itself.
        return base_url

    # Non-hierarchical / special schemes are returned untouched; joining
    # them against an http(s) base would mangle them.
    parsed_href = urlparse(href_str)
    if parsed_href.scheme and parsed_href.scheme.lower() in {
        "mailto", "tel", "javascript", "data", "file",
    }:
        return href_str

    parsed_base = urlparse(base_url)
    if not parsed_base.scheme or not parsed_base.netloc:
        raise ValueError(f"Invalid base URL format: {base_url}")

    # urljoin handles every remaining case per RFC 3986: absolute hrefs,
    # protocol-relative ("//host/x"), root-relative ("/x"), relative
    # ("x", "../x"), and fragment-only ("#x") references.
    return urljoin(base_url, href_str)
@@ -2080,7 +2092,7 @@ def normalize_url_for_deep_crawl(href, base_url):
return None
# Use urljoin to handle relative URLs
full_url = urljoin(base_url, href.strip())
full_url = urljoin(base_url, str(href).strip())
# Parse the URL for normalization
parsed = urlparse(full_url)
@@ -2110,7 +2122,7 @@ def normalize_url_for_deep_crawl(href, base_url):
normalized = urlunparse((
parsed.scheme,
netloc,
parsed.path.rstrip('/'), # Normalize trailing slash
str(PurePath(parsed.path)).rstrip('/'), # Normalize path to remove duplicate slashes
parsed.params,
query,
fragment
@@ -2127,7 +2139,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
return None
# Resolve relative URLs
full_url = urljoin(base_url, href.strip())
full_url = urljoin(base_url, str(href).strip())
# Use proper URL parsing
parsed = urlparse(full_url)
@@ -2135,52 +2147,51 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
# Only perform the most critical normalizations
# 1. Lowercase hostname
# 2. Remove fragment
path = parsed.path
if len(path) > 1 and path.endswith('/'):
path = path.rstrip('/')
normalized = urlunparse((
parsed.scheme,
parsed.netloc.lower(),
parsed.path.rstrip('/'),
parsed.params,
parsed.query,
'' # Remove fragment
))
return normalized
def normalize_url_tmp(href, base_url):
    """Normalize URLs to ensure consistent format.

    Legacy string-splitting implementation kept for reference; prefer
    :func:`normalize_url`, which delegates resolution to ``urljoin``.

    Args:
        href: The raw link target.
        base_url: Absolute base URL of the document (e.g. "https://h/p").

    Returns:
        The normalized absolute URL, or ``href`` stripped but otherwise
        unchanged for special protocols such as ``mailto:`` / ``tel:``.

    Raises:
        ValueError: If ``base_url`` lacks a scheme or host component.
    """
    # Extract protocol ("https:") and domain ("host") from the base URL.
    try:
        base_parts = base_url.split("/")
        protocol = base_parts[0]
        domain = base_parts[2]
    except IndexError:
        raise ValueError(f"Invalid base URL format: {base_url}")

    # Non-hierarchical protocols pass through untouched.
    special_protocols = {"mailto:", "tel:", "ftp:", "file:", "data:", "javascript:"}
    if any(href.lower().startswith(proto) for proto in special_protocols):
        return href.strip()

    # Fragment-only links resolve against the full base URL.
    if href.startswith("#"):
        return f"{base_url}{href}"

    # Protocol-relative URLs inherit the base scheme.
    if href.startswith("//"):
        return f"{protocol}{href}"

    # Root-relative URLs resolve against the domain.
    if href.startswith("/"):
        return f"{protocol}//{domain}{href}"

    # Relative URLs: drop leading "./" segments only.
    # BUG FIX: the original used href.lstrip("./"), which strips ANY leading
    # run of "." and "/" characters (e.g. "../x" -> "x", ".hidden" -> "hidden").
    if not href.startswith(("http://", "https://")):
        while href.startswith("./"):
            href = href[2:]
        return f"{protocol}//{domain}/{href}"

    return href.strip()
def get_base_domain(url: str) -> str: