#1127: Improve URL handling and normalization in scraping strategies
This commit is contained in:
@@ -15,7 +15,7 @@ from .config import (
|
||||
)
|
||||
from bs4 import NavigableString, Comment
|
||||
from bs4 import PageElement, Tag
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urljoin , urlparse
|
||||
from requests.exceptions import InvalidSchema
|
||||
from .utils import (
|
||||
extract_metadata,
|
||||
@@ -24,8 +24,7 @@ from .utils import (
|
||||
get_base_domain,
|
||||
extract_metadata_using_lxml,
|
||||
)
|
||||
from lxml import etree
|
||||
from lxml import html as lhtml
|
||||
from lxml import etree, html as lhtml
|
||||
from typing import List
|
||||
from .models import ScrapingResult, MediaItem, Link, Media, Links
|
||||
import copy
|
||||
@@ -130,7 +129,27 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
actual_url = kwargs.get("redirected_url", url)
|
||||
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
|
||||
# raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
|
||||
effective_base_url = actual_url
|
||||
try:
|
||||
soup_for_base_check = BeautifulSoup(html, "html.parser")
|
||||
base_tag = soup_for_base_check.find("base", href=True)
|
||||
if base_tag:
|
||||
base_href_val = base_tag.get("href")
|
||||
if base_href_val is not None:
|
||||
resolved_base_href = urljoin(actual_url, base_href_val)
|
||||
parsed_resolved_base = urlparse(resolved_base_href)
|
||||
if parsed_resolved_base.scheme and parsed_resolved_base.netloc:
|
||||
effective_base_url = resolved_base_href
|
||||
except Exception as e:
|
||||
self._log(
|
||||
"error",
|
||||
message="Error resolving base URL: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
kwargs_for_scrap = {**kwargs, '_effective_base_url_override': effective_base_url }
|
||||
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs_for_scrap)
|
||||
if raw_result is None:
|
||||
return ScrapingResult(
|
||||
cleaned_html="",
|
||||
@@ -1487,6 +1506,27 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
doc = lhtml.document_fromstring(html)
|
||||
# Match BeautifulSoup's behavior of using body or full doc
|
||||
# body = doc.xpath('//body')[0] if doc.xpath('//body') else doc
|
||||
# Determine effective base URL considering <base href="...">
|
||||
base_tag_element = doc.find(".//base[@href]")
|
||||
if base_tag_element is not None:
|
||||
base_href_value = base_tag_element.get("href")
|
||||
if base_href_value is not None:
|
||||
resolved_base_href = urljoin(url, base_href_value)
|
||||
parse_resolved_base_href = urlparse(resolved_base_href)
|
||||
if parse_resolved_base_href.scheme and parse_resolved_base_href.netloc:
|
||||
effective_base_url = resolved_base_href
|
||||
self._log(
|
||||
"debug",
|
||||
f"Using <base href='{base_href_value}'>, resolved effective base URL for links: {effective_base_url}",
|
||||
url=url, # Log against original document URL
|
||||
tag="SCRAPE_BASE_URL")
|
||||
else:
|
||||
effective_base_url = url
|
||||
self._log(
|
||||
"warning",
|
||||
f"<base href='{base_href_value}'> resolved to non-absolute URL '{resolved_base_href}'. Using document URL '{actual_url}' as base.",
|
||||
url=url, # Log against original document URL
|
||||
tag="SCRAPE_BASE_URL")
|
||||
body = doc
|
||||
|
||||
base_domain = get_base_domain(url)
|
||||
|
||||
@@ -15,9 +15,10 @@ from .html2text import html2text, CustomHTML2Text
|
||||
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, DEFAULT_PROVIDER, PROVIDER_MODELS
|
||||
import httpx
|
||||
from socket import gaierror
|
||||
from pathlib import Path
|
||||
from pathlib import Path , PurePath
|
||||
from typing import Dict, Any, List, Optional, Callable
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
from requests.exceptions import InvalidSchema
|
||||
import xxhash
|
||||
@@ -2056,18 +2057,29 @@ def fast_format_html(html_string):
|
||||
def normalize_url(href, base_url):
    """Resolve ``href`` against ``base_url`` and return an absolute URL.

    Args:
        href: The raw link target. Any non-``None`` value is converted with
            ``str()`` and stripped of surrounding whitespace.
        base_url: Absolute URL the link is resolved against. It must carry
            both a scheme and a network location.

    Returns:
        ``None`` when ``href`` is ``None``; ``base_url`` unchanged when
        ``href`` is empty or whitespace only; the stripped ``href`` itself
        when it uses a ``mailto``, ``tel``, ``javascript``, ``data`` or
        ``file`` scheme; otherwise ``urljoin(base_url, href)``.

    Raises:
        ValueError: If ``base_url`` lacks a scheme or a network location
            (only checked once ``href`` actually needs resolving).
    """
    from urllib.parse import urljoin, urlparse

    if href is None:
        return None

    href_str = str(href).strip()
    if not href_str:
        # Empty href, conventionally resolves to the base URL itself.
        return base_url

    # Schemes that cannot be resolved against a page URL are passed through
    # untouched. urlparse() already lowercases the scheme; .lower() is kept
    # as a cheap safeguard.
    parsed_href = urlparse(href_str)
    if parsed_href.scheme.lower() in ("mailto", "tel", "javascript", "data", "file"):
        return href_str

    parsed_base = urlparse(base_url)
    if not parsed_base.scheme or not parsed_base.netloc:
        raise ValueError(f"Invalid base URL format: {base_url}")

    # Do NOT append a trailing slash to base_url before joining: that turns a
    # file-like base ("/a/page.html") into a directory and resolves
    # "img.png" to "/a/page.html/img.png" instead of "/a/img.png".
    return urljoin(base_url, href_str)
|
||||
|
||||
|
||||
@@ -2080,7 +2092,7 @@ def normalize_url_for_deep_crawl(href, base_url):
|
||||
return None
|
||||
|
||||
# Use urljoin to handle relative URLs
|
||||
full_url = urljoin(base_url, href.strip())
|
||||
full_url = urljoin(base_url, str(href).strip())
|
||||
|
||||
# Parse the URL for normalization
|
||||
parsed = urlparse(full_url)
|
||||
@@ -2110,7 +2122,7 @@ def normalize_url_for_deep_crawl(href, base_url):
|
||||
normalized = urlunparse((
|
||||
parsed.scheme,
|
||||
netloc,
|
||||
parsed.path.rstrip('/'), # Normalize trailing slash
|
||||
str(PurePath(parsed.path)).rstrip('/'), # Normalize path to remove duplicate slashes
|
||||
parsed.params,
|
||||
query,
|
||||
fragment
|
||||
@@ -2127,7 +2139,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
|
||||
return None
|
||||
|
||||
# Resolve relative URLs
|
||||
full_url = urljoin(base_url, href.strip())
|
||||
full_url = urljoin(base_url, str(href).strip())
|
||||
|
||||
# Use proper URL parsing
|
||||
parsed = urlparse(full_url)
|
||||
@@ -2135,52 +2147,51 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
|
||||
# Only perform the most critical normalizations
|
||||
# 1. Lowercase hostname
|
||||
# 2. Remove fragment
|
||||
path = parsed.path
|
||||
if len(path) > 1 and path.endswith('/'):
|
||||
path = path.rstrip('/')
|
||||
normalized = urlunparse((
|
||||
parsed.scheme,
|
||||
parsed.netloc.lower(),
|
||||
parsed.path.rstrip('/'),
|
||||
parsed.params,
|
||||
parsed.query,
|
||||
'' # Remove fragment
|
||||
))
|
||||
|
||||
return normalized
|
||||
|
||||
|
||||
def normalize_url_tmp(href, base_url):
    """Normalize ``href`` into an absolute URL using simple string rules.

    Legacy helper: unlike a standards-based join, plain relative hrefs are
    anchored at the *root* of the base URL's host, not at the base URL's
    path. That behavior is kept for backward compatibility.

    Args:
        href: Raw link target. ``None`` is returned as ``None``; any other
            value is converted with ``str()`` and stripped of whitespace
            before classification.
        base_url: URL of the form ``"<scheme>://<host>[/...]"``. The scheme
            (with its colon) and host are taken from splitting on ``"/"``.

    Returns:
        The normalized URL string, or ``None`` when ``href`` is ``None``.

    Raises:
        ValueError: If ``base_url`` has fewer than three ``"/"``-separated
            parts, so no host component can be extracted.
    """
    # Extract protocol ("https:") and domain from the base URL.
    try:
        base_parts = base_url.split("/")
        protocol = base_parts[0]
        domain = base_parts[2]
    except IndexError:
        raise ValueError(f"Invalid base URL format: {base_url}")

    if href is None:
        return None

    # Strip first so that every prefix check below sees the real first
    # character (previously " mailto:x" or " /path" were misclassified).
    href = str(href).strip()

    # Special protocols are returned untouched.
    special_protocols = ("mailto:", "tel:", "ftp:", "file:", "data:", "javascript:")
    if href.lower().startswith(special_protocols):
        return href

    # Anchor links are appended to the full base URL.
    if href.startswith("#"):
        return f"{base_url}{href}"

    # Protocol-relative URLs reuse the base scheme.
    if href.startswith("//"):
        return f"{protocol}{href}"

    # Root-relative URLs reuse the base scheme and host.
    if href.startswith("/"):
        return f"{protocol}//{domain}{href}"

    # Remaining non-http(s) values are treated as relative paths.
    if not href.startswith(("http://", "https://")):
        # Drop whole leading "./" and "../" segments only. The previous
        # lstrip("./") removed *characters*, mangling names such as
        # ".hidden/file" into "hidden/file".
        if href in (".", ".."):
            href = ""
        while href.startswith(("./", "../")):
            href = href.partition("/")[2]
        href = href.lstrip("/")
        return f"{protocol}//{domain}/{href}"

    return href
|
||||
|
||||
|
||||
def get_base_domain(url: str) -> str:
|
||||
|
||||
Reference in New Issue
Block a user