#1127: Improve URL handling and normalization in scraping strategies
@@ -15,7 +15,7 @@ from .config import (
 )
 from bs4 import NavigableString, Comment
 from bs4 import PageElement, Tag
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 from requests.exceptions import InvalidSchema
 from .utils import (
     extract_metadata,
@@ -24,8 +24,7 @@ from .utils import (
     get_base_domain,
     extract_metadata_using_lxml,
 )
-from lxml import etree
-from lxml import html as lhtml
+from lxml import etree, html as lhtml
 from typing import List
 from .models import ScrapingResult, MediaItem, Link, Media, Links
 import copy
@@ -130,7 +129,27 @@ class WebScrapingStrategy(ContentScrapingStrategy):
             ScrapingResult: A structured result containing the scraped content.
         """
         actual_url = kwargs.get("redirected_url", url)
-        raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
+        # raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
+        effective_base_url = actual_url
+        try:
+            soup_for_base_check = BeautifulSoup(html, "html.parser")
+            base_tag = soup_for_base_check.find("base", href=True)
+            if base_tag:
+                base_href_val = base_tag.get("href")
+                if base_href_val is not None:
+                    resolved_base_href = urljoin(actual_url, base_href_val)
+                    parsed_resolved_base = urlparse(resolved_base_href)
+                    if parsed_resolved_base.scheme and parsed_resolved_base.netloc:
+                        effective_base_url = resolved_base_href
+        except Exception as e:
+            self._log(
+                "error",
+                message="Error resolving base URL: {error}",
+                tag="SCRAPE",
+                params={"error": str(e)},
+            )
+        kwargs_for_scrap = {**kwargs, '_effective_base_url_override': effective_base_url}
+        raw_result = self._scrap(actual_url, html, is_async=False, **kwargs_for_scrap)
         if raw_result is None:
             return ScrapingResult(
                 cleaned_html="",
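The resolution logic added above is easy to sanity-check in isolation. Below is a minimal standalone sketch of the same idea; `resolve_effective_base_url` is a hypothetical helper name and the URLs are made up:

```python
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup


def resolve_effective_base_url(page_url: str, html: str) -> str:
    """Prefer an absolute <base href>; otherwise fall back to the page URL."""
    soup = BeautifulSoup(html, "html.parser")
    base_tag = soup.find("base", href=True)
    if base_tag:
        resolved = urljoin(page_url, base_tag["href"])
        parsed = urlparse(resolved)
        # Only trust <base> when it resolves to a fully absolute URL.
        if parsed.scheme and parsed.netloc:
            return resolved
    return page_url


html = '<head><base href="/docs/"></head><a href="a.html">a</a>'
base = resolve_effective_base_url("https://example.com/page", html)
print(base)                     # https://example.com/docs/
print(urljoin(base, "a.html"))  # https://example.com/docs/a.html
```

The scheme/netloc guard is what keeps a relative or malformed `<base href>` from silently replacing the document URL.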
@@ -1487,6 +1506,27 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
         doc = lhtml.document_fromstring(html)
         # Match BeautifulSoup's behavior of using body or full doc
         # body = doc.xpath('//body')[0] if doc.xpath('//body') else doc
+        # Determine effective base URL considering <base href="...">
+        effective_base_url = url
+        base_tag_element = doc.find(".//base[@href]")
+        if base_tag_element is not None:
+            base_href_value = base_tag_element.get("href")
+            if base_href_value is not None:
+                resolved_base_href = urljoin(url, base_href_value)
+                parse_resolved_base_href = urlparse(resolved_base_href)
+                if parse_resolved_base_href.scheme and parse_resolved_base_href.netloc:
+                    effective_base_url = resolved_base_href
+                    self._log(
+                        "debug",
+                        f"Using <base href='{base_href_value}'>, resolved effective base URL for links: {effective_base_url}",
+                        url=url,  # Log against original document URL
+                        tag="SCRAPE_BASE_URL")
+                else:
+                    self._log(
+                        "warning",
+                        f"<base href='{base_href_value}'> resolved to non-absolute URL '{resolved_base_href}'. Using document URL '{url}' as base.",
+                        url=url,  # Log against original document URL
+                        tag="SCRAPE_BASE_URL")
         body = doc

         base_domain = get_base_domain(url)
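The lxml counterpart can be exercised the same way; this sketch assumes only that `lxml` is installed (URLs are illustrative):

```python
from urllib.parse import urljoin, urlparse

from lxml import html as lhtml

raw = '<html><head><base href="https://cdn.example.com/assets/"></head><body></body></html>'
doc = lhtml.document_fromstring(raw)

page_url = "https://example.com/page"
effective_base_url = page_url
base_tag = doc.find(".//base[@href]")  # ElementPath find() supports [@attr] predicates
if base_tag is not None:
    resolved = urljoin(page_url, base_tag.get("href"))
    parsed = urlparse(resolved)
    if parsed.scheme and parsed.netloc:
        effective_base_url = resolved

print(effective_base_url)  # https://cdn.example.com/assets/
```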
@@ -15,9 +15,10 @@ from .html2text import html2text, CustomHTML2Text
 from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, DEFAULT_PROVIDER, PROVIDER_MODELS
 import httpx
 from socket import gaierror
-from pathlib import Path
+from pathlib import Path, PurePath
 from typing import Dict, Any, List, Optional, Callable
 from urllib.parse import urljoin
+
 import requests
 from requests.exceptions import InvalidSchema
 import xxhash
@@ -2056,18 +2057,29 @@ def fast_format_html(html_string):
 def normalize_url(href, base_url):
     """Normalize URLs to ensure consistent format"""
     from urllib.parse import urljoin, urlparse
+    if href is None:
+        return None
+
+    href_str = str(href).strip()
+    if not href_str:
+        # An empty href conventionally resolves to the base URL itself.
+        return base_url
     # Parse base URL to get components
+    parsed_href = urlparse(href_str)
+    if parsed_href.scheme and parsed_href.scheme.lower() in ["mailto", "tel", "javascript", "data", "file"]:
+        # Special-scheme hrefs (mailto:, tel:, etc.) are returned unchanged.
+        return href_str
+
     parsed_base = urlparse(base_url)
     if not parsed_base.scheme or not parsed_base.netloc:
         raise ValueError(f"Invalid base URL format: {base_url}")

-    # Ensure base_url ends with a trailing slash if it's a directory path
-    if not base_url.endswith('/'):
-        base_url = base_url + '/'
+    # # Ensure base_url ends with a trailing slash if it's a directory path
+    # if not base_url.endswith('/'):
+    #     base_url = base_url + '/'

     # Use urljoin to handle all cases
-    normalized = urljoin(base_url, href.strip())
+    normalized = urljoin(base_url, href_str)
     return normalized

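Expected behavior of the updated `normalize_url`, assuming it is importable from `crawl4ai.utils` (URLs are illustrative):

```python
from crawl4ai.utils import normalize_url

base = "https://example.com/blog/post"

print(normalize_url(None, base))              # None: missing href propagates as None
print(normalize_url("", base))                # https://example.com/blog/post (empty href -> base itself)
print(normalize_url("mailto:a@b.com", base))  # mailto:a@b.com (special schemes pass through)
print(normalize_url("  /contact ", base))     # https://example.com/contact (whitespace stripped)
print(normalize_url("../about", base))        # https://example.com/about (plain urljoin semantics)
```

Note that dropping the forced trailing slash changes relative resolution: previously the base was rewritten to `.../post/`, so `../about` resolved to `https://example.com/blog/about`; now standard `urljoin` semantics apply.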
@@ -2080,7 +2092,7 @@ def normalize_url_for_deep_crawl(href, base_url):
         return None

     # Use urljoin to handle relative URLs
-    full_url = urljoin(base_url, href.strip())
+    full_url = urljoin(base_url, str(href).strip())

     # Parse the URL for normalization
     parsed = urlparse(full_url)
@@ -2110,7 +2122,7 @@ def normalize_url_for_deep_crawl(href, base_url):
     normalized = urlunparse((
         parsed.scheme,
         netloc,
-        parsed.path.rstrip('/'),  # Normalize trailing slash
+        str(PurePath(parsed.path)).rstrip('/'),  # Normalize path to remove duplicate slashes
         parsed.params,
         query,
         fragment
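What `PurePath` actually does to a URL path, plus one portability caveat worth flagging (a sketch, not part of the commit):

```python
from pathlib import PurePath, PurePosixPath

# Collapses duplicate separators and '.' segments:
print(str(PurePath("/docs//guide/./intro/")))       # /docs/guide/intro  (on POSIX)

# Caveat: PurePath follows the host OS flavor; on Windows the same call
# yields backslashes (r'\docs\guide\intro'). PurePosixPath is the
# platform-independent choice for URL paths:
print(str(PurePosixPath("/docs//guide/./intro/")))  # /docs/guide/intro

# Edge case: an empty path becomes '.', which urlunparse would then emit
# as a literal path component for bare-domain URLs:
print(str(PurePosixPath("")))                       # .
```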
@@ -2127,7 +2139,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
         return None

     # Resolve relative URLs
-    full_url = urljoin(base_url, href.strip())
+    full_url = urljoin(base_url, str(href).strip())

     # Use proper URL parsing
     parsed = urlparse(full_url)
@@ -2135,52 +2147,55 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
     # Only perform the most critical normalizations
     # 1. Lowercase hostname
     # 2. Remove fragment
+    path = parsed.path
+    if len(path) > 1 and path.endswith('/'):
+        path = path.rstrip('/')
     normalized = urlunparse((
         parsed.scheme,
         parsed.netloc.lower(),
-        parsed.path.rstrip('/'),
+        path,
         parsed.params,
         parsed.query,
         ''  # Remove fragment
     ))

     return normalized


-def normalize_url_tmp(href, base_url):
-    """Normalize URLs to ensure consistent format"""
-    # Extract protocol and domain from base URL
-    try:
-        base_parts = base_url.split("/")
-        protocol = base_parts[0]
-        domain = base_parts[2]
-    except IndexError:
-        raise ValueError(f"Invalid base URL format: {base_url}")
+# def normalize_url_tmp(href, base_url):
+#     """Normalize URLs to ensure consistent format"""
+#     # Extract protocol and domain from base URL
+#     try:
+#         base_parts = base_url.split("/")
+#         protocol = base_parts[0]
+#         domain = base_parts[2]
+#     except IndexError:
+#         raise ValueError(f"Invalid base URL format: {base_url}")

-    # Handle special protocols
-    special_protocols = {"mailto:", "tel:", "ftp:", "file:", "data:", "javascript:"}
-    if any(href.lower().startswith(proto) for proto in special_protocols):
-        return href.strip()
+#     # Handle special protocols
+#     special_protocols = {"mailto:", "tel:", "ftp:", "file:", "data:", "javascript:"}
+#     if any(href.lower().startswith(proto) for proto in special_protocols):
+#         return href.strip()

-    # Handle anchor links
-    if href.startswith("#"):
-        return f"{base_url}{href}"
+#     # Handle anchor links
+#     if href.startswith("#"):
+#         return f"{base_url}{href}"

-    # Handle protocol-relative URLs
-    if href.startswith("//"):
-        return f"{protocol}{href}"
+#     # Handle protocol-relative URLs
+#     if href.startswith("//"):
+#         return f"{protocol}{href}"

-    # Handle root-relative URLs
-    if href.startswith("/"):
-        return f"{protocol}//{domain}{href}"
+#     # Handle root-relative URLs
+#     if href.startswith("/"):
+#         return f"{protocol}//{domain}{href}"

-    # Handle relative URLs
-    if not href.startswith(("http://", "https://")):
-        # Remove leading './' if present
-        href = href.lstrip("./")
-        return f"{protocol}//{domain}/{href}"
+#     # Handle relative URLs
+#     if not href.startswith(("http://", "https://")):
+#         # Remove leading './' if present
+#         href = href.lstrip("./")
+#         return f"{protocol}//{domain}/{href}"

-    return href.strip()
+#     return href.strip()


 def get_base_domain(url: str) -> str:
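A behavioral check of the streamlined deep-crawl normalizer, assuming it is importable from `crawl4ai.utils` (sample URL is made up):

```python
from crawl4ai.utils import efficient_normalize_url_for_deep_crawl

base = "https://Example.COM/docs/"

# Host lowercased, fragment removed, trailing slash trimmed in one pass:
print(efficient_normalize_url_for_deep_crawl("Guide/#intro", base))
# -> https://example.com/docs/Guide
```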