Merge pull request #1598 from unclecode/fix/sitemap_seeder

#1559: Add tests for sitemap parsing and URL normalization in AsyncUrlSeeder
This commit is contained in:
Nasrin
2025-11-12 18:09:34 +08:00
committed by GitHub
2 changed files with 193 additions and 13 deletions

View File

@@ -845,6 +845,15 @@ class AsyncUrlSeeder:
return return
data = gzip.decompress(r.content) if url.endswith(".gz") else r.content data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
base_url = str(r.url)
def _normalize_loc(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
normalized = urljoin(base_url, raw.strip())
if not normalized:
return None
return normalized
# Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements # Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
is_sitemap_index = False is_sitemap_index = False
@@ -857,25 +866,42 @@ class AsyncUrlSeeder:
# Use XML parser for sitemaps, not HTML parser # Use XML parser for sitemaps, not HTML parser
parser = etree.XMLParser(recover=True) parser = etree.XMLParser(recover=True)
root = etree.fromstring(data, parser=parser) root = etree.fromstring(data, parser=parser)
# Namespace-agnostic lookups using local-name() so we honor custom or missing namespaces
sitemap_loc_nodes = root.xpath("//*[local-name()='sitemap']/*[local-name()='loc']")
url_loc_nodes = root.xpath("//*[local-name()='url']/*[local-name()='loc']")
# Define namespace for sitemap self._log(
ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'} "debug",
"Parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
params={
"url": url,
"sitemap_count": len(sitemap_loc_nodes),
"url_count": len(url_loc_nodes),
},
tag="URL_SEED",
)
# Check for sitemap index entries # Check for sitemap index entries
sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns) if sitemap_loc_nodes:
if sitemap_locs:
is_sitemap_index = True is_sitemap_index = True
for sitemap_elem in sitemap_locs: for sitemap_elem in sitemap_loc_nodes:
loc = sitemap_elem.text.strip() if sitemap_elem.text else "" loc = _normalize_loc(sitemap_elem.text)
if loc: if loc:
sub_sitemaps.append(loc) sub_sitemaps.append(loc)
# If not a sitemap index, get regular URLs # If not a sitemap index, get regular URLs
if not is_sitemap_index: if not is_sitemap_index:
for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns): for loc_elem in url_loc_nodes:
loc = loc_elem.text.strip() if loc_elem.text else "" loc = _normalize_loc(loc_elem.text)
if loc: if loc:
regular_urls.append(loc) regular_urls.append(loc)
if not regular_urls:
self._log(
"warning",
"No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
params={"url": url},
tag="URL_SEED",
)
except Exception as e: except Exception as e:
self._log("error", "LXML parsing error for sitemap {url}: {error}", self._log("error", "LXML parsing error for sitemap {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED") params={"url": url, "error": str(e)}, tag="URL_SEED")
@@ -892,19 +918,39 @@ class AsyncUrlSeeder:
# Check for sitemap index entries # Check for sitemap index entries
sitemaps = root.findall('.//sitemap') sitemaps = root.findall('.//sitemap')
url_entries = root.findall('.//url')
self._log(
"debug",
"ElementTree parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
params={
"url": url,
"sitemap_count": len(sitemaps),
"url_count": len(url_entries),
},
tag="URL_SEED",
)
if sitemaps: if sitemaps:
is_sitemap_index = True is_sitemap_index = True
for sitemap in sitemaps: for sitemap in sitemaps:
loc_elem = sitemap.find('loc') loc_elem = sitemap.find('loc')
if loc_elem is not None and loc_elem.text: loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
sub_sitemaps.append(loc_elem.text.strip()) if loc:
sub_sitemaps.append(loc)
# If not a sitemap index, get regular URLs # If not a sitemap index, get regular URLs
if not is_sitemap_index: if not is_sitemap_index:
for url_elem in root.findall('.//url'): for url_elem in url_entries:
loc_elem = url_elem.find('loc') loc_elem = url_elem.find('loc')
if loc_elem is not None and loc_elem.text: loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
regular_urls.append(loc_elem.text.strip()) if loc:
regular_urls.append(loc)
if not regular_urls:
self._log(
"warning",
"No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
params={"url": url},
tag="URL_SEED",
)
except Exception as e: except Exception as e:
self._log("error", "ElementTree parsing error for sitemap {url}: {error}", self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED") params={"url": url, "error": str(e)}, tag="URL_SEED")

View File

@@ -0,0 +1,134 @@
import sys
from types import SimpleNamespace
import pytest
# Provide a lightweight stub for rank_bm25 before importing the seeder to avoid
# optional dependency issues (e.g., incompatible wheels in CI).
class _FakeBM25:
def __init__(self, corpus):
self._scores = [1.0] * len(corpus)
def get_scores(self, tokens):
return self._scores
sys.modules.setdefault("rank_bm25", SimpleNamespace(BM25Okapi=_FakeBM25))
from crawl4ai.async_url_seeder import AsyncUrlSeeder
class DummyResponse:
    """Imitates the subset of an HTTP response object the seeder touches.

    Always reports HTTP 200. The body is held as UTF-8 bytes and exposed
    both as ``content`` (bytes) and ``text`` (decoded str).
    """

    def __init__(self, request_url: str, text: str):
        self.url = request_url
        self.status_code = 200
        self._body = text.encode("utf-8")

    def raise_for_status(self):
        # Status is always 200, so this mirrors a successful real call.
        return None

    @property
    def content(self):
        """Raw response body as bytes."""
        return self._body

    @property
    def text(self):
        """Response body decoded back to a str."""
        return self._body.decode("utf-8")
class DummyAsyncClient:
    """In-memory replacement for an async HTTP client.

    ``response_map`` maps URL -> body string, or a zero-argument callable
    producing one; ``get`` raises ``KeyError`` for an unmapped URL.
    """

    def __init__(self, response_map):
        self._responses = response_map

    async def get(self, url, **kwargs):
        """Build a DummyResponse from the payload mapped to *url*."""
        body = self._responses[url]
        # Callables let a test produce its payload lazily per request.
        return DummyResponse(url, body() if callable(body) else body)
@pytest.mark.asyncio
async def test_iter_sitemap_handles_namespace_less_sitemaps():
    """A sitemap with no xmlns declaration still yields its <loc> URLs in order."""
    xml = """<?xml version="1.0"?>
<urlset>
<url><loc>https://example.com/a</loc></url>
<url><loc>https://example.com/b</loc></url>
</urlset>
"""
    sitemap_url = "https://example.com/sitemap.xml"
    seeder = AsyncUrlSeeder(client=DummyAsyncClient({sitemap_url: xml}))
    collected = [entry async for entry in seeder._iter_sitemap(sitemap_url)]
    assert collected == ["https://example.com/a", "https://example.com/b"]
@pytest.mark.asyncio
async def test_iter_sitemap_handles_custom_namespace():
    """A non-standard default namespace must not hide <url>/<loc> entries."""
    xml = """<?xml version="1.0"?>
<urlset xmlns="https://custom.namespace/schema">
<url><loc>https://example.com/ns</loc></url>
</urlset>
"""
    sitemap_url = "https://example.com/ns-sitemap.xml"
    seeder = AsyncUrlSeeder(client=DummyAsyncClient({sitemap_url: xml}))
    collected = [entry async for entry in seeder._iter_sitemap(sitemap_url)]
    assert collected == ["https://example.com/ns"]
@pytest.mark.asyncio
async def test_iter_sitemap_handles_namespace_index_and_children():
    """A namespaced <sitemapindex> is followed into each child sitemap's URLs."""
    index_xml = """<?xml version="1.0"?>
<sitemapindex xmlns="http://another.example/ns">
<sitemap>
<loc>https://example.com/child-1.xml</loc>
</sitemap>
<sitemap>
<loc>https://example.com/child-2.xml</loc>
</sitemap>
</sitemapindex>
"""
    child_xml = """<?xml version="1.0"?>
<urlset xmlns="http://irrelevant">
<url><loc>https://example.com/page-{n}</loc></url>
</urlset>
"""
    responses = {"https://example.com/index.xml": index_xml}
    for child_no in (1, 2):
        responses[f"https://example.com/child-{child_no}.xml"] = child_xml.format(n=child_no)
    seeder = AsyncUrlSeeder(client=DummyAsyncClient(responses))
    found = [entry async for entry in seeder._iter_sitemap("https://example.com/index.xml")]
    # Child fetch order is an implementation detail, so compare sorted.
    assert sorted(found) == [
        "https://example.com/page-1",
        "https://example.com/page-2",
    ]
@pytest.mark.asyncio
async def test_iter_sitemap_normalizes_relative_locations():
    """Relative <loc> values are resolved against the sitemap URL; absolute ones pass through."""
    xml = """<?xml version="1.0"?>
<urlset>
<url><loc>/relative-path</loc></url>
<url><loc>https://example.com/absolute</loc></url>
</urlset>
"""
    sitemap_url = "https://example.com/sitemap.xml"
    seeder = AsyncUrlSeeder(client=DummyAsyncClient({sitemap_url: xml}))
    collected = [entry async for entry in seeder._iter_sitemap(sitemap_url)]
    assert collected == [
        "https://example.com/relative-path",
        "https://example.com/absolute",
    ]