From 80745bceb9589de51270739859552e71c048c763 Mon Sep 17 00:00:00 2001
From: AHMET YILMAZ
Date: Mon, 10 Nov 2025 14:15:54 +0800
Subject: [PATCH] #1559: Add tests for sitemap parsing and URL normalization in
 AsyncUrlSeeder

---
 crawl4ai/async_url_seeder.py                 |  72 ++++++++--
 tests/unit/test_sitemap_namespace_parsing.py | 134 +++++++++++++++++++
 2 files changed, 193 insertions(+), 13 deletions(-)
 create mode 100644 tests/unit/test_sitemap_namespace_parsing.py

diff --git a/crawl4ai/async_url_seeder.py b/crawl4ai/async_url_seeder.py
index d2564797..91f61837 100644
--- a/crawl4ai/async_url_seeder.py
+++ b/crawl4ai/async_url_seeder.py
@@ -845,6 +845,15 @@ class AsyncUrlSeeder:
             return
 
         data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
+        base_url = str(r.url)
+
+        def _normalize_loc(raw: Optional[str]) -> Optional[str]:
+            # Strip before joining: urljoin(base_url, "") returns base_url, so a
+            # blank <loc> would otherwise resolve to the sitemap's own URL.
+            cleaned = raw.strip() if raw else ""
+            if not cleaned:
+                return None
+            return urljoin(base_url, cleaned)
 
         # Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
         is_sitemap_index = False
@@ -857,25 +866,42 @@ class AsyncUrlSeeder:
                 # Use XML parser for sitemaps, not HTML parser
                 parser = etree.XMLParser(recover=True)
                 root = etree.fromstring(data, parser=parser)
+                # Namespace-agnostic lookups using local-name() so we honor custom or missing namespaces
+                sitemap_loc_nodes = root.xpath("//*[local-name()='sitemap']/*[local-name()='loc']")
+                url_loc_nodes = root.xpath("//*[local-name()='url']/*[local-name()='loc']")
 
-                # Define namespace for sitemap
-                ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
+                self._log(
+                    "debug",
+                    "Parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
+                    params={
+                        "url": url,
+                        "sitemap_count": len(sitemap_loc_nodes),
+                        "url_count": len(url_loc_nodes),
+                    },
+                    tag="URL_SEED",
+                )
 
                 # Check for sitemap index entries
-                sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns)
-                if sitemap_locs:
+                if sitemap_loc_nodes:
                     is_sitemap_index = True
-                    for sitemap_elem in sitemap_locs:
-                        loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
+                    for sitemap_elem in sitemap_loc_nodes:
+                        loc = _normalize_loc(sitemap_elem.text)
                         if loc:
                             sub_sitemaps.append(loc)
 
                 # If not a sitemap index, get regular URLs
                 if not is_sitemap_index:
-                    for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
-                        loc = loc_elem.text.strip() if loc_elem.text else ""
+                    for loc_elem in url_loc_nodes:
+                        loc = _normalize_loc(loc_elem.text)
                         if loc:
                             regular_urls.append(loc)
+                    if not regular_urls:
+                        self._log(
+                            "warning",
+                            "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
+                            params={"url": url},
+                            tag="URL_SEED",
+                        )
             except Exception as e:
                 self._log("error", "LXML parsing error for sitemap {url}: {error}",
                           params={"url": url, "error": str(e)}, tag="URL_SEED")
@@ -892,19 +918,39 @@ class AsyncUrlSeeder:
 
                 # Check for sitemap index entries
                 sitemaps = root.findall('.//sitemap')
+                url_entries = root.findall('.//url')
+                self._log(
+                    "debug",
+                    "ElementTree parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
+                    params={
+                        "url": url,
+                        "sitemap_count": len(sitemaps),
+                        "url_count": len(url_entries),
+                    },
+                    tag="URL_SEED",
+                )
                 if sitemaps:
                     is_sitemap_index = True
                     for sitemap in sitemaps:
                         loc_elem = sitemap.find('loc')
-                        if loc_elem is not None and loc_elem.text:
-                            sub_sitemaps.append(loc_elem.text.strip())
+                        loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
+                        if loc:
+                            sub_sitemaps.append(loc)
 
                 # If not a sitemap index, get regular URLs
                 if not is_sitemap_index:
-                    for url_elem in root.findall('.//url'):
+                    for url_elem in url_entries:
                         loc_elem = url_elem.find('loc')
-                        if loc_elem is not None and loc_elem.text:
-                            regular_urls.append(loc_elem.text.strip())
+                        loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
+                        if loc:
+                            regular_urls.append(loc)
+                    if not regular_urls:
+                        self._log(
+                            "warning",
+                            "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
+                            params={"url": url},
+                            tag="URL_SEED",
+                        )
             except Exception as e:
                 self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
                           params={"url": url, "error": str(e)}, tag="URL_SEED")
diff --git a/tests/unit/test_sitemap_namespace_parsing.py b/tests/unit/test_sitemap_namespace_parsing.py
new file mode 100644
index 00000000..3370ddb5
--- /dev/null
+++ b/tests/unit/test_sitemap_namespace_parsing.py
@@ -0,0 +1,134 @@
+import sys
+from types import SimpleNamespace
+
+import pytest
+
+# Provide a lightweight stub for rank_bm25 before importing the seeder to avoid
+# optional dependency issues (e.g., incompatible wheels in CI).
+class _FakeBM25:
+    def __init__(self, corpus):
+        self._scores = [1.0] * len(corpus)
+
+    def get_scores(self, tokens):
+        return self._scores
+
+
+sys.modules.setdefault("rank_bm25", SimpleNamespace(BM25Okapi=_FakeBM25))
+
+from crawl4ai.async_url_seeder import AsyncUrlSeeder
+
+
+class DummyResponse:
+    def __init__(self, request_url: str, text: str):
+        self.status_code = 200
+        self._content = text.encode("utf-8")
+        self.url = request_url
+
+    def raise_for_status(self):
+        return None
+
+    @property
+    def content(self):
+        return self._content
+
+    @property
+    def text(self):
+        return self._content.decode("utf-8")
+
+
+class DummyAsyncClient:
+    def __init__(self, response_map):
+        self._responses = response_map
+
+    async def get(self, url, **kwargs):
+        payload = self._responses[url]
+        if callable(payload):
+            payload = payload()
+        return DummyResponse(url, payload)
+
+
+@pytest.mark.asyncio
+async def test_iter_sitemap_handles_namespace_less_sitemaps():
+    xml = """<?xml version="1.0"?>
+    <urlset>
+        <url><loc>https://example.com/a</loc></url>
+        <url><loc>https://example.com/b</loc></url>
+    </urlset>
+    """
+    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/sitemap.xml": xml}))
+
+    urls = []
+    async for u in seeder._iter_sitemap("https://example.com/sitemap.xml"):
+        urls.append(u)
+
+    assert urls == ["https://example.com/a", "https://example.com/b"]
+
+
+@pytest.mark.asyncio
+async def test_iter_sitemap_handles_custom_namespace():
+    xml = """<?xml version="1.0"?>
+    <urlset xmlns="https://custom.namespace/schema">
+        <url><loc>https://example.com/ns</loc></url>
+    </urlset>
+    """
+    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/ns-sitemap.xml": xml}))
+
+    urls = []
+    async for u in seeder._iter_sitemap("https://example.com/ns-sitemap.xml"):
+        urls.append(u)
+
+    assert urls == ["https://example.com/ns"]
+
+
+@pytest.mark.asyncio
+async def test_iter_sitemap_handles_namespace_index_and_children():
+    index_xml = """<?xml version="1.0"?>
+    <sitemapindex xmlns="http://another.example/ns">
+        <sitemap>
+            <loc>https://example.com/child-1.xml</loc>
+        </sitemap>
+        <sitemap>
+            <loc>https://example.com/child-2.xml</loc>
+        </sitemap>
+    </sitemapindex>
+    """
+    child_xml = """<?xml version="1.0"?>
+    <urlset xmlns="http://another.example/ns">
+        <url><loc>https://example.com/page-{n}</loc></url>
+    </urlset>
+    """
+    responses = {
+        "https://example.com/index.xml": index_xml,
+        "https://example.com/child-1.xml": child_xml.format(n=1),
+        "https://example.com/child-2.xml": child_xml.format(n=2),
+    }
+    seeder = AsyncUrlSeeder(client=DummyAsyncClient(responses))
+
+    urls = []
+    async for u in seeder._iter_sitemap("https://example.com/index.xml"):
+        urls.append(u)
+
+    assert sorted(urls) == [
+        "https://example.com/page-1",
+        "https://example.com/page-2",
+    ]
+
+
+@pytest.mark.asyncio
+async def test_iter_sitemap_normalizes_relative_locations():
+    xml = """<?xml version="1.0"?>
+    <urlset>
+        <url><loc>/relative-path</loc></url>
+        <url><loc>https://example.com/absolute</loc></url>
+    </urlset>
+    """
+    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/sitemap.xml": xml}))
+
+    urls = []
+    async for u in seeder._iter_sitemap("https://example.com/sitemap.xml"):
+        urls.append(u)
+
+    assert urls == [
+        "https://example.com/relative-path",
+        "https://example.com/absolute",
+    ]