Merge pull request #1598 from unclecode/fix/sitemap_seeder

#1559: Add tests for sitemap parsing and URL normalization in AsyncUrlSeeder
This commit is contained in:
Nasrin
2025-11-12 18:09:34 +08:00
committed by GitHub
2 changed files with 193 additions and 13 deletions

View File

@@ -845,6 +845,15 @@ class AsyncUrlSeeder:
return return
data = gzip.decompress(r.content) if url.endswith(".gz") else r.content data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
base_url = str(r.url)
def _normalize_loc(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
normalized = urljoin(base_url, raw.strip())
if not normalized:
return None
return normalized
# Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements # Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
is_sitemap_index = False is_sitemap_index = False
@@ -857,25 +866,42 @@ class AsyncUrlSeeder:
# Use XML parser for sitemaps, not HTML parser # Use XML parser for sitemaps, not HTML parser
parser = etree.XMLParser(recover=True) parser = etree.XMLParser(recover=True)
root = etree.fromstring(data, parser=parser) root = etree.fromstring(data, parser=parser)
# Namespace-agnostic lookups using local-name() so we honor custom or missing namespaces
sitemap_loc_nodes = root.xpath("//*[local-name()='sitemap']/*[local-name()='loc']")
url_loc_nodes = root.xpath("//*[local-name()='url']/*[local-name()='loc']")
# Define namespace for sitemap self._log(
ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'} "debug",
"Parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
params={
"url": url,
"sitemap_count": len(sitemap_loc_nodes),
"url_count": len(url_loc_nodes),
},
tag="URL_SEED",
)
# Check for sitemap index entries # Check for sitemap index entries
sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns) if sitemap_loc_nodes:
if sitemap_locs:
is_sitemap_index = True is_sitemap_index = True
for sitemap_elem in sitemap_locs: for sitemap_elem in sitemap_loc_nodes:
loc = sitemap_elem.text.strip() if sitemap_elem.text else "" loc = _normalize_loc(sitemap_elem.text)
if loc: if loc:
sub_sitemaps.append(loc) sub_sitemaps.append(loc)
# If not a sitemap index, get regular URLs # If not a sitemap index, get regular URLs
if not is_sitemap_index: if not is_sitemap_index:
for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns): for loc_elem in url_loc_nodes:
loc = loc_elem.text.strip() if loc_elem.text else "" loc = _normalize_loc(loc_elem.text)
if loc: if loc:
regular_urls.append(loc) regular_urls.append(loc)
if not regular_urls:
self._log(
"warning",
"No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
params={"url": url},
tag="URL_SEED",
)
except Exception as e: except Exception as e:
self._log("error", "LXML parsing error for sitemap {url}: {error}", self._log("error", "LXML parsing error for sitemap {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED") params={"url": url, "error": str(e)}, tag="URL_SEED")
@@ -892,19 +918,39 @@ class AsyncUrlSeeder:
# Check for sitemap index entries # Check for sitemap index entries
sitemaps = root.findall('.//sitemap') sitemaps = root.findall('.//sitemap')
url_entries = root.findall('.//url')
self._log(
"debug",
"ElementTree parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
params={
"url": url,
"sitemap_count": len(sitemaps),
"url_count": len(url_entries),
},
tag="URL_SEED",
)
if sitemaps: if sitemaps:
is_sitemap_index = True is_sitemap_index = True
for sitemap in sitemaps: for sitemap in sitemaps:
loc_elem = sitemap.find('loc') loc_elem = sitemap.find('loc')
if loc_elem is not None and loc_elem.text: loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
sub_sitemaps.append(loc_elem.text.strip()) if loc:
sub_sitemaps.append(loc)
# If not a sitemap index, get regular URLs # If not a sitemap index, get regular URLs
if not is_sitemap_index: if not is_sitemap_index:
for url_elem in root.findall('.//url'): for url_elem in url_entries:
loc_elem = url_elem.find('loc') loc_elem = url_elem.find('loc')
if loc_elem is not None and loc_elem.text: loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
regular_urls.append(loc_elem.text.strip()) if loc:
regular_urls.append(loc)
if not regular_urls:
self._log(
"warning",
"No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
params={"url": url},
tag="URL_SEED",
)
except Exception as e: except Exception as e:
self._log("error", "ElementTree parsing error for sitemap {url}: {error}", self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED") params={"url": url, "error": str(e)}, tag="URL_SEED")

View File

@@ -0,0 +1,134 @@
import sys
from types import SimpleNamespace
import pytest
# Provide a lightweight stub for rank_bm25 before importing the seeder to avoid
# optional dependency issues (e.g., incompatible wheels in CI).
class _FakeBM25:
def __init__(self, corpus):
self._scores = [1.0] * len(corpus)
def get_scores(self, tokens):
return self._scores
sys.modules.setdefault("rank_bm25", SimpleNamespace(BM25Okapi=_FakeBM25))
from crawl4ai.async_url_seeder import AsyncUrlSeeder
class DummyResponse:
    """Imitates the subset of an HTTP response object the seeder touches.

    Always reports HTTP 200. The body is held as UTF-8 bytes and exposed
    both as ``content`` (bytes) and ``text`` (decoded str).
    """

    def __init__(self, request_url: str, text: str):
        self.url = request_url
        self.status_code = 200
        self._body = text.encode("utf-8")

    def raise_for_status(self):
        # Status is always 200, so this mirrors a successful real call.
        return None

    @property
    def content(self):
        """Raw response body as bytes."""
        return self._body

    @property
    def text(self):
        """Response body decoded back to a str."""
        return self._body.decode("utf-8")
class DummyAsyncClient:
    """In-memory replacement for an async HTTP client.

    ``response_map`` maps URL -> body string, or a zero-argument callable
    producing one; ``get`` raises ``KeyError`` for an unmapped URL.
    """

    def __init__(self, response_map):
        self._responses = response_map

    async def get(self, url, **kwargs):
        """Build a DummyResponse from the payload mapped to *url*."""
        body = self._responses[url]
        # Callables let a test produce its payload lazily per request.
        return DummyResponse(url, body() if callable(body) else body)
@pytest.mark.asyncio
async def test_iter_sitemap_handles_namespace_less_sitemaps():
    """A sitemap with no xmlns declaration still yields its <loc> URLs in order."""
    xml = """<?xml version="1.0"?>
<urlset>
<url><loc>https://example.com/a</loc></url>
<url><loc>https://example.com/b</loc></url>
</urlset>
"""
    sitemap_url = "https://example.com/sitemap.xml"
    seeder = AsyncUrlSeeder(client=DummyAsyncClient({sitemap_url: xml}))
    collected = [entry async for entry in seeder._iter_sitemap(sitemap_url)]
    assert collected == ["https://example.com/a", "https://example.com/b"]
@pytest.mark.asyncio
async def test_iter_sitemap_handles_custom_namespace():
    """A non-standard default namespace must not hide <url>/<loc> entries."""
    xml = """<?xml version="1.0"?>
<urlset xmlns="https://custom.namespace/schema">
<url><loc>https://example.com/ns</loc></url>
</urlset>
"""
    sitemap_url = "https://example.com/ns-sitemap.xml"
    seeder = AsyncUrlSeeder(client=DummyAsyncClient({sitemap_url: xml}))
    collected = [entry async for entry in seeder._iter_sitemap(sitemap_url)]
    assert collected == ["https://example.com/ns"]
@pytest.mark.asyncio
async def test_iter_sitemap_handles_namespace_index_and_children():
    """A namespaced <sitemapindex> is followed into each child sitemap's URLs."""
    index_xml = """<?xml version="1.0"?>
<sitemapindex xmlns="http://another.example/ns">
<sitemap>
<loc>https://example.com/child-1.xml</loc>
</sitemap>
<sitemap>
<loc>https://example.com/child-2.xml</loc>
</sitemap>
</sitemapindex>
"""
    child_xml = """<?xml version="1.0"?>
<urlset xmlns="http://irrelevant">
<url><loc>https://example.com/page-{n}</loc></url>
</urlset>
"""
    responses = {"https://example.com/index.xml": index_xml}
    for child_no in (1, 2):
        responses[f"https://example.com/child-{child_no}.xml"] = child_xml.format(n=child_no)
    seeder = AsyncUrlSeeder(client=DummyAsyncClient(responses))
    found = [entry async for entry in seeder._iter_sitemap("https://example.com/index.xml")]
    # Child fetch order is an implementation detail, so compare sorted.
    assert sorted(found) == [
        "https://example.com/page-1",
        "https://example.com/page-2",
    ]
@pytest.mark.asyncio
async def test_iter_sitemap_normalizes_relative_locations():
    """Relative <loc> values are resolved against the sitemap URL; absolute ones pass through."""
    xml = """<?xml version="1.0"?>
<urlset>
<url><loc>/relative-path</loc></url>
<url><loc>https://example.com/absolute</loc></url>
</urlset>
"""
    sitemap_url = "https://example.com/sitemap.xml"
    seeder = AsyncUrlSeeder(client=DummyAsyncClient({sitemap_url: xml}))
    collected = [entry async for entry in seeder._iter_sitemap(sitemap_url)]
    assert collected == [
        "https://example.com/relative-path",
        "https://example.com/absolute",
    ]