#1551 : Fix casing and variable name consistency for LLMConfig in documentation

2025-11-10 15:38:14 +08:00
3 changed files with 18 additions and 198 deletions
--- a/crawl4ai/async_url_seeder.py
+++ b/crawl4ai/async_url_seeder.py
@@ -845,15 +845,6 @@ class AsyncUrlSeeder:
            return

        data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
-        base_url = str(r.url)
-
-        def _normalize_loc(raw: Optional[str]) -> Optional[str]:
-            if not raw:
-                return None
-            normalized = urljoin(base_url, raw.strip())
-            if not normalized:
-                return None
-            return normalized

        # Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
        is_sitemap_index = False
@@ -866,42 +857,25 @@ class AsyncUrlSeeder:
                # Use XML parser for sitemaps, not HTML parser
                parser = etree.XMLParser(recover=True)
                root = etree.fromstring(data, parser=parser)
-                # Namespace-agnostic lookups using local-name() so we honor custom or missing namespaces
-                sitemap_loc_nodes = root.xpath("//*[local-name()='sitemap']/*[local-name()='loc']")
-                url_loc_nodes = root.xpath("//*[local-name()='url']/*[local-name()='loc']")

-                self._log(
-                    "debug",
-                    "Parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
-                    params={
-                        "url": url,
-                        "sitemap_count": len(sitemap_loc_nodes),
-                        "url_count": len(url_loc_nodes),
-                    },
-                    tag="URL_SEED",
-                )
+                # Define namespace for sitemap
+                ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

                # Check for sitemap index entries
-                if sitemap_loc_nodes:
+                sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns)
+                if sitemap_locs:
                    is_sitemap_index = True
-                    for sitemap_elem in sitemap_loc_nodes:
-                        loc = _normalize_loc(sitemap_elem.text)
+                    for sitemap_elem in sitemap_locs:
+                        loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
                        if loc:
                            sub_sitemaps.append(loc)

                # If not a sitemap index, get regular URLs
                if not is_sitemap_index:
-                    for loc_elem in url_loc_nodes:
-                        loc = _normalize_loc(loc_elem.text)
+                    for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
+                        loc = loc_elem.text.strip() if loc_elem.text else ""
                        if loc:
                            regular_urls.append(loc)
-                    if not regular_urls:
-                        self._log(
-                            "warning",
-                            "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
-                            params={"url": url},
-                            tag="URL_SEED",
-                        )
            except Exception as e:
                self._log("error", "LXML parsing error for sitemap {url}: {error}",
                          params={"url": url, "error": str(e)}, tag="URL_SEED")
@@ -918,39 +892,19 @@ class AsyncUrlSeeder:

                # Check for sitemap index entries
                sitemaps = root.findall('.//sitemap')
-                url_entries = root.findall('.//url')
-                self._log(
-                    "debug",
-                    "ElementTree parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
-                    params={
-                        "url": url,
-                        "sitemap_count": len(sitemaps),
-                        "url_count": len(url_entries),
-                    },
-                    tag="URL_SEED",
-                )
                if sitemaps:
                    is_sitemap_index = True
                    for sitemap in sitemaps:
                        loc_elem = sitemap.find('loc')
-                        loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
-                        if loc:
-                            sub_sitemaps.append(loc)
+                        if loc_elem is not None and loc_elem.text:
+                            sub_sitemaps.append(loc_elem.text.strip())

                # If not a sitemap index, get regular URLs
                if not is_sitemap_index:
-                    for url_elem in url_entries:
+                    for url_elem in root.findall('.//url'):
                        loc_elem = url_elem.find('loc')
-                        loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
-                        if loc:
-                            regular_urls.append(loc)
-                    if not regular_urls:
-                        self._log(
-                            "warning",
-                            "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
-                            params={"url": url},
-                            tag="URL_SEED",
-                        )
+                        if loc_elem is not None and loc_elem.text:
+                            regular_urls.append(loc_elem.text.strip())
            except Exception as e:
                self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
                          params={"url": url, "error": str(e)}, tag="URL_SEED")
--- a/docs/md_v2/extraction/llm-strategies.md
+++ b/docs/md_v2/extraction/llm-strategies.md
@@ -20,10 +20,10 @@ In some cases, you need to extract **complex or unstructured** information from

 ## 2. Provider-Agnostic via LiteLLM

-You can use LlmConfig, to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about LlmConfig [here](/api/parameters).
+You can use `LLMConfig` to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about `LLMConfig` [here](/api/parameters).

 ```python
-llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
+llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
 ```

 Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide:
@@ -58,7 +58,7 @@ For structured data, `"schema"` is recommended. You provide `schema=YourPydantic

 Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`.

-1. **`llmConfig`** (LlmConfig): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.    
+1. **`llm_config`** (LLMConfig): The LLM provider configuration, e.g., `LLMConfig(provider="openai/gpt-4")` or `LLMConfig(provider="ollama/llama2")`.
 2. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.  
 3. **`extraction_type`** (str): `"schema"` or `"block"`.  
 4. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”  
@@ -112,7 +112,7 @@ async def main():
    # 1. Define the LLM extraction strategy
    llm_strategy = LLMExtractionStrategy(
        llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
-        schema=Product.schema_json(), # Or use model_json_schema()
+        schema=Product.model_json_schema(), # JSON schema dict generated from the Pydantic model
        extraction_type="schema",
        instruction="Extract all product objects with 'name' and 'price' from the content.",
        chunk_token_threshold=1000,
@@ -238,7 +238,7 @@ class KnowledgeGraph(BaseModel):
 async def main():
    # LLM extraction strategy
    llm_strat = LLMExtractionStrategy(
-        llmConfig = LLMConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
+        llm_config = LLMConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
        schema=KnowledgeGraph.model_json_schema(),
        extraction_type="schema",
        instruction="Extract entities and relationships from the content. Return valid JSON.",
--- a/tests/unit/test_sitemap_namespace_parsing.py
+++ b/tests/unit/test_sitemap_namespace_parsing.py
@@ -1,134 +0,0 @@
-import sys
-from types import SimpleNamespace
-
-import pytest
-
-# Provide a lightweight stub for rank_bm25 before importing the seeder to avoid
-# optional dependency issues (e.g., incompatible wheels in CI).
-class _FakeBM25:
-    def __init__(self, corpus):
-        self._scores = [1.0] * len(corpus)
-
-    def get_scores(self, tokens):
-        return self._scores
-
-
-sys.modules.setdefault("rank_bm25", SimpleNamespace(BM25Okapi=_FakeBM25))
-
-from crawl4ai.async_url_seeder import AsyncUrlSeeder
-
-
-class DummyResponse:
-    def __init__(self, request_url: str, text: str):
-        self.status_code = 200
-        self._content = text.encode("utf-8")
-        self.url = request_url
-
-    def raise_for_status(self):
-        return None
-
-    @property
-    def content(self):
-        return self._content
-
-    @property
-    def text(self):
-        return self._content.decode("utf-8")
-
-
-class DummyAsyncClient:
-    def __init__(self, response_map):
-        self._responses = response_map
-
-    async def get(self, url, **kwargs):
-        payload = self._responses[url]
-        if callable(payload):
-            payload = payload()
-        return DummyResponse(url, payload)
-
-
-@pytest.mark.asyncio
-async def test_iter_sitemap_handles_namespace_less_sitemaps():
-    xml = """<?xml version="1.0"?>
-    <urlset>
-        <url><loc>https://example.com/a</loc></url>
-        <url><loc>https://example.com/b</loc></url>
-    </urlset>
-    """
-    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/sitemap.xml": xml}))
-
-    urls = []
-    async for u in seeder._iter_sitemap("https://example.com/sitemap.xml"):
-        urls.append(u)
-
-    assert urls == ["https://example.com/a", "https://example.com/b"]
-
-
-@pytest.mark.asyncio
-async def test_iter_sitemap_handles_custom_namespace():
-    xml = """<?xml version="1.0"?>
-    <urlset xmlns="https://custom.namespace/schema">
-        <url><loc>https://example.com/ns</loc></url>
-    </urlset>
-    """
-    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/ns-sitemap.xml": xml}))
-
-    urls = []
-    async for u in seeder._iter_sitemap("https://example.com/ns-sitemap.xml"):
-        urls.append(u)
-
-    assert urls == ["https://example.com/ns"]
-
-
-@pytest.mark.asyncio
-async def test_iter_sitemap_handles_namespace_index_and_children():
-    index_xml = """<?xml version="1.0"?>
-    <sitemapindex xmlns="http://another.example/ns">
-        <sitemap>
-            <loc>https://example.com/child-1.xml</loc>
-        </sitemap>
-        <sitemap>
-            <loc>https://example.com/child-2.xml</loc>
-        </sitemap>
-    </sitemapindex>
-    """
-    child_xml = """<?xml version="1.0"?>
-    <urlset xmlns="http://irrelevant">
-        <url><loc>https://example.com/page-{n}</loc></url>
-    </urlset>
-    """
-    responses = {
-        "https://example.com/index.xml": index_xml,
-        "https://example.com/child-1.xml": child_xml.format(n=1),
-        "https://example.com/child-2.xml": child_xml.format(n=2),
-    }
-    seeder = AsyncUrlSeeder(client=DummyAsyncClient(responses))
-
-    urls = []
-    async for u in seeder._iter_sitemap("https://example.com/index.xml"):
-        urls.append(u)
-
-    assert sorted(urls) == [
-        "https://example.com/page-1",
-        "https://example.com/page-2",
-    ]
-
-
-@pytest.mark.asyncio
-async def test_iter_sitemap_normalizes_relative_locations():
-    xml = """<?xml version="1.0"?>
-    <urlset>
-        <url><loc>/relative-path</loc></url>
-        <url><loc>https://example.com/absolute</loc></url>
-    </urlset>
-    """
-    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/sitemap.xml": xml}))
-
-    urls = []
-    async for u in seeder._iter_sitemap("https://example.com/sitemap.xml"):
-        urls.append(u)
-
-    assert urls == [
-        "https://example.com/relative-path",
-        "https://example.com/absolute",
-    ]