#1551 : Fix casing and variable name consistency for LLMConfig in documentation

2025-11-10 15:38:14 +08:00
3 changed files with 18 additions and 198 deletions
--- a/crawl4ai/async_url_seeder.py
+++ b/crawl4ai/async_url_seeder.py
@@ -845,15 +845,6 @@ class AsyncUrlSeeder:
            return
        data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
        base_url = str(r.url)
        def _normalize_loc(raw: Optional[str]) -> Optional[str]:
            if not raw:
                return None
            normalized = urljoin(base_url, raw.strip())
            if not normalized:
                return None
            return normalized
        # Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
        is_sitemap_index = False
@@ -866,42 +857,25 @@ class AsyncUrlSeeder:
                # Use XML parser for sitemaps, not HTML parser
                parser = etree.XMLParser(recover=True)
                root = etree.fromstring(data, parser=parser)
                # Namespace-agnostic lookups using local-name() so we honor custom or missing namespaces
                sitemap_loc_nodes = root.xpath("//*[local-name()='sitemap']/*[local-name()='loc']")
                url_loc_nodes = root.xpath("//*[local-name()='url']/*[local-name()='loc']")
-                self._log(
+                # Define namespace for sitemap
-                    "debug",
+                ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
                    "Parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
                    params={
                        "url": url,
                        "sitemap_count": len(sitemap_loc_nodes),
                        "url_count": len(url_loc_nodes),
                    },
                    tag="URL_SEED",
                )
                # Check for sitemap index entries
-                if sitemap_loc_nodes:
+                sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns)
                if sitemap_locs:
                    is_sitemap_index = True
-                    for sitemap_elem in sitemap_loc_nodes:
+                    for sitemap_elem in sitemap_locs:
-                        loc = _normalize_loc(sitemap_elem.text)
+                        loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
                        if loc:
                            sub_sitemaps.append(loc)
                # If not a sitemap index, get regular URLs
                if not is_sitemap_index:
-                    for loc_elem in url_loc_nodes:
+                    for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
-                        loc = _normalize_loc(loc_elem.text)
+                        loc = loc_elem.text.strip() if loc_elem.text else ""
                        if loc:
                            regular_urls.append(loc)
                    if not regular_urls:
                        self._log(
                            "warning",
                            "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
                            params={"url": url},
                            tag="URL_SEED",
                        )
            except Exception as e:
                self._log("error", "LXML parsing error for sitemap {url}: {error}",
                          params={"url": url, "error": str(e)}, tag="URL_SEED")
@@ -918,39 +892,19 @@ class AsyncUrlSeeder:
                # Check for sitemap index entries
                sitemaps = root.findall('.//sitemap')
                url_entries = root.findall('.//url')
                self._log(
                    "debug",
                    "ElementTree parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
                    params={
                        "url": url,
                        "sitemap_count": len(sitemaps),
                        "url_count": len(url_entries),
                    },
                    tag="URL_SEED",
                )
                if sitemaps:
                    is_sitemap_index = True
                    for sitemap in sitemaps:
                        loc_elem = sitemap.find('loc')
-                        loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
+                        if loc_elem is not None and loc_elem.text:
-                        if loc:
+                            sub_sitemaps.append(loc_elem.text.strip())
                            sub_sitemaps.append(loc)
                # If not a sitemap index, get regular URLs
                if not is_sitemap_index:
-                    for url_elem in url_entries:
+                    for url_elem in root.findall('.//url'):
                        loc_elem = url_elem.find('loc')
-                        loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
+                        if loc_elem is not None and loc_elem.text:
-                        if loc:
+                            regular_urls.append(loc_elem.text.strip())
                            regular_urls.append(loc)
                    if not regular_urls:
                        self._log(
                            "warning",
                            "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
                            params={"url": url},
                            tag="URL_SEED",
                        )
            except Exception as e:
                self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
                          params={"url": url, "error": str(e)}, tag="URL_SEED")
--- a/docs/md_v2/extraction/llm-strategies.md
+++ b/docs/md_v2/extraction/llm-strategies.md
@@ -20,10 +20,10 @@ In some cases, you need to extract **complex or unstructured** information from
 ## 2. Provider-Agnostic via LiteLLM
-You can use LlmConfig, to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about LlmConfig [here](/api/parameters).
+You can use LLMConfig, to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about LLMConfig [here](/api/parameters).
 ```python
-llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
+llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
 ```
 Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide:
@@ -58,7 +58,7 @@ For structured data, `"schema"` is recommended. You provide `schema=YourPydantic
 Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`.
-1. **`llmConfig`** (LlmConfig): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.    
+1. **`llm_config`** (LLMConfig): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.
 2. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.  
 3. **`extraction_type`** (str): `"schema"` or `"block"`.  
 4. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”  
@@ -112,7 +112,7 @@ async def main():
    # 1. Define the LLM extraction strategy
    llm_strategy = LLMExtractionStrategy(
        llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
-        schema=Product.schema_json(), # Or use model_json_schema()
+        schema=Product.model_json_schema(), # Or use model_json_schema()
        extraction_type="schema",
        instruction="Extract all product objects with 'name' and 'price' from the content.",
        chunk_token_threshold=1000,
@@ -238,7 +238,7 @@ class KnowledgeGraph(BaseModel):
 async def main():
    # LLM extraction strategy
    llm_strat = LLMExtractionStrategy(
-        llmConfig = LLMConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
+        llm_config = LLMConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
        schema=KnowledgeGraph.model_json_schema(),
        extraction_type="schema",
        instruction="Extract entities and relationships from the content. Return valid JSON.",
--- a/tests/unit/test_sitemap_namespace_parsing.py
+++ b/tests/unit/test_sitemap_namespace_parsing.py
@@ -1,134 +0,0 @@
 import sys
 from types import SimpleNamespace
 import pytest
 # Provide a lightweight stub for rank_bm25 before importing the seeder to avoid
 # optional dependency issues (e.g., incompatible wheels in CI).
 class _FakeBM25:
    def __init__(self, corpus):
        self._scores = [1.0] * len(corpus)
    def get_scores(self, tokens):
        return self._scores
 sys.modules.setdefault("rank_bm25", SimpleNamespace(BM25Okapi=_FakeBM25))
 from crawl4ai.async_url_seeder import AsyncUrlSeeder
 class DummyResponse:
    def __init__(self, request_url: str, text: str):
        self.status_code = 200
        self._content = text.encode("utf-8")
        self.url = request_url
    def raise_for_status(self):
        return None
    @property
    def content(self):
        return self._content
    @property
    def text(self):
        return self._content.decode("utf-8")
 class DummyAsyncClient:
    def __init__(self, response_map):
        self._responses = response_map
    async def get(self, url, **kwargs):
        payload = self._responses[url]
        if callable(payload):
            payload = payload()
        return DummyResponse(url, payload)
@pytest.mark.asyncio
 async def test_iter_sitemap_handles_namespace_less_sitemaps():
    xml = """<?xml version="1.0"?>
    <urlset>
        <url><loc>https://example.com/a</loc></url>
        <url><loc>https://example.com/b</loc></url>
    </urlset>
    """
    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/sitemap.xml": xml}))
    urls = []
    async for u in seeder._iter_sitemap("https://example.com/sitemap.xml"):
        urls.append(u)
    assert urls == ["https://example.com/a", "https://example.com/b"]
@pytest.mark.asyncio
 async def test_iter_sitemap_handles_custom_namespace():
    xml = """<?xml version="1.0"?>
    <urlset xmlns="https://custom.namespace/schema">
        <url><loc>https://example.com/ns</loc></url>
    </urlset>
    """
    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/ns-sitemap.xml": xml}))
    urls = []
    async for u in seeder._iter_sitemap("https://example.com/ns-sitemap.xml"):
        urls.append(u)
    assert urls == ["https://example.com/ns"]
@pytest.mark.asyncio
 async def test_iter_sitemap_handles_namespace_index_and_children():
    index_xml = """<?xml version="1.0"?>
    <sitemapindex xmlns="http://another.example/ns">
        <sitemap>
            <loc>https://example.com/child-1.xml</loc>
        </sitemap>
        <sitemap>
            <loc>https://example.com/child-2.xml</loc>
        </sitemap>
    </sitemapindex>
    """
    child_xml = """<?xml version="1.0"?>
    <urlset xmlns="http://irrelevant">
        <url><loc>https://example.com/page-{n}</loc></url>
    </urlset>
    """
    responses = {
        "https://example.com/index.xml": index_xml,
        "https://example.com/child-1.xml": child_xml.format(n=1),
        "https://example.com/child-2.xml": child_xml.format(n=2),
    }
    seeder = AsyncUrlSeeder(client=DummyAsyncClient(responses))
    urls = []
    async for u in seeder._iter_sitemap("https://example.com/index.xml"):
        urls.append(u)
    assert sorted(urls) == [
        "https://example.com/page-1",
        "https://example.com/page-2",
    ]
@pytest.mark.asyncio
 async def test_iter_sitemap_normalizes_relative_locations():
    xml = """<?xml version="1.0"?>
    <urlset>
        <url><loc>/relative-path</loc></url>
        <url><loc>https://example.com/absolute</loc></url>
    </urlset>
    """
    seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/sitemap.xml": xml}))
    urls = []
    async for u in seeder._iter_sitemap("https://example.com/sitemap.xml"):
        urls.append(u)
    assert urls == [
        "https://example.com/relative-path",
        "https://example.com/absolute",
    ]