Compare commits

..

1 Commits

Author SHA1 Message Date
AHMET YILMAZ
2e8f8c9b49 #1551 : Fix casing and variable name consistency for LLMConfig in documentation 2025-11-10 15:38:14 +08:00
3 changed files with 18 additions and 198 deletions

View File

@@ -845,15 +845,6 @@ class AsyncUrlSeeder:
return
data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
base_url = str(r.url)
def _normalize_loc(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
normalized = urljoin(base_url, raw.strip())
if not normalized:
return None
return normalized
# Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
is_sitemap_index = False
@@ -866,42 +857,25 @@ class AsyncUrlSeeder:
# Use XML parser for sitemaps, not HTML parser
parser = etree.XMLParser(recover=True)
root = etree.fromstring(data, parser=parser)
# Namespace-agnostic lookups using local-name() so we honor custom or missing namespaces
sitemap_loc_nodes = root.xpath("//*[local-name()='sitemap']/*[local-name()='loc']")
url_loc_nodes = root.xpath("//*[local-name()='url']/*[local-name()='loc']")
self._log(
"debug",
"Parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
params={
"url": url,
"sitemap_count": len(sitemap_loc_nodes),
"url_count": len(url_loc_nodes),
},
tag="URL_SEED",
)
# Define namespace for sitemap
ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
# Check for sitemap index entries
if sitemap_loc_nodes:
sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns)
if sitemap_locs:
is_sitemap_index = True
for sitemap_elem in sitemap_loc_nodes:
loc = _normalize_loc(sitemap_elem.text)
for sitemap_elem in sitemap_locs:
loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
if loc:
sub_sitemaps.append(loc)
# If not a sitemap index, get regular URLs
if not is_sitemap_index:
for loc_elem in url_loc_nodes:
loc = _normalize_loc(loc_elem.text)
for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
loc = loc_elem.text.strip() if loc_elem.text else ""
if loc:
regular_urls.append(loc)
if not regular_urls:
self._log(
"warning",
"No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
params={"url": url},
tag="URL_SEED",
)
except Exception as e:
self._log("error", "LXML parsing error for sitemap {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED")
@@ -918,39 +892,19 @@ class AsyncUrlSeeder:
# Check for sitemap index entries
sitemaps = root.findall('.//sitemap')
url_entries = root.findall('.//url')
self._log(
"debug",
"ElementTree parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
params={
"url": url,
"sitemap_count": len(sitemaps),
"url_count": len(url_entries),
},
tag="URL_SEED",
)
if sitemaps:
is_sitemap_index = True
for sitemap in sitemaps:
loc_elem = sitemap.find('loc')
loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
if loc:
sub_sitemaps.append(loc)
if loc_elem is not None and loc_elem.text:
sub_sitemaps.append(loc_elem.text.strip())
# If not a sitemap index, get regular URLs
if not is_sitemap_index:
for url_elem in url_entries:
for url_elem in root.findall('.//url'):
loc_elem = url_elem.find('loc')
loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
if loc:
regular_urls.append(loc)
if not regular_urls:
self._log(
"warning",
"No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
params={"url": url},
tag="URL_SEED",
)
if loc_elem is not None and loc_elem.text:
regular_urls.append(loc_elem.text.strip())
except Exception as e:
self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED")

View File

@@ -20,10 +20,10 @@ In some cases, you need to extract **complex or unstructured** information from
## 2. Provider-Agnostic via LiteLLM
You can use LlmConfig, to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about LlmConfig [here](/api/parameters).
You can use LLMConfig, to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about LLMConfig [here](/api/parameters).
```python
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
```
Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide:
@@ -58,7 +58,7 @@ For structured data, `"schema"` is recommended. You provide `schema=YourPydantic
Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`.
1. **`llmConfig`** (LlmConfig): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.
1. **`llm_config`** (LLMConfig): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.
2. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.
3. **`extraction_type`** (str): `"schema"` or `"block"`.
4. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”
@@ -112,7 +112,7 @@ async def main():
# 1. Define the LLM extraction strategy
llm_strategy = LLMExtractionStrategy(
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
schema=Product.schema_json(), # Or use model_json_schema()
schema=Product.model_json_schema(), # Or use model_json_schema()
extraction_type="schema",
instruction="Extract all product objects with 'name' and 'price' from the content.",
chunk_token_threshold=1000,
@@ -238,7 +238,7 @@ class KnowledgeGraph(BaseModel):
async def main():
# LLM extraction strategy
llm_strat = LLMExtractionStrategy(
llmConfig = LLMConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
llm_config = LLMConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
schema=KnowledgeGraph.model_json_schema(),
extraction_type="schema",
instruction="Extract entities and relationships from the content. Return valid JSON.",

View File

@@ -1,134 +0,0 @@
import sys
from types import SimpleNamespace
import pytest
# Provide a lightweight stub for rank_bm25 before importing the seeder to avoid
# optional dependency issues (e.g., incompatible wheels in CI).
class _FakeBM25:
def __init__(self, corpus):
self._scores = [1.0] * len(corpus)
def get_scores(self, tokens):
return self._scores
sys.modules.setdefault("rank_bm25", SimpleNamespace(BM25Okapi=_FakeBM25))
from crawl4ai.async_url_seeder import AsyncUrlSeeder
class DummyResponse:
def __init__(self, request_url: str, text: str):
self.status_code = 200
self._content = text.encode("utf-8")
self.url = request_url
def raise_for_status(self):
return None
@property
def content(self):
return self._content
@property
def text(self):
return self._content.decode("utf-8")
class DummyAsyncClient:
def __init__(self, response_map):
self._responses = response_map
async def get(self, url, **kwargs):
payload = self._responses[url]
if callable(payload):
payload = payload()
return DummyResponse(url, payload)
@pytest.mark.asyncio
async def test_iter_sitemap_handles_namespace_less_sitemaps():
xml = """<?xml version="1.0"?>
<urlset>
<url><loc>https://example.com/a</loc></url>
<url><loc>https://example.com/b</loc></url>
</urlset>
"""
seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/sitemap.xml": xml}))
urls = []
async for u in seeder._iter_sitemap("https://example.com/sitemap.xml"):
urls.append(u)
assert urls == ["https://example.com/a", "https://example.com/b"]
@pytest.mark.asyncio
async def test_iter_sitemap_handles_custom_namespace():
xml = """<?xml version="1.0"?>
<urlset xmlns="https://custom.namespace/schema">
<url><loc>https://example.com/ns</loc></url>
</urlset>
"""
seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/ns-sitemap.xml": xml}))
urls = []
async for u in seeder._iter_sitemap("https://example.com/ns-sitemap.xml"):
urls.append(u)
assert urls == ["https://example.com/ns"]
@pytest.mark.asyncio
async def test_iter_sitemap_handles_namespace_index_and_children():
index_xml = """<?xml version="1.0"?>
<sitemapindex xmlns="http://another.example/ns">
<sitemap>
<loc>https://example.com/child-1.xml</loc>
</sitemap>
<sitemap>
<loc>https://example.com/child-2.xml</loc>
</sitemap>
</sitemapindex>
"""
child_xml = """<?xml version="1.0"?>
<urlset xmlns="http://irrelevant">
<url><loc>https://example.com/page-{n}</loc></url>
</urlset>
"""
responses = {
"https://example.com/index.xml": index_xml,
"https://example.com/child-1.xml": child_xml.format(n=1),
"https://example.com/child-2.xml": child_xml.format(n=2),
}
seeder = AsyncUrlSeeder(client=DummyAsyncClient(responses))
urls = []
async for u in seeder._iter_sitemap("https://example.com/index.xml"):
urls.append(u)
assert sorted(urls) == [
"https://example.com/page-1",
"https://example.com/page-2",
]
@pytest.mark.asyncio
async def test_iter_sitemap_normalizes_relative_locations():
xml = """<?xml version="1.0"?>
<urlset>
<url><loc>/relative-path</loc></url>
<url><loc>https://example.com/absolute</loc></url>
</urlset>
"""
seeder = AsyncUrlSeeder(client=DummyAsyncClient({"https://example.com/sitemap.xml": xml}))
urls = []
async for u in seeder._iter_sitemap("https://example.com/sitemap.xml"):
urls.append(u)
assert urls == [
"https://example.com/relative-path",
"https://example.com/absolute",
]