Merge PR #1746: Fix sitemap-only URL seeding to avoid Common Crawl calls
This commit is contained in:
@@ -400,18 +400,20 @@ class AsyncUrlSeeder:
|
|||||||
if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
|
if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
|
||||||
self.logger.verbose = config.verbose
|
self.logger.verbose = config.verbose
|
||||||
|
|
||||||
# ensure we have the latest CC collection id
|
|
||||||
if self.index_id is None:
|
|
||||||
self.index_id = await self._latest_index()
|
|
||||||
|
|
||||||
# Parse source parameter - split by '+' to get list of sources
|
# Parse source parameter - split by '+' to get list of sources
|
||||||
sources = source.split('+')
|
sources = [s.strip().lower() for s in source.split("+") if s.strip()]
|
||||||
|
|
||||||
valid_sources = {"cc", "sitemap"}
|
valid_sources = {"cc", "sitemap"}
|
||||||
for s in sources:
|
for s in sources:
|
||||||
if s not in valid_sources:
|
if s not in valid_sources:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
|
f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
|
||||||
|
|
||||||
|
# ensure we have the latest CC collection id when the source is cc
|
||||||
|
if s == "cc" and self.index_id is None:
|
||||||
|
self.index_id = await self._latest_index()
|
||||||
|
|
||||||
|
|
||||||
if hits_per_sec:
|
if hits_per_sec:
|
||||||
if hits_per_sec <= 0:
|
if hits_per_sec <= 0:
|
||||||
self._log(
|
self._log(
|
||||||
|
|||||||
30
tests/general/test_url_seeder_for_only_sitemap.py
Normal file
30
tests/general/test_url_seeder_for_only_sitemap.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncLogger, AsyncUrlSeeder, SeedingConfig
|
||||||
|
from pathlib import Path
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
|
||||||
|
async def test_sitemap_source_does_not_hit_commoncrawl():
    """Regression test: source="sitemap" must never query Common Crawl.

    Before the fix, AsyncUrlSeeder.urls() fetched the latest Common Crawl
    collection id unconditionally; with source="sitemap" that lookup must be
    skipped entirely. We stub out ``_latest_index`` so that any call to it
    raises, turning an accidental Common Crawl hit into a hard test failure.

    NOTE(review): this test still performs a live sitemap fetch against
    https://docs.crawl4ai.com/ — consider mocking the sitemap request too so
    the test runs offline.
    """
    config = SeedingConfig(
        source="sitemap",
        live_check=False,
        extract_head=False,
        max_urls=50,
        verbose=True,
        force=False
    )

    async with AsyncUrlSeeder(logger=AsyncLogger(verbose=True)) as seeder:
        # Sentinel stub: fails loudly if the Common Crawl index is consulted.
        async def boom(*args, **kwargs):
            raise httpx.ConnectTimeout("Simulated CommonCrawl outage")

        seeder._latest_index = boom

        try:
            await seeder.urls("https://docs.crawl4ai.com/", config)
        except httpx.ConnectTimeout:
            # Previously this branch only print()-ed "FAIL", so pytest would
            # still report the test as passing. Raise a real assertion error
            # so CI actually catches the regression.
            raise AssertionError(
                "_latest_index WAS called even though source='sitemap'."
            ) from None
||||||
|
if __name__ == "__main__":
    # Manual entry point: allow running this async test directly as a script.
    main_coro = test_sitemap_source_does_not_hit_commoncrawl()
    asyncio.run(main_coro)
|
||||||
Reference in New Issue
Block a user