Compare commits
2 Commits
main
...
fix-async-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
43a2088eb0 | ||
|
|
c2c4d42be4 |
@@ -166,6 +166,22 @@ class AsyncUrlSeeder:
|
|||||||
Async version of UrlSeeder.
|
Async version of UrlSeeder.
|
||||||
Call pattern is await/async for / async with.
|
Call pattern is await/async for / async with.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
ttl : timedelta, default TTL
|
||||||
|
Time-to-live for cached results.
|
||||||
|
client : httpx.AsyncClient, optional
|
||||||
|
HTTP client to use. If None, creates a new one.
|
||||||
|
logger : AsyncLoggerBase, optional
|
||||||
|
Logger instance for logging messages.
|
||||||
|
base_directory : str or pathlib.Path, optional
|
||||||
|
Base directory for cache storage. Defaults to home directory.
|
||||||
|
cache_root : str or pathlib.Path, optional
|
||||||
|
Root directory for URL seeder cache. Defaults to ~/.cache/url_seeder.
|
||||||
|
verify_redirect_targets : bool, default True
|
||||||
|
Whether to verify that redirect targets are alive (2xx status) before returning them.
|
||||||
|
When False, returns redirect targets without verification (legacy behavior).
|
||||||
|
|
||||||
Public coroutines
|
Public coroutines
|
||||||
-----------------
|
-----------------
|
||||||
await seed.urls(...)
|
await seed.urls(...)
|
||||||
@@ -203,6 +219,8 @@ class AsyncUrlSeeder:
|
|||||||
# NEW: Add base_directory
|
# NEW: Add base_directory
|
||||||
base_directory: Optional[Union[str, pathlib.Path]] = None,
|
base_directory: Optional[Union[str, pathlib.Path]] = None,
|
||||||
cache_root: Optional[Union[str, Path]] = None,
|
cache_root: Optional[Union[str, Path]] = None,
|
||||||
|
# NEW: Control redirect target verification
|
||||||
|
verify_redirect_targets: bool = True,
|
||||||
):
|
):
|
||||||
self.ttl = ttl
|
self.ttl = ttl
|
||||||
self._owns_client = client is None # Track if we created the client
|
self._owns_client = client is None # Track if we created the client
|
||||||
@@ -228,6 +246,9 @@ class AsyncUrlSeeder:
|
|||||||
(self.cache_root / "live").mkdir(parents=True, exist_ok=True)
|
(self.cache_root / "live").mkdir(parents=True, exist_ok=True)
|
||||||
(self.cache_root / "head").mkdir(exist_ok=True)
|
(self.cache_root / "head").mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
# Store redirect verification setting
|
||||||
|
self.verify_redirect_targets = verify_redirect_targets
|
||||||
|
|
||||||
def _log(self, level: str, message: str, tag: str = "URL_SEED", **kwargs: Any):
|
def _log(self, level: str, message: str, tag: str = "URL_SEED", **kwargs: Any):
|
||||||
"""Helper to log messages using the provided logger, if available."""
|
"""Helper to log messages using the provided logger, if available."""
|
||||||
if self.logger:
|
if self.logger:
|
||||||
@@ -682,24 +703,47 @@ class AsyncUrlSeeder:
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
* the same URL if it answers 2xx,
|
* the same URL if it answers 2xx,
|
||||||
* the absolute redirect target if it answers 3xx,
|
* the absolute redirect target if it answers 3xx (when verify_redirect_targets=True, only if that target itself answers 2xx),
|
||||||
* None on any other status or network error.
|
* None on any other status or network error.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
r = await self.client.head(url, timeout=10, follow_redirects=False)
|
r = await self.client.head(url, timeout=10, follow_redirects=False)
|
||||||
|
# direct 2xx hit
|
||||||
# direct hit
|
|
||||||
if 200 <= r.status_code < 300:
|
if 200 <= r.status_code < 300:
|
||||||
return str(r.url)
|
return str(r.url)
|
||||||
|
# single-level redirect (3xx)
|
||||||
# single level redirect
|
|
||||||
if r.status_code in (301, 302, 303, 307, 308):
|
if r.status_code in (301, 302, 303, 307, 308):
|
||||||
loc = r.headers.get("location")
|
loc = r.headers.get("location")
|
||||||
if loc:
|
if loc:
|
||||||
return urljoin(url, loc)
|
target = urljoin(url, loc)
|
||||||
|
# A redirect that points back at the same URL can never resolve; treat it as dead
|
||||||
|
if target == url:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# If not verifying redirect targets, return immediately (old behavior)
|
||||||
|
if not self.verify_redirect_targets:
|
||||||
|
return target
|
||||||
|
|
||||||
|
# Verify redirect target is alive (new behavior)
|
||||||
|
try:
|
||||||
|
r2 = await self.client.head(target, timeout=10, follow_redirects=False)
|
||||||
|
if 200 <= r2.status_code < 300:
|
||||||
|
return str(r2.url)
|
||||||
|
# Multi-hop chains are not followed: a second 3xx on the target falls through to the failure branch below, since the spec only calls for one redirect level
|
||||||
|
else:
|
||||||
|
self._log(
|
||||||
|
"debug",
|
||||||
|
"HEAD redirect target {target} did not resolve: status {status}",
|
||||||
|
params={"target": target, "status": r2.status_code},
|
||||||
|
tag="URL_SEED",
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
except Exception as e2:
|
||||||
|
self._log("debug", "HEAD {target} failed: {err}",
|
||||||
|
params={"target": target, "err": str(e2)}, tag="URL_SEED")
|
||||||
|
return None
|
||||||
|
# all other cases
|
||||||
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self._log("debug", "HEAD {url} failed: {err}",
|
self._log("debug", "HEAD {url} failed: {err}",
|
||||||
params={"url": url, "err": str(e)}, tag="URL_SEED")
|
params={"url": url, "err": str(e)}, tag="URL_SEED")
|
||||||
|
|||||||
@@ -542,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
if el.tag in bypass_tags:
|
if el.tag in bypass_tags:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Skip elements inside <pre> or <code> tags where whitespace is significant
|
||||||
|
# This preserves whitespace-only spans (e.g., <span class="w"> </span>) in code blocks
|
||||||
|
is_in_code_block = False
|
||||||
|
ancestor = el.getparent()
|
||||||
|
while ancestor is not None:
|
||||||
|
if ancestor.tag in ("pre", "code"):
|
||||||
|
is_in_code_block = True
|
||||||
|
break
|
||||||
|
ancestor = ancestor.getparent()
|
||||||
|
|
||||||
|
if is_in_code_block:
|
||||||
|
continue
|
||||||
|
|
||||||
text_content = (el.text_content() or "").strip()
|
text_content = (el.text_content() or "").strip()
|
||||||
if (
|
if (
|
||||||
len(text_content.split()) < word_count_threshold
|
len(text_content.split()) < word_count_threshold
|
||||||
|
|||||||
29
tests/test_async_url_seeder.py
Normal file
29
tests/test_async_url_seeder.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
import pytest
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai.async_url_seeder import AsyncUrlSeeder
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_resolve_head_handles_dead_redirects():
|
||||||
|
seeder = AsyncUrlSeeder()
|
||||||
|
# Should return None – redirects to a dead URL
|
||||||
|
assert await seeder._resolve_head("http://youtube.com/sitemap.xml") is None
|
||||||
|
assert await seeder._resolve_head("https://stripe.com/sitemap.xml") is None
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_resolve_head_direct_hit():
|
||||||
|
seeder = AsyncUrlSeeder()
|
||||||
|
# Test with a known live URL, e.g., httpbin
|
||||||
|
result = await seeder._resolve_head("https://httpbin.org/status/200")
|
||||||
|
assert result == "https://httpbin.org/status/200"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_resolve_head_verify_redirect_targets_false():
|
||||||
|
# Test with verification disabled - should return redirect target without checking if alive
|
||||||
|
seeder = AsyncUrlSeeder(verify_redirect_targets=False)
|
||||||
|
# This should return the redirect target even if it's dead (old behavior)
|
||||||
|
result = await seeder._resolve_head("http://youtube.com/sitemap.xml")
|
||||||
|
# The exact redirect target might vary, but it should not be None
|
||||||
|
assert result is not None
|
||||||
|
assert isinstance(result, str)
|
||||||
|
# Should differ from the input URL (indicating the redirect target was returned instead of the original URL)
|
||||||
|
assert result != "http://youtube.com/sitemap.xml"
|
||||||
Reference in New Issue
Block a user