Compare commits
1 Commits
fix-async-
...
fix-cors-d
| Author | SHA1 | Date |
|---|---|---|
|
|
af77800a6b |
@@ -166,22 +166,6 @@ class AsyncUrlSeeder:
|
||||
Async version of UrlSeeder.
|
||||
Call pattern is await/async for / async with.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ttl : timedelta, default TTL
|
||||
Time-to-live for cached results.
|
||||
client : httpx.AsyncClient, optional
|
||||
HTTP client to use. If None, creates a new one.
|
||||
logger : AsyncLoggerBase, optional
|
||||
Logger instance for logging messages.
|
||||
base_directory : str or pathlib.Path, optional
|
||||
Base directory for cache storage. Defaults to home directory.
|
||||
cache_root : str or pathlib.Path, optional
|
||||
Root directory for URL seeder cache. Defaults to ~/.cache/url_seeder.
|
||||
verify_redirect_targets : bool, default True
|
||||
Whether to verify that redirect targets are alive (2xx status) before returning them.
|
||||
When False, returns redirect targets without verification (legacy behavior).
|
||||
|
||||
Public coroutines
|
||||
-----------------
|
||||
await seed.urls(...)
|
||||
@@ -219,8 +203,6 @@ class AsyncUrlSeeder:
|
||||
# NEW: Add base_directory
|
||||
base_directory: Optional[Union[str, pathlib.Path]] = None,
|
||||
cache_root: Optional[Union[str, Path]] = None,
|
||||
# NEW: Control redirect target verification
|
||||
verify_redirect_targets: bool = True,
|
||||
):
|
||||
self.ttl = ttl
|
||||
self._owns_client = client is None # Track if we created the client
|
||||
@@ -245,9 +227,6 @@ class AsyncUrlSeeder:
|
||||
cache_root or "~/.cache/url_seeder"))
|
||||
(self.cache_root / "live").mkdir(parents=True, exist_ok=True)
|
||||
(self.cache_root / "head").mkdir(exist_ok=True)
|
||||
|
||||
# Store redirect verification setting
|
||||
self.verify_redirect_targets = verify_redirect_targets
|
||||
|
||||
def _log(self, level: str, message: str, tag: str = "URL_SEED", **kwargs: Any):
|
||||
"""Helper to log messages using the provided logger, if available."""
|
||||
@@ -703,47 +682,24 @@ class AsyncUrlSeeder:
|
||||
|
||||
Returns:
|
||||
* the same URL if it answers 2xx,
|
||||
* the absolute redirect target if it answers 3xx (and if verify_redirect_targets=True, only if target is alive/2xx),
|
||||
* the absolute redirect target if it answers 3xx,
|
||||
* None on any other status or network error.
|
||||
"""
|
||||
try:
|
||||
r = await self.client.head(url, timeout=10, follow_redirects=False)
|
||||
# direct 2xx hit
|
||||
|
||||
# direct hit
|
||||
if 200 <= r.status_code < 300:
|
||||
return str(r.url)
|
||||
# single-level redirect (3xx)
|
||||
|
||||
# single level redirect
|
||||
if r.status_code in (301, 302, 303, 307, 308):
|
||||
loc = r.headers.get("location")
|
||||
if loc:
|
||||
target = urljoin(url, loc)
|
||||
# Avoid infinite loop on self-redirect
|
||||
if target == url:
|
||||
return None
|
||||
|
||||
# If not verifying redirect targets, return immediately (old behavior)
|
||||
if not self.verify_redirect_targets:
|
||||
return target
|
||||
|
||||
# Verify redirect target is alive (new behavior)
|
||||
try:
|
||||
r2 = await self.client.head(target, timeout=10, follow_redirects=False)
|
||||
if 200 <= r2.status_code < 300:
|
||||
return str(r2.url)
|
||||
# Optionally, could handle another 3xx here for 2-step chains, but spec only says 1
|
||||
else:
|
||||
self._log(
|
||||
"debug",
|
||||
"HEAD redirect target {target} did not resolve: status {status}",
|
||||
params={"target": target, "status": r2.status_code},
|
||||
tag="URL_SEED",
|
||||
)
|
||||
return None
|
||||
except Exception as e2:
|
||||
self._log("debug", "HEAD {target} failed: {err}",
|
||||
params={"target": target, "err": str(e2)}, tag="URL_SEED")
|
||||
return None
|
||||
# all other cases
|
||||
return urljoin(url, loc)
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
self._log("debug", "HEAD {url} failed: {err}",
|
||||
params={"url": url, "err": str(e)}, tag="URL_SEED")
|
||||
|
||||
@@ -674,6 +674,11 @@ class BrowserManager:
|
||||
self.default_context = await self.create_browser_context()
|
||||
await self.setup_context(self.default_context)
|
||||
else:
|
||||
# Handle --disable-web-security requiring a separate user data directory
|
||||
if "--disable-web-security" in (self.config.extra_args or []) and not self.config.user_data_dir:
|
||||
import tempfile
|
||||
self.config.user_data_dir = tempfile.mkdtemp()
|
||||
|
||||
browser_args = self._build_browser_args()
|
||||
|
||||
# Launch appropriate browser type
|
||||
@@ -682,9 +687,15 @@ class BrowserManager:
|
||||
elif self.config.browser_type == "webkit":
|
||||
self.browser = await self.playwright.webkit.launch(**browser_args)
|
||||
else:
|
||||
self.browser = await self.playwright.chromium.launch(**browser_args)
|
||||
|
||||
self.default_context = self.browser
|
||||
if "--disable-web-security" in (self.config.extra_args or []):
|
||||
# Use persistent context for --disable-web-security
|
||||
browser_args["args"] = [arg for arg in browser_args["args"] if not arg.startswith("--user-data-dir")]
|
||||
self.default_context = await self.playwright.chromium.launch_persistent_context(self.config.user_data_dir, **browser_args)
|
||||
self.browser = self.default_context
|
||||
self.config.use_managed_browser = True # Treat as managed for get_page logic
|
||||
else:
|
||||
self.browser = await self.playwright.chromium.launch(**browser_args)
|
||||
self.default_context = self.browser
|
||||
|
||||
async def _verify_cdp_ready(self, cdp_url: str) -> bool:
|
||||
"""Verify CDP endpoint is ready with exponential backoff"""
|
||||
@@ -748,6 +759,9 @@ class BrowserManager:
|
||||
if self.config.extra_args:
|
||||
args.extend(self.config.extra_args)
|
||||
|
||||
if self.config.user_data_dir:
|
||||
args.append(f"--user-data-dir={self.config.user_data_dir}")
|
||||
|
||||
# Deduplicate args
|
||||
args = list(dict.fromkeys(args))
|
||||
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
import pytest
|
||||
import asyncio
|
||||
from crawl4ai.async_url_seeder import AsyncUrlSeeder
|
||||
|
||||
@pytest.mark.asyncio
async def test_resolve_head_handles_dead_redirects():
    """Redirects pointing at dead targets should resolve to None.

    NOTE(review): relies on live external hosts redirecting to non-2xx
    targets — flaky by nature; confirm these URLs still behave this way.
    """
    seeder = AsyncUrlSeeder()
    dead_redirect_urls = (
        "http://youtube.com/sitemap.xml",
        "https://stripe.com/sitemap.xml",
    )
    for url in dead_redirect_urls:
        resolved = await seeder._resolve_head(url)
        assert resolved is None
|
||||
|
||||
@pytest.mark.asyncio
async def test_resolve_head_direct_hit():
    """A URL that answers 2xx directly should resolve to itself."""
    live_url = "https://httpbin.org/status/200"
    seeder = AsyncUrlSeeder()
    # No redirect involved, so _resolve_head must echo the same URL back.
    resolved = await seeder._resolve_head(live_url)
    assert resolved == live_url
|
||||
|
||||
@pytest.mark.asyncio
async def test_resolve_head_verify_redirect_targets_false():
    """With verification disabled, redirect targets come back unchecked (legacy)."""
    origin = "http://youtube.com/sitemap.xml"
    seeder = AsyncUrlSeeder(verify_redirect_targets=False)
    # Legacy behaviour: the redirect target is returned even if it is dead.
    resolved = await seeder._resolve_head(origin)
    # The exact target may vary over time, but it must be a string and a
    # redirect must have been followed (result differs from the input URL).
    assert resolved is not None
    assert isinstance(resolved, str)
    assert resolved != origin
|
||||
73
tests/test_browser_manager_cors.py
Normal file
73
tests/test_browser_manager_cors.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import os
|
||||
import sys
|
||||
import pytest
|
||||
|
||||
# Make the repository root importable so `crawl4ai` resolves when running
# this file straight from a source checkout (this file lives in tests/).
# NOTE(review): the original climbed three directory levels, which points at
# the *parent* of the repository for a file located directly under tests/;
# two levels reach the repo root that contains the crawl4ai package.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_normal_browser_launch():
    """Sanity check: a default crawler (no --disable-web-security) still works."""
    async with AsyncWebCrawler() as crawler:
        res = await crawler.arun(url="https://example.com", bypass_cache=True)
        # A successful crawl must yield both raw HTML and rendered markdown.
        assert res.success
        assert res.html
        assert res.markdown
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_cors_bypass_with_disable_web_security():
    """Test that --disable-web-security allows XMLHttpRequest to bypass CORS"""
    cors_browser = BrowserConfig(
        extra_args=['--disable-web-security'],
        headless=True  # Run headless for test
    )

    # Synchronous cross-origin XHR: normally blocked by CORS, it should only
    # succeed when the browser was launched with web security disabled.
    js_code = """
    var xhr = new XMLHttpRequest();
    xhr.open('GET', 'https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv', false);
    xhr.send();
    if (xhr.status == 200) {
        return {success: true, length: xhr.responseText.length};
    } else {
        return {success: false, status: xhr.status, error: xhr.statusText};
    }
    """

    run_config = CrawlerRunConfig(js_code=js_code)

    async with AsyncWebCrawler(config=cors_browser) as crawler:
        result = await crawler.arun(url="https://example.com", config=run_config, bypass_cache=True)
        assert result.success, f"Crawl failed: {result.error_message}"

        js_result = result.js_execution_result
        assert js_result is not None, "JS execution result is None"
        assert js_result.get('success') == True, f"XMLHttpRequest failed: {js_result}"

        # Individual script outcomes live under the 'results' key.
        results = js_result.get('results', [])
        assert len(results) > 0, "No results in JS execution"

        xhr_result = results[0]
        assert xhr_result.get('success') == True, f"XMLHttpRequest failed: {xhr_result}"
        assert xhr_result.get('length', 0) > 0, f"No data received from XMLHttpRequest: {xhr_result}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_browser_manager_without_cors_flag():
    """Ensure that without --disable-web-security, normal functionality still works"""
    plain_browser = BrowserConfig(headless=True)

    async with AsyncWebCrawler(config=plain_browser) as crawler:
        fetched = await crawler.arun(url="https://example.com", bypass_cache=True)
        # Baseline crawl must still succeed and return page content.
        assert fetched.success
        assert fetched.html
|
||||
|
||||
|
||||
# Entry point for debugging
# Allows running this module directly (python test_browser_manager_cors.py)
# with verbose output, instead of invoking pytest from the command line.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|
||||
Reference in New Issue
Block a user