Compare commits
6 Commits
fix-async-
...
fix/linkPr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6695a21a41 | ||
|
|
b36c6daa5c | ||
|
|
94c8a833bf | ||
|
|
84bfea8bd1 | ||
|
|
7771ed3894 | ||
|
|
edd0b576b1 |
@@ -167,6 +167,11 @@ RUN mkdir -p /home/appuser/.cache/ms-playwright \
|
||||
|
||||
RUN crawl4ai-doctor
|
||||
|
||||
# Ensure all cache directories belong to appuser
|
||||
# This fixes permission issues with .cache/url_seeder and other runtime cache dirs
|
||||
RUN mkdir -p /home/appuser/.cache \
|
||||
&& chown -R appuser:appuser /home/appuser/.cache
|
||||
|
||||
# Copy application code
|
||||
COPY deploy/docker/* ${APP_HOME}/
|
||||
|
||||
|
||||
@@ -728,18 +728,18 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
|
||||
api_token = llm_config_dict.get('api_token') if llm_config_dict else None
|
||||
|
||||
# response = perform_completion_with_backoff(
|
||||
# provider=provider,
|
||||
# prompt_with_variables=prompt,
|
||||
# api_token=api_token,
|
||||
# json_response=True
|
||||
# )
|
||||
response = perform_completion_with_backoff(
|
||||
provider=provider,
|
||||
prompt_with_variables=prompt,
|
||||
api_token=api_token,
|
||||
json_response=True
|
||||
)
|
||||
|
||||
# variations = json.loads(response.choices[0].message.content)
|
||||
variations = json.loads(response.choices[0].message.content)
|
||||
|
||||
|
||||
# # Mock data with more variations for split
|
||||
variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
|
||||
# variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
|
||||
|
||||
|
||||
# variations = {'queries': [
|
||||
|
||||
@@ -166,22 +166,6 @@ class AsyncUrlSeeder:
|
||||
Async version of UrlSeeder.
|
||||
Call pattern is await/async for / async with.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ttl : timedelta, default TTL
|
||||
Time-to-live for cached results.
|
||||
client : httpx.AsyncClient, optional
|
||||
HTTP client to use. If None, creates a new one.
|
||||
logger : AsyncLoggerBase, optional
|
||||
Logger instance for logging messages.
|
||||
base_directory : str or pathlib.Path, optional
|
||||
Base directory for cache storage. Defaults to home directory.
|
||||
cache_root : str or pathlib.Path, optional
|
||||
Root directory for URL seeder cache. Defaults to ~/.cache/url_seeder.
|
||||
verify_redirect_targets : bool, default True
|
||||
Whether to verify that redirect targets are alive (2xx status) before returning them.
|
||||
When False, returns redirect targets without verification (legacy behavior).
|
||||
|
||||
Public coroutines
|
||||
-----------------
|
||||
await seed.urls(...)
|
||||
@@ -219,8 +203,6 @@ class AsyncUrlSeeder:
|
||||
# NEW: Add base_directory
|
||||
base_directory: Optional[Union[str, pathlib.Path]] = None,
|
||||
cache_root: Optional[Union[str, Path]] = None,
|
||||
# NEW: Control redirect target verification
|
||||
verify_redirect_targets: bool = True,
|
||||
):
|
||||
self.ttl = ttl
|
||||
self._owns_client = client is None # Track if we created the client
|
||||
@@ -245,9 +227,6 @@ class AsyncUrlSeeder:
|
||||
cache_root or "~/.cache/url_seeder"))
|
||||
(self.cache_root / "live").mkdir(parents=True, exist_ok=True)
|
||||
(self.cache_root / "head").mkdir(exist_ok=True)
|
||||
|
||||
# Store redirect verification setting
|
||||
self.verify_redirect_targets = verify_redirect_targets
|
||||
|
||||
def _log(self, level: str, message: str, tag: str = "URL_SEED", **kwargs: Any):
|
||||
"""Helper to log messages using the provided logger, if available."""
|
||||
@@ -703,47 +682,24 @@ class AsyncUrlSeeder:
|
||||
|
||||
Returns:
|
||||
* the same URL if it answers 2xx,
|
||||
* the absolute redirect target if it answers 3xx (and if verify_redirect_targets=True, only if target is alive/2xx),
|
||||
* the absolute redirect target if it answers 3xx,
|
||||
* None on any other status or network error.
|
||||
"""
|
||||
try:
|
||||
r = await self.client.head(url, timeout=10, follow_redirects=False)
|
||||
# direct 2xx hit
|
||||
|
||||
# direct hit
|
||||
if 200 <= r.status_code < 300:
|
||||
return str(r.url)
|
||||
# single-level redirect (3xx)
|
||||
|
||||
# single level redirect
|
||||
if r.status_code in (301, 302, 303, 307, 308):
|
||||
loc = r.headers.get("location")
|
||||
if loc:
|
||||
target = urljoin(url, loc)
|
||||
# Avoid infinite loop on self-redirect
|
||||
if target == url:
|
||||
return None
|
||||
|
||||
# If not verifying redirect targets, return immediately (old behavior)
|
||||
if not self.verify_redirect_targets:
|
||||
return target
|
||||
|
||||
# Verify redirect target is alive (new behavior)
|
||||
try:
|
||||
r2 = await self.client.head(target, timeout=10, follow_redirects=False)
|
||||
if 200 <= r2.status_code < 300:
|
||||
return str(r2.url)
|
||||
# Optionally, could handle another 3xx here for 2-step chains, but spec only says 1
|
||||
else:
|
||||
self._log(
|
||||
"debug",
|
||||
"HEAD redirect target {target} did not resolve: status {status}",
|
||||
params={"target": target, "status": r2.status_code},
|
||||
tag="URL_SEED",
|
||||
)
|
||||
return None
|
||||
except Exception as e2:
|
||||
self._log("debug", "HEAD {target} failed: {err}",
|
||||
params={"target": target, "err": str(e2)}, tag="URL_SEED")
|
||||
return None
|
||||
# all other cases
|
||||
return urljoin(url, loc)
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
self._log("debug", "HEAD {url} failed: {err}",
|
||||
params={"url": url, "err": str(e)}, tag="URL_SEED")
|
||||
|
||||
@@ -617,17 +617,17 @@ class AsyncWebCrawler:
|
||||
else config.chunking_strategy
|
||||
)
|
||||
sections = chunking.chunk(content)
|
||||
# extracted_content = config.extraction_strategy.run(url, sections)
|
||||
# extracted_content = config.extraction_strategy.run(_url, sections)
|
||||
|
||||
# Use async version if available for better parallelism
|
||||
if hasattr(config.extraction_strategy, 'arun'):
|
||||
extracted_content = await config.extraction_strategy.arun(url, sections)
|
||||
extracted_content = await config.extraction_strategy.arun(_url, sections)
|
||||
else:
|
||||
# Fallback to sync version run in thread pool to avoid blocking
|
||||
extracted_content = await asyncio.to_thread(
|
||||
config.extraction_strategy.run, url, sections
|
||||
)
|
||||
|
||||
|
||||
extracted_content = json.dumps(
|
||||
extracted_content, indent=4, default=str, ensure_ascii=False
|
||||
)
|
||||
|
||||
@@ -336,8 +336,40 @@ class LinkPreview:
|
||||
|
||||
updated_internal.append(updated_link)
|
||||
else:
|
||||
# Keep original link unchanged
|
||||
updated_internal.append(link)
|
||||
# # Keep original link unchanged
|
||||
# updated_internal.append(link)
|
||||
|
||||
# Head extraction failed - calculate fallback scores
|
||||
# Use URL-based scoring if query provided
|
||||
contextual_score = None
|
||||
if config.link_preview_config and config.link_preview_config.query:
|
||||
# Calculate URL-based relevance score as fallback
|
||||
contextual_score = self.seeder._calculate_url_relevance_score(
|
||||
config.link_preview_config.query,
|
||||
link.href
|
||||
)
|
||||
|
||||
# Create updated link with fallback scoring
|
||||
updated_link = Link(
|
||||
href=link.href,
|
||||
text=link.text,
|
||||
title=link.title,
|
||||
base_domain=link.base_domain,
|
||||
head_data=None, # No head data available
|
||||
head_extraction_status="failed",
|
||||
intrinsic_score=getattr(link, 'intrinsic_score', None),
|
||||
contextual_score=contextual_score
|
||||
)
|
||||
|
||||
# Calculate total score even without head data
|
||||
updated_link.total_score = calculate_total_score(
|
||||
intrinsic_score=updated_link.intrinsic_score,
|
||||
contextual_score=updated_link.contextual_score,
|
||||
score_links_enabled=getattr(config, 'score_links', False),
|
||||
query_provided=bool(config.link_preview_config and config.link_preview_config.query)
|
||||
)
|
||||
|
||||
updated_internal.append(updated_link)
|
||||
|
||||
# Update external links
|
||||
updated_external = []
|
||||
@@ -374,8 +406,40 @@ class LinkPreview:
|
||||
|
||||
updated_external.append(updated_link)
|
||||
else:
|
||||
# Keep original link unchanged
|
||||
updated_external.append(link)
|
||||
# # Keep original link unchanged
|
||||
# updated_external.append(link)
|
||||
|
||||
# Head extraction failed - calculate fallback scores
|
||||
# Use URL-based scoring if query provided
|
||||
contextual_score = None
|
||||
if config.link_preview_config and config.link_preview_config.query:
|
||||
# Calculate URL-based relevance score as fallback
|
||||
contextual_score = self.seeder._calculate_url_relevance_score(
|
||||
config.link_preview_config.query,
|
||||
link.href
|
||||
)
|
||||
|
||||
# Create updated link with fallback scoring
|
||||
updated_link = Link(
|
||||
href=link.href,
|
||||
text=link.text,
|
||||
title=link.title,
|
||||
base_domain=link.base_domain,
|
||||
head_data=None, # No head data available
|
||||
head_extraction_status="failed",
|
||||
intrinsic_score=getattr(link, 'intrinsic_score', None),
|
||||
contextual_score=contextual_score
|
||||
)
|
||||
|
||||
# Calculate total score even without head data
|
||||
updated_link.total_score = calculate_total_score(
|
||||
intrinsic_score=updated_link.intrinsic_score,
|
||||
contextual_score=updated_link.contextual_score,
|
||||
score_links_enabled=getattr(config, 'score_links', False),
|
||||
query_provided=bool(config.link_preview_config and config.link_preview_config.query)
|
||||
)
|
||||
|
||||
updated_external.append(updated_link)
|
||||
|
||||
# Sort links by relevance score if available
|
||||
if any(hasattr(link, 'head_data') and link.head_data and 'relevance_score' in link.head_data
|
||||
|
||||
@@ -9,6 +9,21 @@ from crawl4ai import (
|
||||
RateLimiter,
|
||||
CacheMode
|
||||
)
|
||||
from crawl4ai.extraction_strategy import ExtractionStrategy
|
||||
|
||||
class MockExtractionStrategy(ExtractionStrategy):
    """Mock extraction strategy for testing URL parameter handling.

    Every ``run`` invocation appends the URL it was given to ``run_calls``
    before delegating to the parent implementation, so a test can inspect
    exactly which URL values were supplied.
    """

    def __init__(self):
        super().__init__()
        # URLs passed to run(), in call order.
        self.run_calls = []

    def extract(self, url: str, html: str, *args, **kwargs):
        """Return a constant single-item result; all arguments are ignored."""
        canned_result = [{"test": "data"}]
        return canned_result

    def run(self, url: str, sections: List[str], *args, **kwargs):
        """Record ``url``, then forward the call to the parent ``run``."""
        self.run_calls.append(url)
        parent_result = super().run(url, sections, *args, **kwargs)
        return parent_result
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("viewport", [
|
||||
@@ -142,8 +157,72 @@ async def test_error_handling(error_url):
|
||||
assert not result.success
|
||||
assert result.error_message is not None
|
||||
|
||||
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_regular_url():
    """
    Regression test: a regular URL is handed to extraction_strategy.run as-is.

    With a regular URL (is_raw_html=False), the recording mock strategy must
    see exactly one ``run`` call, and that call must carry the actual URL.
    """
    regular_url = "https://example.com"
    mock_strategy = MockExtractionStrategy()

    # Bypass the cache so the crawl (and therefore extraction) really runs.
    run_config = CrawlerRunConfig(
        page_timeout=30000,
        extraction_strategy=mock_strategy,
        cache_mode=CacheMode.BYPASS,
    )
    browser_config = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Regular URL path (is_raw_html=False)
        result = await crawler.arun(url=regular_url, config=run_config)

        assert result.success
        assert len(mock_strategy.run_calls) == 1
        assert mock_strategy.run_calls[0] == regular_url, f"Expected '{regular_url}', got '{mock_strategy.run_calls[0]}'"
|
||||
|
||||
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_raw_html():
    """
    Regression test: raw HTML input reaches extraction_strategy.run as "Raw HTML".

    When the URL starts with "raw:" (is_raw_html=True), the recording mock
    strategy must see exactly one ``run`` call, and its URL argument must be
    the literal "Raw HTML" rather than the raw markup itself.
    """
    raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>"
    mock_strategy = MockExtractionStrategy()

    # Bypass the cache so the crawl (and therefore extraction) really runs.
    run_config = CrawlerRunConfig(
        page_timeout=30000,
        extraction_strategy=mock_strategy,
        cache_mode=CacheMode.BYPASS,
    )
    browser_config = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Raw HTML path (is_raw_html=True is set automatically for "raw:" URLs)
        result = await crawler.arun(url=raw_html_url, config=run_config)

        assert result.success
        assert len(mock_strategy.run_calls) == 1
        assert mock_strategy.run_calls[0] == "Raw HTML", f"Expected 'Raw HTML', got '{mock_strategy.run_calls[0]}'"
|
||||
|
||||
if __name__ == "__main__":
    # Manual runner for executing the tests without pytest. Each asyncio.run
    # call drives one test coroutine to completion before the next starts.
    # test_javascript_execution was previously invoked twice in a row; the
    # redundant second call has been removed.
    asyncio.run(test_viewport_config((1024, 768)))
    asyncio.run(test_memory_management())
    asyncio.run(test_rate_limiting())
    asyncio.run(test_javascript_execution())
    asyncio.run(test_extraction_strategy_run_with_regular_url())
    asyncio.run(test_extraction_strategy_run_with_raw_html())
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
import pytest
|
||||
import asyncio
|
||||
from crawl4ai.async_url_seeder import AsyncUrlSeeder
|
||||
|
||||
@pytest.mark.asyncio
async def test_resolve_head_handles_dead_redirects():
    """_resolve_head returns None for URLs that redirect to a dead target.

    NOTE(review): this talks to real external hosts, so the outcome presumably
    depends on their live redirect behaviour and on network access -- confirm.
    """
    seeder = AsyncUrlSeeder()
    dead_redirect_urls = (
        "http://youtube.com/sitemap.xml",
        "https://stripe.com/sitemap.xml",
    )
    # Checked one after another, in the listed order; each must resolve to None.
    for candidate in dead_redirect_urls:
        resolved = await seeder._resolve_head(candidate)
        assert resolved is None
|
||||
|
||||
@pytest.mark.asyncio
async def test_resolve_head_direct_hit():
    """_resolve_head returns the same URL when it answers directly.

    NOTE(review): relies on httpbin.org being reachable -- confirm this is
    acceptable for the suite.
    """
    live_url = "https://httpbin.org/status/200"
    seeder = AsyncUrlSeeder()
    # A known live URL should come back unchanged.
    resolved = await seeder._resolve_head(live_url)
    assert resolved == live_url
|
||||
|
||||
@pytest.mark.asyncio
async def test_resolve_head_verify_redirect_targets_false():
    """With verification disabled, the redirect target is returned unchecked.

    The seeder is built with ``verify_redirect_targets=False``; the test then
    expects a string result that differs from the input URL.

    NOTE(review): relies on the live redirect behaviour of an external host --
    confirm this is acceptable for the suite.
    """
    source_url = "http://youtube.com/sitemap.xml"
    seeder = AsyncUrlSeeder(verify_redirect_targets=False)

    # Old behavior: the redirect target is returned even if it is dead.
    result = await seeder._resolve_head(source_url)

    # The exact target may vary, but it must be a non-None string...
    assert result is not None
    assert isinstance(result, str)
    # ...and differ from the input, indicating the redirect was followed.
    assert result != source_url
|
||||
Reference in New Issue
Block a user