Compare commits
7 Commits
main
...
fix/serial
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a0c5f0f79a | ||
|
|
b36c6daa5c | ||
|
|
94c8a833bf | ||
|
|
84bfea8bd1 | ||
|
|
7771ed3894 | ||
|
|
c2c4d42be4 | ||
|
|
edd0b576b1 |
@@ -167,6 +167,11 @@ RUN mkdir -p /home/appuser/.cache/ms-playwright \
|
||||
|
||||
RUN crawl4ai-doctor
|
||||
|
||||
# Ensure all cache directories belong to appuser
|
||||
# This fixes permission issues with .cache/url_seeder and other runtime cache dirs
|
||||
RUN mkdir -p /home/appuser/.cache \
|
||||
&& chown -R appuser:appuser /home/appuser/.cache
|
||||
|
||||
# Copy application code
|
||||
COPY deploy/docker/* ${APP_HOME}/
|
||||
|
||||
|
||||
@@ -728,18 +728,18 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
|
||||
api_token = llm_config_dict.get('api_token') if llm_config_dict else None
|
||||
|
||||
# response = perform_completion_with_backoff(
|
||||
# provider=provider,
|
||||
# prompt_with_variables=prompt,
|
||||
# api_token=api_token,
|
||||
# json_response=True
|
||||
# )
|
||||
response = perform_completion_with_backoff(
|
||||
provider=provider,
|
||||
prompt_with_variables=prompt,
|
||||
api_token=api_token,
|
||||
json_response=True
|
||||
)
|
||||
|
||||
# variations = json.loads(response.choices[0].message.content)
|
||||
variations = json.loads(response.choices[0].message.content)
|
||||
|
||||
|
||||
# # Mock data with more variations for split
|
||||
variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
|
||||
# variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
|
||||
|
||||
|
||||
# variations = {'queries': [
|
||||
|
||||
@@ -598,7 +598,7 @@ class BrowserConfig:
|
||||
"chrome_channel": self.chrome_channel,
|
||||
"channel": self.channel,
|
||||
"proxy": self.proxy,
|
||||
"proxy_config": self.proxy_config,
|
||||
"proxy_config": self.proxy_config.to_dict() if self.proxy_config else None,
|
||||
"viewport_width": self.viewport_width,
|
||||
"viewport_height": self.viewport_height,
|
||||
"accept_downloads": self.accept_downloads,
|
||||
|
||||
@@ -617,11 +617,11 @@ class AsyncWebCrawler:
|
||||
else config.chunking_strategy
|
||||
)
|
||||
sections = chunking.chunk(content)
|
||||
# extracted_content = config.extraction_strategy.run(url, sections)
|
||||
# extracted_content = config.extraction_strategy.run(_url, sections)
|
||||
|
||||
# Use async version if available for better parallelism
|
||||
if hasattr(config.extraction_strategy, 'arun'):
|
||||
extracted_content = await config.extraction_strategy.arun(url, sections)
|
||||
extracted_content = await config.extraction_strategy.arun(_url, sections)
|
||||
else:
|
||||
# Fallback to sync version run in thread pool to avoid blocking
|
||||
extracted_content = await asyncio.to_thread(
|
||||
|
||||
@@ -542,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
||||
if el.tag in bypass_tags:
|
||||
continue
|
||||
|
||||
# Skip elements inside <pre> or <code> tags where whitespace is significant
|
||||
# This preserves whitespace-only spans (e.g., <span class="w"> </span>) in code blocks
|
||||
is_in_code_block = False
|
||||
ancestor = el.getparent()
|
||||
while ancestor is not None:
|
||||
if ancestor.tag in ("pre", "code"):
|
||||
is_in_code_block = True
|
||||
break
|
||||
ancestor = ancestor.getparent()
|
||||
|
||||
if is_in_code_block:
|
||||
continue
|
||||
|
||||
text_content = (el.text_content() or "").strip()
|
||||
if (
|
||||
len(text_content.split()) < word_count_threshold
|
||||
|
||||
@@ -9,6 +9,21 @@ from crawl4ai import (
|
||||
RateLimiter,
|
||||
CacheMode
|
||||
)
|
||||
from crawl4ai.extraction_strategy import ExtractionStrategy
|
||||
|
||||
class MockExtractionStrategy(ExtractionStrategy):
    """Extraction-strategy stub that remembers every URL passed to ``run``.

    Tests read ``run_calls`` afterwards to check which URL value was
    forwarded to the strategy.
    """

    def __init__(self):
        super().__init__()
        # URLs received by run(), in call order.
        self.run_calls = []

    def extract(self, url: str, html: str, *args, **kwargs):
        """Return a fixed single-item payload; the arguments are ignored."""
        return [{"test": "data"}]

    def run(self, url: str, sections: List[str], *args, **kwargs):
        """Record ``url``, then delegate to the base-class ``run``."""
        self.run_calls.append(url)
        return super().run(url, sections, *args, **kwargs)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("viewport", [
|
||||
@@ -142,8 +157,72 @@ async def test_error_handling(error_url):
|
||||
assert not result.success
|
||||
assert result.error_message is not None
|
||||
|
||||
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_regular_url():
    """
    Regression test: a regular URL is passed through to extraction_strategy.run.

    Crawls an ordinary URL (the is_raw_html=False case) with a recording mock
    strategy and expects exactly one ``run`` call carrying that same URL.
    """
    regular_url = "https://example.com"
    browser_config = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        mock_strategy = MockExtractionStrategy()
        run_config = CrawlerRunConfig(
            page_timeout=30000,
            extraction_strategy=mock_strategy,
            cache_mode=CacheMode.BYPASS,
        )

        # Regular URL path (is_raw_html=False).
        result = await crawler.arun(url=regular_url, config=run_config)

        assert result.success
        assert len(mock_strategy.run_calls) == 1
        assert mock_strategy.run_calls[0] == regular_url, f"Expected '{regular_url}', got '{mock_strategy.run_calls[0]}'"
|
||||
|
||||
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_raw_html():
    """
    Regression test: raw HTML input reaches extraction_strategy.run as "Raw HTML".

    Crawls a ``raw:``-prefixed document (the is_raw_html=True case) with a
    recording mock strategy and expects exactly one ``run`` call whose URL
    argument is the literal string "Raw HTML" rather than the raw input.
    """
    raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>"
    browser_config = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        mock_strategy = MockExtractionStrategy()
        run_config = CrawlerRunConfig(
            page_timeout=30000,
            extraction_strategy=mock_strategy,
            cache_mode=CacheMode.BYPASS,
        )

        # Raw HTML path: the "raw:" prefix marks the input as inline HTML.
        result = await crawler.arun(url=raw_html_url, config=run_config)

        assert result.success
        assert len(mock_strategy.run_calls) == 1
        assert mock_strategy.run_calls[0] == "Raw HTML", f"Expected 'Raw HTML', got '{mock_strategy.run_calls[0]}'"
|
||||
|
||||
if __name__ == "__main__":
    # Manual runner: run each async test in its own event loop, in order,
    # without going through pytest.
    manual_runs = (
        lambda: test_viewport_config((1024, 768)),
        lambda: test_memory_management(),
        lambda: test_rate_limiting(),
        lambda: test_javascript_execution(),
        lambda: test_extraction_strategy_run_with_regular_url(),
        lambda: test_extraction_strategy_run_with_raw_html(),
    )
    for make_coroutine in manual_runs:
        asyncio.run(make_coroutine())
|
||||
|
||||
Reference in New Issue
Block a user