Compare commits

...

6 Commits

Author SHA1 Message Date
Soham Kukreti
a0c5f0f79a fix: ensure BrowserConfig.to_dict serializes proxy_config 2025-11-26 17:44:06 +05:30
ntohidi
b36c6daa5c Fix: permission issues with .cache/url_seeder and other runtime cache dirs. ref #1638 2025-11-25 11:51:59 +01:00
Nasrin
94c8a833bf Merge pull request #1447 from rbushri/fix/wrong_url_raw
Fix: Wrong URL variable used for extraction of raw html
2025-11-25 17:49:44 +08:00
ntohidi
84bfea8bd1 Fix EmbeddingStrategy: Uncomment response handling for the variations and clean up mock data. ref #1621 2025-11-25 10:46:00 +01:00
Rachel Bushrian
7771ed3894 Merge branch 'develop' into fix/wrong_url_raw 2025-11-24 13:54:07 +02:00
rbushria
edd0b576b1 Fix: Use correct URL variable for raw HTML extraction (#1116)
- Prevents full HTML content from being passed as URL to extraction strategies
- Added unit tests to verify raw HTML and regular URL processing

Fix: Wrong URL variable used for extraction of raw html
2025-09-01 23:15:56 +03:00
5 changed files with 97 additions and 13 deletions

View File

@@ -167,6 +167,11 @@ RUN mkdir -p /home/appuser/.cache/ms-playwright \
 RUN crawl4ai-doctor
+# Ensure all cache directories belong to appuser
+# This fixes permission issues with .cache/url_seeder and other runtime cache dirs
+RUN mkdir -p /home/appuser/.cache \
+    && chown -R appuser:appuser /home/appuser/.cache
 # Copy application code
 COPY deploy/docker/* ${APP_HOME}/

View File

@@ -728,18 +728,18 @@ class EmbeddingStrategy(CrawlStrategy):
         provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
         api_token = llm_config_dict.get('api_token') if llm_config_dict else None
-        # response = perform_completion_with_backoff(
-        #     provider=provider,
-        #     prompt_with_variables=prompt,
-        #     api_token=api_token,
-        #     json_response=True
-        # )
-        # variations = json.loads(response.choices[0].message.content)
+        response = perform_completion_with_backoff(
+            provider=provider,
+            prompt_with_variables=prompt,
+            api_token=api_token,
+            json_response=True
+        )
+        variations = json.loads(response.choices[0].message.content)
         # # Mock data with more variations for split
-        variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
+        # variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
         # variations = {'queries': [

View File

@@ -598,7 +598,7 @@ class BrowserConfig:
             "chrome_channel": self.chrome_channel,
             "channel": self.channel,
             "proxy": self.proxy,
-            "proxy_config": self.proxy_config,
+            "proxy_config": self.proxy_config.to_dict() if self.proxy_config else None,
             "viewport_width": self.viewport_width,
             "viewport_height": self.viewport_height,
             "accept_downloads": self.accept_downloads,

View File

@@ -617,17 +617,17 @@ class AsyncWebCrawler:
                     else config.chunking_strategy
                 )
                 sections = chunking.chunk(content)
-                # extracted_content = config.extraction_strategy.run(url, sections)
+                # extracted_content = config.extraction_strategy.run(_url, sections)
                 # Use async version if available for better parallelism
                 if hasattr(config.extraction_strategy, 'arun'):
-                    extracted_content = await config.extraction_strategy.arun(url, sections)
+                    extracted_content = await config.extraction_strategy.arun(_url, sections)
                 else:
                     # Fallback to sync version run in thread pool to avoid blocking
                     extracted_content = await asyncio.to_thread(
                         config.extraction_strategy.run, url, sections
                     )
                 extracted_content = json.dumps(
                     extracted_content, indent=4, default=str, ensure_ascii=False
                 )

View File

@@ -9,6 +9,21 @@ from crawl4ai import (
    RateLimiter,
    CacheMode
)
from crawl4ai.extraction_strategy import ExtractionStrategy
class MockExtractionStrategy(ExtractionStrategy):
    """Mock extraction strategy for testing URL parameter handling.

    Records every ``url`` value passed to :meth:`run` in ``run_calls`` so
    tests can assert exactly which URL the crawler handed to the
    extraction strategy (the real URL vs. the "Raw HTML" placeholder).
    """

    def __init__(self):
        super().__init__()
        # URLs observed by run(); inspected by the regression tests.
        self.run_calls = []

    def extract(self, url: str, html: str, *args, **kwargs):
        # Fixed dummy payload: these tests only care about the url
        # argument, never about the extracted content itself.
        return [{"test": "data"}]

    def run(self, url: str, sections: List[str], *args, **kwargs):
        # Record the URL, then defer to the real base-class behavior
        # (presumably ExtractionStrategy.run drives extract() over the
        # sections -- TODO confirm against the crawl4ai base class).
        self.run_calls.append(url)
        return super().run(url, sections, *args, **kwargs)
@pytest.mark.asyncio
@pytest.mark.parametrize("viewport", [
@@ -142,8 +157,72 @@ async def test_error_handling(error_url):
        assert not result.success
        assert result.error_message is not None
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_regular_url():
    """Regression test for the URL handed to extraction_strategy.run.

    For a normal URL (is_raw_html=False) the crawler must pass the actual
    URL to the extraction strategy, not the fetched page content.
    """
    browser_config = BrowserConfig(browser_type="chromium", headless=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        mock_strategy = MockExtractionStrategy()
        regular_url = "https://example.com"
        # Bypass the cache so the extraction strategy definitely runs.
        run_config = CrawlerRunConfig(
            page_timeout=30000,
            extraction_strategy=mock_strategy,
            cache_mode=CacheMode.BYPASS
        )
        result = await crawler.arun(url=regular_url, config=run_config)

        assert result.success
        # Exactly one extraction pass, and it saw the real URL.
        assert len(mock_strategy.run_calls) == 1
        assert mock_strategy.run_calls[0] == regular_url, f"Expected '{regular_url}', got '{mock_strategy.run_calls[0]}'"
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_raw_html():
    """Regression test for the URL handed to extraction_strategy.run.

    For a "raw:"-prefixed URL (is_raw_html=True) the crawler must pass the
    "Raw HTML" placeholder to the extraction strategy, never the raw HTML
    string itself.
    """
    browser_config = BrowserConfig(browser_type="chromium", headless=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        mock_strategy = MockExtractionStrategy()
        # The "raw:" prefix makes the crawler treat the rest as raw HTML.
        raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>"
        run_config = CrawlerRunConfig(
            page_timeout=30000,
            extraction_strategy=mock_strategy,
            cache_mode=CacheMode.BYPASS
        )
        result = await crawler.arun(url=raw_html_url, config=run_config)

        assert result.success
        # Exactly one extraction pass, and it saw the placeholder.
        assert len(mock_strategy.run_calls) == 1
        assert mock_strategy.run_calls[0] == "Raw HTML", f"Expected 'Raw HTML', got '{mock_strategy.run_calls[0]}'"
if __name__ == "__main__":
    asyncio.run(test_viewport_config((1024, 768)))
    asyncio.run(test_memory_management())
    asyncio.run(test_rate_limiting())
    asyncio.run(test_javascript_execution())
asyncio.run(test_extraction_strategy_run_with_regular_url())
asyncio.run(test_extraction_strategy_run_with_raw_html())