Compare commits
9 Commits
docker-reb
...
fix/serial
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a0c5f0f79a | ||
|
|
b36c6daa5c | ||
|
|
94c8a833bf | ||
|
|
84bfea8bd1 | ||
|
|
7771ed3894 | ||
|
|
c2c4d42be4 | ||
|
|
f68e7531e3 | ||
|
|
cb637fb5c4 | ||
|
|
edd0b576b1 |
@@ -167,6 +167,11 @@ RUN mkdir -p /home/appuser/.cache/ms-playwright \
|
||||
|
||||
RUN crawl4ai-doctor
|
||||
|
||||
# Ensure all cache directories belong to appuser
|
||||
# This fixes permission issues with .cache/url_seeder and other runtime cache dirs
|
||||
RUN mkdir -p /home/appuser/.cache \
|
||||
&& chown -R appuser:appuser /home/appuser/.cache
|
||||
|
||||
# Copy application code
|
||||
COPY deploy/docker/* ${APP_HOME}/
|
||||
|
||||
|
||||
@@ -1034,11 +1034,14 @@ Our enterprise sponsors and technology partners help scale Crawl4AI to power pro
|
||||
|
||||
| Company | About | Sponsorship Tier |
|
||||
|------|------|----------------------------|
|
||||
| <a href="https://dashboard.capsolver.com/passport/register?inviteCode=ESVSECTX5Q23" target="_blank"><picture><source width="120" media="(prefers-color-scheme: dark)" srcset="https://docs.crawl4ai.com/uploads/sponsors/20251013045338_72a71fa4ee4d2f40.png"><source width="120" media="(prefers-color-scheme: light)" srcset="https://www.capsolver.com/assets/images/logo-text.png"><img alt="Capsolver" src="https://www.capsolver.com/assets/images/logo-text.png"></picture></a> | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥈 Silver |
|
||||
| <a href="https://app.scrapeless.com/passport/register?utm_source=official&utm_term=crawl4ai" target="_blank"><picture><source width="250" media="(prefers-color-scheme: dark)" srcset="https://gist.githubusercontent.com/aravindkarnam/0d275b942705604263e5c32d2db27bc1/raw/Scrapeless-light-logo.svg"><source width="250" media="(prefers-color-scheme: light)" srcset="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"><img alt="Scrapeless" src="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"></picture></a> | Scrapeless is the best full-stack web scraping toolkit offering Scraping API, Scraping Browser, Web Unlocker, Captcha Solver, and Proxies, designed to handle all your data collection needs. | 🥈 Silver |
|
||||
| <a href="https://dashboard.capsolver.com/passport/register?inviteCode=ESVSECTX5Q23" target="_blank"><picture><source width="120" media="(prefers-color-scheme: dark)" srcset="https://docs.crawl4ai.com/uploads/sponsors/20251013045338_72a71fa4ee4d2f40.png"><source width="120" media="(prefers-color-scheme: light)" srcset="https://www.capsolver.com/assets/images/logo-text.png"><img alt="Capsolver" src="https://www.capsolver.com/assets/images/logo-text.png"></picture></a> | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥉 Bronze |
|
||||
| <a href="https://kipo.ai" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045751_2d54f57f117c651e.png" alt="DataSync" width="120"/></a> | Helps engineers and buyers find, compare, and source electronic & industrial parts in seconds, with specs, pricing, lead times & alternatives.| 🥇 Gold |
|
||||
| <a href="https://www.kidocode.com/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045045_bb8dace3f0440d65.svg" alt="Kidocode" width="120"/><p align="center">KidoCode</p></a> | Kidocode is a hybrid technology and entrepreneurship school for kids aged 5–18, offering both online and on-campus education. | 🥇 Gold |
|
||||
| <a href="https://www.alephnull.sg/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013050323_a9e8e8c4c3650421.svg" alt="Aleph null" width="120"/></a> | Singapore-based Aleph Null is Asia’s leading edtech hub, dedicated to student-centric, AI-driven education—empowering learners with the tools to thrive in a fast-changing world. | 🥇 Gold |
|
||||
|
||||
|
||||
|
||||
### 🧑🤝 Individual Sponsors
|
||||
|
||||
A heartfelt thanks to our individual supporters! Every contribution helps us keep our open-source mission alive and thriving!
|
||||
|
||||
@@ -728,18 +728,18 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
|
||||
api_token = llm_config_dict.get('api_token') if llm_config_dict else None
|
||||
|
||||
# response = perform_completion_with_backoff(
|
||||
# provider=provider,
|
||||
# prompt_with_variables=prompt,
|
||||
# api_token=api_token,
|
||||
# json_response=True
|
||||
# )
|
||||
response = perform_completion_with_backoff(
|
||||
provider=provider,
|
||||
prompt_with_variables=prompt,
|
||||
api_token=api_token,
|
||||
json_response=True
|
||||
)
|
||||
|
||||
# variations = json.loads(response.choices[0].message.content)
|
||||
variations = json.loads(response.choices[0].message.content)
|
||||
|
||||
|
||||
# # Mock data with more variations for split
|
||||
variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
|
||||
# variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
|
||||
|
||||
|
||||
# variations = {'queries': [
|
||||
|
||||
@@ -598,7 +598,7 @@ class BrowserConfig:
|
||||
"chrome_channel": self.chrome_channel,
|
||||
"channel": self.channel,
|
||||
"proxy": self.proxy,
|
||||
"proxy_config": self.proxy_config,
|
||||
"proxy_config": self.proxy_config.to_dict() if self.proxy_config else None,
|
||||
"viewport_width": self.viewport_width,
|
||||
"viewport_height": self.viewport_height,
|
||||
"accept_downloads": self.accept_downloads,
|
||||
|
||||
@@ -617,17 +617,17 @@ class AsyncWebCrawler:
|
||||
else config.chunking_strategy
|
||||
)
|
||||
sections = chunking.chunk(content)
|
||||
# extracted_content = config.extraction_strategy.run(url, sections)
|
||||
# extracted_content = config.extraction_strategy.run(_url, sections)
|
||||
|
||||
# Use async version if available for better parallelism
|
||||
if hasattr(config.extraction_strategy, 'arun'):
|
||||
extracted_content = await config.extraction_strategy.arun(url, sections)
|
||||
extracted_content = await config.extraction_strategy.arun(_url, sections)
|
||||
else:
|
||||
# Fallback to sync version run in thread pool to avoid blocking
|
||||
extracted_content = await asyncio.to_thread(
|
||||
config.extraction_strategy.run, url, sections
|
||||
)
|
||||
|
||||
|
||||
extracted_content = json.dumps(
|
||||
extracted_content, indent=4, default=str, ensure_ascii=False
|
||||
)
|
||||
|
||||
@@ -542,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
||||
if el.tag in bypass_tags:
|
||||
continue
|
||||
|
||||
# Skip elements inside <pre> or <code> tags where whitespace is significant
|
||||
# This preserves whitespace-only spans (e.g., <span class="w"> </span>) in code blocks
|
||||
is_in_code_block = False
|
||||
ancestor = el.getparent()
|
||||
while ancestor is not None:
|
||||
if ancestor.tag in ("pre", "code"):
|
||||
is_in_code_block = True
|
||||
break
|
||||
ancestor = ancestor.getparent()
|
||||
|
||||
if is_in_code_block:
|
||||
continue
|
||||
|
||||
text_content = (el.text_content() or "").strip()
|
||||
if (
|
||||
len(text_content.split()) < word_count_threshold
|
||||
|
||||
@@ -9,6 +9,21 @@ from crawl4ai import (
|
||||
RateLimiter,
|
||||
CacheMode
|
||||
)
|
||||
from crawl4ai.extraction_strategy import ExtractionStrategy
|
||||
|
||||
class MockExtractionStrategy(ExtractionStrategy):
    """Test double that records every URL passed to ``run``.

    ``extract`` returns a fixed payload and ignores its inputs; ``run`` logs
    the URL it was given and then defers to the base-class implementation.
    """

    def __init__(self):
        super().__init__()
        # URLs received by run(), in call order; inspected by the tests.
        self.run_calls = []

    def extract(self, url: str, html: str, *args, **kwargs):
        """Return a constant single-item result regardless of the arguments."""
        canned_result = [{"test": "data"}]
        return canned_result

    def run(self, url: str, sections: List[str], *args, **kwargs):
        """Record ``url``, then delegate to ``ExtractionStrategy.run``."""
        self.run_calls += [url]
        return super().run(url, sections, *args, **kwargs)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("viewport", [
|
||||
@@ -142,8 +157,72 @@ async def test_error_handling(error_url):
|
||||
assert not result.success
|
||||
assert result.error_message is not None
|
||||
|
||||
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_regular_url():
    """
    Regression check: a regular URL reaches extraction_strategy.run unchanged.

    Crawls a URL that does not use the "raw:" prefix with a recording
    strategy, then asserts that the strategy's run() was called exactly once
    and received that same URL.
    """
    regular_url = "https://example.com"
    browser_config = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        mock_strategy = MockExtractionStrategy()
        run_config = CrawlerRunConfig(
            page_timeout=30000,
            extraction_strategy=mock_strategy,
            cache_mode=CacheMode.BYPASS,
        )

        # Regular URL case (the non-raw-HTML path).
        result = await crawler.arun(url=regular_url, config=run_config)

        assert result.success
        assert len(mock_strategy.run_calls) == 1
        assert mock_strategy.run_calls[0] == regular_url, f"Expected '{regular_url}', got '{mock_strategy.run_calls[0]}'"
|
||||
|
||||
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_raw_html():
    """
    Regression check: raw HTML input is reported to extraction_strategy.run as "Raw HTML".

    Crawls a "raw:"-prefixed HTML string with a recording strategy, then
    asserts that the strategy's run() was called exactly once and received the
    literal "Raw HTML" rather than the raw input string.
    """
    raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>"
    browser_config = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        mock_strategy = MockExtractionStrategy()
        run_config = CrawlerRunConfig(
            page_timeout=30000,
            extraction_strategy=mock_strategy,
            cache_mode=CacheMode.BYPASS,
        )

        # Raw HTML case: the input starts with the "raw:" prefix.
        result = await crawler.arun(url=raw_html_url, config=run_config)

        assert result.success
        assert len(mock_strategy.run_calls) == 1
        assert mock_strategy.run_calls[0] == "Raw HTML", f"Expected 'Raw HTML', got '{mock_strategy.run_calls[0]}'"
|
||||
|
||||
if __name__ == "__main__":
    # Manual entry point: run each async test once, in order, without pytest.
    # Each asyncio.run() call drives one test coroutine to completion before
    # the next one starts.
    asyncio.run(test_viewport_config((1024, 768)))
    asyncio.run(test_memory_management())
    asyncio.run(test_rate_limiting())
    # The source listed this call twice; it is run once here.
    asyncio.run(test_javascript_execution())
    asyncio.run(test_extraction_strategy_run_with_regular_url())
    asyncio.run(test_extraction_strategy_run_with_raw_html())
|
||||
|
||||
Reference in New Issue
Block a user