fix: Resolve unexpected BrowserContext closure during crawl in Docker
- Removed the __del__ method in AsyncPlaywrightCrawlerStrategy to ensure reliable browser lifecycle management by using explicit context managers.
- Added process monitoring in ManagedBrowser to detect and log unexpected terminations of the browser subprocess.
- Updated the Docker configuration to expose port 9222 for remote debugging and to allocate extra shared memory to prevent browser crashes.
- Improved error handling and resource cleanup for browser instances, particularly in Docker environments.
Resolves Issue #256
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -199,6 +199,7 @@ test_env/
|
||||
**/.DS_Store
|
||||
|
||||
todo.md
|
||||
todo_executor.md
|
||||
git_changes.py
|
||||
git_changes.md
|
||||
pypi_build.sh
|
||||
|
||||
@@ -115,7 +115,12 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/health || exit 1
|
||||
|
||||
# Expose port
|
||||
EXPOSE 8000
|
||||
EXPOSE 8000 11235 9222 8080
|
||||
|
||||
# Optional: Increase shared memory size to prevent browser crashes
|
||||
# when loading heavy pages
|
||||
RUN mkdir /dev/shm
|
||||
VOLUME /dev/shm
|
||||
|
||||
# Start the FastAPI server
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
|
||||
@@ -1,4 +1,4 @@
|
||||
# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scrapper
|
||||
# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper
|
||||
|
||||
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||
|
||||
@@ -127,6 +127,9 @@ docker pull unclecode/crawl4ai:gpu # GPU-enabled version
|
||||
|
||||
# Run the container
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version
|
||||
|
||||
# To allocate more shared memory for the container
|
||||
docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic
|
||||
```
|
||||
|
||||
#### Option 2: Build from Repository
|
||||
|
||||
@@ -26,5 +26,5 @@ if is_sync_version_installed():
|
||||
print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.")
|
||||
else:
|
||||
WebCrawler = None
|
||||
import warnings
|
||||
print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
|
||||
# import warnings
|
||||
# print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
|
||||
@@ -64,12 +64,27 @@ class ManagedBrowser:
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE
|
||||
)
|
||||
# Monitor browser process output for errors
|
||||
asyncio.create_task(self._monitor_browser_process())
|
||||
await asyncio.sleep(2) # Give browser time to start
|
||||
return f"http://localhost:{self.debugging_port}"
|
||||
except Exception as e:
|
||||
await self.cleanup()
|
||||
raise Exception(f"Failed to start browser: {e}")
|
||||
|
||||
async def _monitor_browser_process(self):
    """Monitor the browser process for unexpected termination.

    Drains the subprocess's stdout and stderr pipes in worker threads
    (``read()`` returns only once each pipe reaches EOF), then, if the
    process has exited, prints its exit code and captured output and
    awaits ``self.cleanup()``.

    Does nothing when ``self.browser_process`` is falsy, and returns
    without calling cleanup when the pipes closed but the process is
    still alive after a short grace period.
    """
    # Snapshot the handle once: the attribute may be replaced or cleared
    # by other coroutines while this one is suspended on the awaits below.
    process = self.browser_process
    if not process:
        return

    # Blocking pipe reads are pushed to threads so the event loop stays free.
    stdout, stderr = await asyncio.gather(
        asyncio.to_thread(process.stdout.read),
        asyncio.to_thread(process.stderr.read),
    )

    # EOF on the pipes can be observed slightly before the exit status is
    # collectable, so a bare poll() here may still report "running".
    # A bounded wait closes that gap without hanging on a live process.
    try:
        returncode = await asyncio.to_thread(process.wait, timeout=5)
    except subprocess.TimeoutExpired:
        return

    print(f"Browser process terminated unexpectedly with code {returncode}")
    # Browser output is not guaranteed to be valid UTF-8; a strict decode
    # would raise here and skip the cleanup below.
    print(f"STDOUT: {stdout.decode(errors='replace')}")
    print(f"STDERR: {stderr.decode(errors='replace')}")
    await self.cleanup()
|
||||
|
||||
def _get_browser_path(self) -> str:
|
||||
"""Returns the browser executable path based on OS and browser type"""
|
||||
if sys.platform == "darwin": # macOS
|
||||
@@ -330,9 +345,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
await self.playwright.stop()
|
||||
self.playwright = None
|
||||
|
||||
def __del__(self):
|
||||
if self.browser or self.playwright:
|
||||
asyncio.get_event_loop().run_until_complete(self.close())
|
||||
# Issue #256: Remove __del__ method to avoid potential issues with async cleanup
|
||||
# def __del__(self):
|
||||
# if self.browser or self.playwright:
|
||||
# asyncio.get_event_loop().run_until_complete(self.close())
|
||||
|
||||
def set_hook(self, hook_type: str, hook: Callable):
|
||||
if hook_type in self.hooks:
|
||||
|
||||
@@ -47,17 +47,17 @@ class AsyncWebCrawler:
|
||||
|
||||
async def awarmup(self):
|
||||
# Print a message for crawl4ai and its version
|
||||
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
|
||||
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
|
||||
# await async_db_manager.ainit_db()
|
||||
await async_db_manager.initialize()
|
||||
await self.arun(
|
||||
url="https://google.com/",
|
||||
word_count_threshold=5,
|
||||
bypass_cache=False,
|
||||
verbose=False,
|
||||
)
|
||||
# await self.arun(
|
||||
# url="https://google.com/",
|
||||
# word_count_threshold=5,
|
||||
# bypass_cache=False,
|
||||
# verbose=False,
|
||||
# )
|
||||
self.ready = True
|
||||
if self.verbose:
|
||||
print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
|
||||
|
||||
@@ -51,3 +51,5 @@ SOCIAL_MEDIA_DOMAINS = [
|
||||
# If image format is in jpg, png or webp
|
||||
# If image is in the first half of the total images extracted from the page
|
||||
IMAGE_SCORE_THRESHOLD = 2
|
||||
|
||||
MAX_METRICS_HISTORY = 1000
|
||||
@@ -10,6 +10,7 @@ from .extraction_strategy import *
|
||||
from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .content_scrapping_strategy import WebScrappingStrategy
|
||||
from .config import *
|
||||
import warnings
|
||||
import json
|
||||
@@ -181,7 +182,21 @@ class WebCrawler:
|
||||
# Extract content from HTML
|
||||
try:
|
||||
t1 = time.time()
|
||||
result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
|
||||
scrapping_strategy = WebScrappingStrategy()
|
||||
extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]}
|
||||
result = scrapping_strategy.scrap(
|
||||
url,
|
||||
html,
|
||||
word_count_threshold=word_count_threshold,
|
||||
css_selector=css_selector,
|
||||
only_text=kwargs.get("only_text", False),
|
||||
image_description_min_word_threshold=kwargs.get(
|
||||
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
),
|
||||
**extra_params,
|
||||
)
|
||||
|
||||
# result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user