fix: Resolve unexpected BrowserContext closure during crawl in Docker
- Removed the __del__ method in AsyncPlaywrightCrawlerStrategy to ensure reliable browser lifecycle management by using explicit context managers.
- Added process monitoring in ManagedBrowser to detect and log unexpected terminations of the browser subprocess.
- Updated the Docker configuration to expose port 9222 for remote debugging and allocate extra shared memory to prevent browser crashes.
- Improved error handling and resource cleanup for browser instances, particularly in Docker environments.
Resolves Issue #256.
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -199,6 +199,7 @@ test_env/
|
|||||||
**/.DS_Store
|
**/.DS_Store
|
||||||
|
|
||||||
todo.md
|
todo.md
|
||||||
|
todo_executor.md
|
||||||
git_changes.py
|
git_changes.py
|
||||||
git_changes.md
|
git_changes.md
|
||||||
pypi_build.sh
|
pypi_build.sh
|
||||||
|
|||||||
@@ -115,7 +115,12 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
|
|||||||
CMD curl -f http://localhost:8000/health || exit 1
|
CMD curl -f http://localhost:8000/health || exit 1
|
||||||
|
|
||||||
# Expose port
|
# Expose port
|
||||||
EXPOSE 8000
|
EXPOSE 8000 11235 9222 8080
|
||||||
|
|
||||||
|
# Optional: Increase shared memory size to prevent browser crashes
|
||||||
|
# when loading heavy pages
|
||||||
|
RUN mkdir /dev/shm
|
||||||
|
VOLUME /dev/shm
|
||||||
|
|
||||||
# Start the FastAPI server
|
# Start the FastAPI server
|
||||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
|
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scrapper
|
# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper
|
||||||
|
|
||||||
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||||
|
|
||||||
@@ -127,6 +127,9 @@ docker pull unclecode/crawl4ai:gpu # GPU-enabled version
|
|||||||
|
|
||||||
# Run the container
|
# Run the container
|
||||||
docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version
|
docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version
|
||||||
|
|
||||||
|
# To allocate more shared memory for the container
|
||||||
|
docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Option 2: Build from Repository
|
#### Option 2: Build from Repository
|
||||||
|
|||||||
@@ -26,5 +26,5 @@ if is_sync_version_installed():
|
|||||||
print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.")
|
print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.")
|
||||||
else:
|
else:
|
||||||
WebCrawler = None
|
WebCrawler = None
|
||||||
import warnings
|
# import warnings
|
||||||
print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
|
# print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
|
||||||
@@ -64,12 +64,27 @@ class ManagedBrowser:
|
|||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.PIPE
|
stderr=subprocess.PIPE
|
||||||
)
|
)
|
||||||
|
# Monitor browser process output for errors
|
||||||
|
asyncio.create_task(self._monitor_browser_process())
|
||||||
await asyncio.sleep(2) # Give browser time to start
|
await asyncio.sleep(2) # Give browser time to start
|
||||||
return f"http://localhost:{self.debugging_port}"
|
return f"http://localhost:{self.debugging_port}"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
await self.cleanup()
|
await self.cleanup()
|
||||||
raise Exception(f"Failed to start browser: {e}")
|
raise Exception(f"Failed to start browser: {e}")
|
||||||
|
|
||||||
|
async def _monitor_browser_process(self):
|
||||||
|
"""Monitor the browser process for unexpected termination."""
|
||||||
|
if self.browser_process:
|
||||||
|
stdout, stderr = await asyncio.gather(
|
||||||
|
asyncio.to_thread(self.browser_process.stdout.read),
|
||||||
|
asyncio.to_thread(self.browser_process.stderr.read)
|
||||||
|
)
|
||||||
|
if self.browser_process.poll() is not None:
|
||||||
|
print(f"Browser process terminated unexpectedly with code {self.browser_process.returncode}")
|
||||||
|
print(f"STDOUT: {stdout.decode()}")
|
||||||
|
print(f"STDERR: {stderr.decode()}")
|
||||||
|
await self.cleanup()
|
||||||
|
|
||||||
def _get_browser_path(self) -> str:
|
def _get_browser_path(self) -> str:
|
||||||
"""Returns the browser executable path based on OS and browser type"""
|
"""Returns the browser executable path based on OS and browser type"""
|
||||||
if sys.platform == "darwin": # macOS
|
if sys.platform == "darwin": # macOS
|
||||||
@@ -330,9 +345,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
await self.playwright.stop()
|
await self.playwright.stop()
|
||||||
self.playwright = None
|
self.playwright = None
|
||||||
|
|
||||||
def __del__(self):
|
# Issue #256: Remove __del__ method to avoid potential issues with async cleanup
|
||||||
if self.browser or self.playwright:
|
# def __del__(self):
|
||||||
asyncio.get_event_loop().run_until_complete(self.close())
|
# if self.browser or self.playwright:
|
||||||
|
# asyncio.get_event_loop().run_until_complete(self.close())
|
||||||
|
|
||||||
def set_hook(self, hook_type: str, hook: Callable):
|
def set_hook(self, hook_type: str, hook: Callable):
|
||||||
if hook_type in self.hooks:
|
if hook_type in self.hooks:
|
||||||
|
|||||||
@@ -47,17 +47,17 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
async def awarmup(self):
|
async def awarmup(self):
|
||||||
# Print a message for crawl4ai and its version
|
# Print a message for crawl4ai and its version
|
||||||
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
|
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
|
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
|
||||||
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
|
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
|
||||||
# await async_db_manager.ainit_db()
|
# await async_db_manager.ainit_db()
|
||||||
await async_db_manager.initialize()
|
await async_db_manager.initialize()
|
||||||
await self.arun(
|
# await self.arun(
|
||||||
url="https://google.com/",
|
# url="https://google.com/",
|
||||||
word_count_threshold=5,
|
# word_count_threshold=5,
|
||||||
bypass_cache=False,
|
# bypass_cache=False,
|
||||||
verbose=False,
|
# verbose=False,
|
||||||
)
|
# )
|
||||||
self.ready = True
|
self.ready = True
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
|
print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
|
||||||
|
|||||||
@@ -51,3 +51,5 @@ SOCIAL_MEDIA_DOMAINS = [
|
|||||||
# If image format is in jpg, png or webp
|
# If image format is in jpg, png or webp
|
||||||
# If image is in the first half of the total images extracted from the page
|
# If image is in the first half of the total images extracted from the page
|
||||||
IMAGE_SCORE_THRESHOLD = 2
|
IMAGE_SCORE_THRESHOLD = 2
|
||||||
|
|
||||||
|
MAX_METRICS_HISTORY = 1000
|
||||||
@@ -10,6 +10,7 @@ from .extraction_strategy import *
|
|||||||
from .crawler_strategy import *
|
from .crawler_strategy import *
|
||||||
from typing import List
|
from typing import List
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from .content_scrapping_strategy import WebScrappingStrategy
|
||||||
from .config import *
|
from .config import *
|
||||||
import warnings
|
import warnings
|
||||||
import json
|
import json
|
||||||
@@ -181,7 +182,21 @@ class WebCrawler:
|
|||||||
# Extract content from HTML
|
# Extract content from HTML
|
||||||
try:
|
try:
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
|
scrapping_strategy = WebScrappingStrategy()
|
||||||
|
extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]}
|
||||||
|
result = scrapping_strategy.scrap(
|
||||||
|
url,
|
||||||
|
html,
|
||||||
|
word_count_threshold=word_count_threshold,
|
||||||
|
css_selector=css_selector,
|
||||||
|
only_text=kwargs.get("only_text", False),
|
||||||
|
image_description_min_word_threshold=kwargs.get(
|
||||||
|
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||||
|
),
|
||||||
|
**extra_params,
|
||||||
|
)
|
||||||
|
|
||||||
|
# result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds")
|
print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user