fix: Resolve unexpected BrowserContext closure during crawl in Docker

- Removed the __del__ method from AsyncPlaywrightCrawlerStrategy to ensure reliable browser lifecycle management through explicit context managers.
- Added process monitoring in ManagedBrowser to detect and log unexpected terminations of the browser subprocess.
- Updated the Docker configuration to expose port 9222 for remote debugging and to allocate extra shared memory to prevent browser crashes.
- Improved error handling and resource cleanup for browser instances, particularly in Docker environments.

Resolves Issue #256
2024-11-13 15:37:16 +08:00
parent b6d6631b12
commit bf91adf3f8
8 changed files with 57 additions and 15 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -199,6 +199,7 @@ test_env/
 **/.DS_Store

 todo.md
+todo_executor.md
 git_changes.py
 git_changes.md
 pypi_build.sh
--- a/Dockerfile
+++ b/Dockerfile
@@ -115,7 +115,12 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

 # Expose port
-EXPOSE 8000
+EXPOSE 8000 11235 9222 8080
+
+# Optional: Increase shared memory size to prevent browser crashes
+# when loading heavy pages
+RUN mkdir /dev/shm
+VOLUME /dev/shm

 # Start the FastAPI server
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scrapper
+# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper

 <a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

@@ -127,6 +127,9 @@ docker pull unclecode/crawl4ai:gpu      # GPU-enabled version

 # Run the container
 docker run -p 11235:11235 unclecode/crawl4ai:basic  # Replace 'basic' with your chosen version
+
+# To allocate more shared memory for the container
+docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic
 ```

 #### Option 2: Build from Repository
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -26,5 +26,5 @@ if is_sync_version_installed():
        print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.")
 else:
    WebCrawler = None
-    import warnings
-    print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
+    # import warnings
+    # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -64,12 +64,27 @@ class ManagedBrowser:
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
+            # Monitor browser process output for errors
+            asyncio.create_task(self._monitor_browser_process())
            await asyncio.sleep(2)  # Give browser time to start
            return f"http://localhost:{self.debugging_port}"
        except Exception as e:
            await self.cleanup()
            raise Exception(f"Failed to start browser: {e}")

+    async def _monitor_browser_process(self):
+        """Monitor the browser process for unexpected termination."""
+        if self.browser_process:
+            stdout, stderr = await asyncio.gather(
+                asyncio.to_thread(self.browser_process.stdout.read),
+                asyncio.to_thread(self.browser_process.stderr.read)
+            )
+            if self.browser_process.poll() is not None:
+                print(f"Browser process terminated unexpectedly with code {self.browser_process.returncode}")
+                print(f"STDOUT: {stdout.decode()}")
+                print(f"STDERR: {stderr.decode()}")
+                await self.cleanup()
+    
    def _get_browser_path(self) -> str:
        """Returns the browser executable path based on OS and browser type"""
        if sys.platform == "darwin":  # macOS
@@ -330,9 +345,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            await self.playwright.stop()
            self.playwright = None

-    def __del__(self):
-        if self.browser or self.playwright:
-            asyncio.get_event_loop().run_until_complete(self.close())
+    # Issue #256: Removed the __del__ method to avoid potential issues with async cleanup
+    # def __del__(self):
+    #     if self.browser or self.playwright:
+    #         asyncio.get_event_loop().run_until_complete(self.close())

    def set_hook(self, hook_type: str, hook: Callable):
        if hook_type in self.hooks:
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -47,17 +47,17 @@ class AsyncWebCrawler:

    async def awarmup(self):
        # Print a message for crawl4ai and its version
-        print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
        if self.verbose:
+            print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
            print("[LOG] 🌤️  Warming up the AsyncWebCrawler")
        # await async_db_manager.ainit_db()
        await async_db_manager.initialize()
-        await self.arun(
-            url="https://google.com/",
-            word_count_threshold=5,
-            bypass_cache=False,
-            verbose=False,
-        )
+        # await self.arun(
+        #     url="https://google.com/",
+        #     word_count_threshold=5,
+        #     bypass_cache=False,
+        #     verbose=False,
+        # )
        self.ready = True
        if self.verbose:
            print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -51,3 +51,5 @@ SOCIAL_MEDIA_DOMAINS = [
 # If image format is in jpg, png or webp
 # If image is in the first half of the total images extracted from the page
 IMAGE_SCORE_THRESHOLD = 2
+
+MAX_METRICS_HISTORY = 1000
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -10,6 +10,7 @@ from .extraction_strategy import *
 from .crawler_strategy import *
 from typing import List
 from concurrent.futures import ThreadPoolExecutor
+from .content_scrapping_strategy import WebScrappingStrategy
 from .config import *
 import warnings
 import json
@@ -181,7 +182,21 @@ class WebCrawler:
            # Extract content from HTML
            try:
                t1 = time.time()
-                result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
+                scrapping_strategy = WebScrappingStrategy()
+                extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]}
+                result = scrapping_strategy.scrap(
+                    url,
+                    html,
+                    word_count_threshold=word_count_threshold,
+                    css_selector=css_selector,
+                    only_text=kwargs.get("only_text", False),
+                    image_description_min_word_threshold=kwargs.get(
+                        "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
+                    ),
+                    **extra_params,
+                )
+                
+                # result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
                if verbose:
                    print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds")