fix: Resolve unexpected BrowserContext closure during crawl in Docker

- Removed __del__ method in AsyncPlaywrightCrawlerStrategy to ensure reliable browser lifecycle management by using explicit context managers.
- Added process monitoring in ManagedBrowser to detect and log unexpected terminations of the browser subprocess.
- Updated Docker configuration to expose port 9222 for remote debugging and allocate extra shared memory to prevent browser crashes.
- Improved error handling and resource cleanup for browser instances, particularly in Docker environments.

Resolves Issue #256
2024-11-13 15:37:16 +08:00
parent b6d6631b12
commit bf91adf3f8
8 changed files with 57 additions and 15 deletions
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -10,6 +10,7 @@ from .extraction_strategy import *
 from .crawler_strategy import *
 from typing import List
 from concurrent.futures import ThreadPoolExecutor
+from .content_scrapping_strategy import WebScrappingStrategy
 from .config import *
 import warnings
 import json
@@ -181,7 +182,21 @@ class WebCrawler:
            # Extract content from HTML
            try:
                t1 = time.time()
-                result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
+                scrapping_strategy = WebScrappingStrategy()
+                extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]}
+                result = scrapping_strategy.scrap(
+                    url,
+                    html,
+                    word_count_threshold=word_count_threshold,
+                    css_selector=css_selector,
+                    only_text=kwargs.get("only_text", False),
+                    image_description_min_word_threshold=kwargs.get(
+                        "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
+                    ),
+                    **extra_params,
+                )
+                
+                # result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
                if verbose:
                    print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds")