diff --git a/.gitignore b/.gitignore index 4c3e151e..aca02959 100644 --- a/.gitignore +++ b/.gitignore @@ -199,6 +199,7 @@ test_env/ **/.DS_Store todo.md +todo_executor.md git_changes.py git_changes.md pypi_build.sh diff --git a/Dockerfile b/Dockerfile index 9a921d03..125fb9b8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -115,7 +115,12 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \ CMD curl -f http://localhost:8000/health || exit 1 # Expose port -EXPOSE 8000 +EXPOSE 8000 11235 9222 8080 + +# Optional: Increase shared memory size to prevent browser crashes +# when loading heavy pages +RUN mkdir /dev/shm +VOLUME /dev/shm # Start the FastAPI server CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"] \ No newline at end of file diff --git a/README.md b/README.md index e1a64aa1..d250f936 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scrapper +# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper unclecode%2Fcrawl4ai | Trendshift @@ -127,6 +127,9 @@ docker pull unclecode/crawl4ai:gpu # GPU-enabled version # Run the container docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version + +# To allocate more shared memory for the container +docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic ``` #### Option 2: Build from Repository diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 0c6a2db4..1bcc491c 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -26,5 +26,5 @@ if is_sync_version_installed(): print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.") else: WebCrawler = None - import warnings - print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. 
However, please note that the synchronous version will be deprecated soon.") \ No newline at end of file + # import warnings + # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.") \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 896a0644..57288b59 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -64,12 +64,27 @@ class ManagedBrowser: stdout=subprocess.PIPE, stderr=subprocess.PIPE ) + # Monitor browser process output for errors + asyncio.create_task(self._monitor_browser_process()) await asyncio.sleep(2) # Give browser time to start return f"http://localhost:{self.debugging_port}" except Exception as e: await self.cleanup() raise Exception(f"Failed to start browser: {e}") + async def _monitor_browser_process(self): + """Monitor the browser process for unexpected termination.""" + if self.browser_process: + stdout, stderr = await asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + asyncio.to_thread(self.browser_process.stderr.read) + ) + if self.browser_process.poll() is not None: + print(f"Browser process terminated unexpectedly with code {self.browser_process.returncode}") + print(f"STDOUT: {stdout.decode()}") + print(f"STDERR: {stderr.decode()}") + await self.cleanup() + def _get_browser_path(self) -> str: """Returns the browser executable path based on OS and browser type""" if sys.platform == "darwin": # macOS @@ -330,9 +345,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.playwright.stop() self.playwright = None - def __del__(self): - if self.browser or self.playwright: - asyncio.get_event_loop().run_until_complete(self.close()) + # Issue #256: Remove __del__ method to avoid potential issues with async cleanup + # def __del__(self): + # if self.browser or self.playwright: + 
# asyncio.get_event_loop().run_until_complete(self.close()) def set_hook(self, hook_type: str, hook: Callable): if hook_type in self.hooks: diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ceb9ad28..f580776b 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -47,17 +47,17 @@ class AsyncWebCrawler: async def awarmup(self): # Print a message for crawl4ai and its version - print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") if self.verbose: + print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") print("[LOG] 🌤️ Warming up the AsyncWebCrawler") # await async_db_manager.ainit_db() await async_db_manager.initialize() - await self.arun( - url="https://google.com/", - word_count_threshold=5, - bypass_cache=False, - verbose=False, - ) + # await self.arun( + # url="https://google.com/", + # word_count_threshold=5, + # bypass_cache=False, + # verbose=False, + # ) self.ready = True if self.verbose: print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") diff --git a/crawl4ai/config.py b/crawl4ai/config.py index a07ca977..16638b6d 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -51,3 +51,5 @@ SOCIAL_MEDIA_DOMAINS = [ # If image format is in jpg, png or webp # If image is in the first half of the total images extracted from the page IMAGE_SCORE_THRESHOLD = 2 + +MAX_METRICS_HISTORY = 1000 \ No newline at end of file diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 20e9b04e..95af6c7a 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -10,6 +10,7 @@ from .extraction_strategy import * from .crawler_strategy import * from typing import List from concurrent.futures import ThreadPoolExecutor +from .content_scrapping_strategy import WebScrappingStrategy from .config import * import warnings import json @@ -181,7 +182,21 @@ class WebCrawler: # Extract content from HTML try: t1 = time.time() - result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, 
only_text=kwargs.get("only_text", False)) + scrapping_strategy = WebScrappingStrategy() + extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]} + result = scrapping_strategy.scrap( + url, + html, + word_count_threshold=word_count_threshold, + css_selector=css_selector, + only_text=kwargs.get("only_text", False), + image_description_min_word_threshold=kwargs.get( + "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD + ), + **extra_params, + ) + + # result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) if verbose: print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds")