From 9546773a07508e0aa8f3904596dee02ea0c7f18a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 24 Jul 2025 21:24:40 +0800 Subject: [PATCH] fix: Move sentence-transformers to optional dependencies - Moved sentence-transformers from core to optional dependencies in pyproject.toml - Removed sentence-transformers from requirements.txt - Added proper ImportError handling with helpful installation message - This prevents ~2.5GB of NVIDIA CUDA libraries from being installed by default - Users who need embedding features can install with: pip install 'crawl4ai[transformer]' --- crawl4ai/utils.py | 8 +++++++- pyproject.toml | 9 ++++----- requirements.txt | 1 - 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 8735dee0..b2001cdd 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -3342,7 +3342,13 @@ async def get_text_embeddings( # Default: use sentence-transformers else: # Lazy load to avoid importing heavy libraries unless needed - from sentence_transformers import SentenceTransformer + try: + from sentence_transformers import SentenceTransformer + except ImportError: + raise ImportError( + "sentence-transformers is required for local embeddings. " + "Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers" + ) # Cache the model in function attribute to avoid reloading if not hasattr(get_text_embeddings, '_models'): diff --git a/pyproject.toml b/pyproject.toml index a582d430..3d70a68d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,6 @@ dependencies = [ "brotli>=1.1.0", "humanize>=4.10.0", "lark>=1.2.2", - "sentence-transformers>=2.2.0", "alphashape>=1.3.1", "shapely>=2.0.0" ] @@ -62,8 +61,8 @@ classifiers = [ [project.optional-dependencies] pdf = ["PyPDF2"] torch = ["torch", "nltk", "scikit-learn"] -transformer = ["transformers", "tokenizers"] -cosine = ["torch", "transformers", "nltk"] +transformer = ["transformers", "tokenizers", "sentence-transformers"] +cosine = ["torch", "transformers", "nltk", "sentence-transformers"] sync = ["selenium"] all = [ "PyPDF2", @@ -72,8 +71,8 @@ all = [ "scikit-learn", "transformers", "tokenizers", - "selenium", - "PyPDF2" + "sentence-transformers", + "selenium" ] [project.scripts] diff --git a/requirements.txt b/requirements.txt index 37fc7959..001d090d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,7 +24,6 @@ cssselect>=1.2.0 chardet>=5.2.0 brotli>=1.1.0 httpx[http2]>=0.27.2 -sentence-transformers>=2.2.0 alphashape>=1.3.1 shapely>=2.0.0