diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index aadcda20..f635f60b 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -10,7 +10,7 @@ from functools import partial from .model_loader import * import math -import numpy as np + class ExtractionStrategy(ABC): """ Abstract base class for all extraction strategies. @@ -219,6 +219,8 @@ class CosineStrategy(ExtractionStrategy): """ super().__init__() + import numpy as np + self.semantic_filter = semantic_filter self.word_count_threshold = word_count_threshold self.max_dist = max_dist diff --git a/docs/md/installation.md b/docs/md/installation.md index 7e705b7b..0cfcc9b6 100644 --- a/docs/md/installation.md +++ b/docs/md/installation.md @@ -7,33 +7,60 @@ There are three ways to use Crawl4AI: ## Library Installation -To install Crawl4AI as a library, follow these steps: +Crawl4AI offers flexible installation options to suit various use cases. Choose the option that best fits your needs: -1. Install the package from GitHub: -``` -virtualenv venv -source venv/bin/activate -pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git" -``` +1. **Default Installation** (Basic functionality): + ```bash + virtualenv venv + source venv/bin/activate + pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git" + ``` + Use this for basic web crawling and scraping tasks. -💡 Better to run the following CLI-command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once. -``` +2. **Installation with PyTorch** (For advanced text clustering): + ```bash + virtualenv venv + source venv/bin/activate + pip install "crawl4ai[torch] @ git+https://github.com/unclecode/crawl4ai.git" + ``` + Choose this if you need the `CosineStrategy` clustering strategy. + +3. 
**Installation with Transformers** (For summarization and Hugging Face models): + ```bash + virtualenv venv + source venv/bin/activate + pip install "crawl4ai[transformer] @ git+https://github.com/unclecode/crawl4ai.git" + ``` + Opt for this if you require text summarization or plan to use Hugging Face models. + +4. **Full Installation** (All features): + ```bash + virtualenv venv + source venv/bin/activate + pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git" + ``` + This installs all dependencies for full functionality. + +5. **Development Installation** (For contributors): + ```bash + virtualenv venv + source venv/bin/activate + git clone https://github.com/unclecode/crawl4ai.git + cd crawl4ai + pip install -e ".[all]" + ``` + Use this if you plan to modify the source code. + +💡 After installation, it's recommended to run the following CLI command to load the required models. This is optional but will boost the performance and speed of the crawler. You need to do this only once: +```bash crawl4ai-download-models ``` -2. Alternatively, you can clone the repository and install the package locally: -``` -virtualenv venv -source venv/bin/activate -git clone https://github.com/unclecode/crawl4ai.git -cd crawl4ai -pip install -e .[all] -``` - ## Using Docker for Local Server -3. Use Docker to run the local server: -``` +To run Crawl4AI as a local server using Docker: + +```bash # For Mac users # docker build --platform linux/amd64 -t crawl4ai . # For other users @@ -43,4 +70,4 @@ docker run -d -p 8000:80 crawl4ai ## Using Google Colab -You can also use Crawl4AI in a Google Colab notebook for easy setup and experimentation. Simply open the following Colab notebook and follow the instructions: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk) +You can also use Crawl4AI in a Google Colab notebook for easy setup and experimentation. 
Simply open the following Colab notebook and follow the instructions: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk) \ No newline at end of file diff --git a/setup.py b/setup.py index a11abc2e..6faf1d9b 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,5 @@ from setuptools import setup, find_packages import os -import sys from pathlib import Path import subprocess from setuptools.command.install import install @@ -14,16 +13,10 @@ os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True) with open("requirements.txt") as f: requirements = f.read().splitlines() -# Read the requirements from requirements.txt -with open("requirements.crawl.txt") as f: - requirements_crawl_only = f.read().splitlines() - # Define the requirements for different environments -requirements_without_torch = [req for req in requirements if not req.startswith("torch")] -requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")] -requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")] -requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")] -requirements_crawl_only = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")] +default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "numpy"))] +torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))] +transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))] class CustomInstallCommand(install): """Customized setuptools install command to install spacy 
without dependencies.""" @@ -42,11 +35,11 @@ setup( author_email="unclecode@kidocode.com", license="MIT", packages=find_packages(), - install_requires=requirements_without_torch_transformers_nlkt, + install_requires=default_requirements, extras_require={ - "all": requirements, # Include all requirements - "colab": requirements_without_torch, # Exclude torch for Colab - "crawl": requirements_crawl_only, # Include only crawl requirements + "torch": torch_requirements, + "transformer": transformer_requirements, + "all": requirements, }, cmdclass={ 'install': CustomInstallCommand, @@ -67,4 +60,4 @@ setup( "Programming Language :: Python :: 3.10", ], python_requires=">=3.7", -) +) \ No newline at end of file