Merge branch 'main' of https://github.com/unclecode/crawl4ai
.gitignore (vendored): 1 change
@@ -174,3 +174,4 @@ requirements0.txt
 a.txt
 
 *.sh
+.idea
Dockerfile: 86 changes
@@ -1,43 +1,77 @@
-# Use an official Python runtime as a parent image
-FROM python:3.10-slim
+# First stage: Build and install dependencies
+FROM python:3.10-slim-bookworm as builder
 
 # Set the working directory in the container
 WORKDIR /usr/src/app
 
-# Copy the current directory contents into the container at /usr/src/app
-COPY . .
-
-# Install dependencies for Chrome and ChromeDriver
-RUN apt-get update && apt-get install -y --no-install-recommends \
+# Install build dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
     wget \
-    xvfb \
-    unzip \
     curl \
-    gnupg2 \
-    ca-certificates \
-    apt-transport-https \
-    software-properties-common \
-    && mkdir -p /etc/apt/keyrings \
-    && curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
-    && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
-    && apt-get update \
-    && apt-get install -y google-chrome-stable \
-    && rm -rf /var/lib/apt/lists/* \
-    && apt-get install -y chromium-chromedriver
+    unzip
 
 # Install Python dependencies
-RUN pip install --no-cache-dir -r requirements.txt
-RUN pip install spacy torch torchvision torchaudio
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \
+    python -m spacy download en_core_web_sm
+
+# Download and install ChromeDriver
+RUN CHROMEDRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE) && \
+    wget -N https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip -P /tmp && \
+    unzip /tmp/chromedriver_linux64.zip -d /tmp && \
+    mv /tmp/chromedriver /usr/local/bin/chromedriver && \
+    chmod +x /usr/local/bin/chromedriver && \
+    rm /tmp/chromedriver_linux64.zip
+
+# Second stage: Create final runtime image
+FROM python:3.10-slim-bookworm
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Install runtime dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    wget \
+    git \
+    xvfb \
+    gnupg2 \
+    ca-certificates \
+    apt-transport-https \
+    software-properties-common && \
+    wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - && \
+    echo "deb http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends google-chrome-stable && \
+    rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/google-chrome.list
 
-# Set display port and dbus env to avoid hanging
-ENV DISPLAY=:99
-ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
+# Copy Chromedriver from the builder stage
+COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
+
+# Copy installed Python packages from builder stage
+COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+
+# Copy the rest of the application code
+COPY . .
+
+# Set environment to use Chrome and ChromeDriver properly
+ENV CHROME_BIN=/usr/bin/google-chrome \
+    CHROMEDRIVER=/usr/local/bin/chromedriver \
+    DISPLAY=:99 \
+    DBUS_SESSION_BUS_ADDRESS=/dev/null \
+    PYTHONUNBUFFERED=1
+
+# Ensure the PATH environment variable includes the location of the installed packages
+ENV PATH /usr/local/bin:$PATH
 
 # Make port 80 available to the world outside this container
 EXPOSE 80
 
-# Define environment variable
-ENV PYTHONUNBUFFERED 1
-
 # Run uvicorn
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
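The rewritten Dockerfile splits the image into a builder stage (Python wheels, the spaCy model, ChromeDriver) and a slimmer runtime stage that copies the built artifacts across. The image still serves uvicorn on port 80, so building and running it the usual way, e.g. `docker build -t crawl4ai .` followed by `docker run -p 80:80 crawl4ai` (the tag name is illustrative, not from this commit), should behave as before.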
Dockerfile-version-0: 45 additions (new file)
@@ -0,0 +1,45 @@
+# Use an official Python runtime as a parent image
+FROM python:3.10-slim
+# In case you had some weird issues, try this Image
+# FROM python:3.10-slim-bookworm as builder
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Copy the current directory contents into the container at /usr/src/app
+COPY . .
+
+# Install dependencies for Chrome and ChromeDriver
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    wget \
+    xvfb \
+    unzip \
+    curl \
+    gnupg2 \
+    ca-certificates \
+    apt-transport-https \
+    software-properties-common \
+    && mkdir -p /etc/apt/keyrings \
+    && curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
+    && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
+    && apt-get update \
+    && apt-get install -y google-chrome-stable \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get install -y chromium-chromedriver
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install spacy torch torchvision torchaudio
+
+# Set display port and dbus env to avoid hanging
+ENV DISPLAY=:99
+ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
+
+# Make port 80 available to the world outside this container
+EXPOSE 80
+
+# Define environment variable
+ENV PYTHONUNBUFFERED 1
+
+# Run uvicorn
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
README.md: 12 changes
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.0 🕷️🤖
+# Crawl4AI v0.2.2 🕷️🤖
 
 [![Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
 [![Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
@@ -10,8 +10,14 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
 
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
 
-## Recent Changes v0.2.0
+## Recent Changes
 
+### v0.2.2
+- Support multiple JS scripts
+- Fixed some bugs
+- Resolved a few issues relevant to Colab installation
+
+### v0.2.0
 - 🚀 10x faster!!
 - 📜 Execute custom JavaScript before crawling!
 - 🤝 Colab friendly!
@@ -30,8 +36,6 @@ from crawl4ai import WebCrawler
 # Create the WebCrawler instance
 crawler = WebCrawler()
 
-
-
 # Run the crawler with keyword filtering and CSS selector
 result = crawler.run(url="https://www.nbcnews.com/business")
 print(result) # {url, html, markdown, extracted_content, metadata}
@@ -103,12 +103,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         )
 
         # Execute JS code if provided
-        if self.js_code:
+        if self.js_code and type(self.js_code) == str:
             self.driver.execute_script(self.js_code)
             # Optionally, wait for some condition after executing the JS code
             WebDriverWait(self.driver, 10).until(
                 lambda driver: driver.execute_script("return document.readyState") == "complete"
             )
+        elif self.js_code and type(self.js_code) == list:
+            for js in self.js_code:
+                self.driver.execute_script(js)
+                WebDriverWait(self.driver, 10).until(
+                    lambda driver: driver.execute_script("return document.readyState") == "complete"
+                )
 
         html = self.driver.page_source
 
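With this change, js_code may be a single script (str) or a list of scripts, each executed in turn with a document.readyState wait after every one. A minimal usage sketch, mirroring the multiple_scrip example added to the quickstart later in this diff; the module path for LocalSeleniumCrawlerStrategy is an assumption based on the class shown here, and the URL and button text are the demo values from that example:

from crawl4ai import WebCrawler
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy  # assumed module path

# Two copies of the same click script: the new elif branch runs each entry
# through driver.execute_script() and waits for readyState == "complete".
js_code = ["""
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""] * 2

crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
result = crawler.run(url="https://www.nbcnews.com/business")
print(result)  # {url, html, markdown, extracted_content, metadata}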
@@ -188,11 +188,12 @@ class CosineStrategy(ExtractionStrategy):
         if self.verbose:
             print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
 
-        if False and self.device.type == "cpu":
-            self.model = load_onnx_all_MiniLM_l6_v2()
-            self.tokenizer = self.model.tokenizer
-            self.get_embedding_method = "direct"
-        else:
-            self.tokenizer, self.model = load_bge_small_en_v1_5()
-            self.model.eval()
-            self.get_embedding_method = "batch"
+        # if False and self.device.type == "cpu":
+        #     self.model = load_onnx_all_MiniLM_l6_v2()
+        #     self.tokenizer = self.model.tokenizer
+        #     self.get_embedding_method = "direct"
+        # else:
+
+        self.tokenizer, self.model = load_bge_small_en_v1_5()
+        self.model.eval()
+        self.get_embedding_method = "batch"
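The `if False and ...` guard already made the ONNX branch unreachable; this hunk comments the dead branch out explicitly, so CosineStrategy now always loads its embeddings via load_bge_small_en_v1_5(). This pairs with the model-loader hunks below, which stop downloading the ONNX model by default.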
@@ -2,6 +2,7 @@ from functools import lru_cache
 from pathlib import Path
 import subprocess, os
 import shutil
+import tarfile
 from crawl4ai.config import MODEL_REPO_BRANCH
 import argparse
 import urllib.request
@@ -36,7 +37,6 @@ def calculate_batch_size(device):
     else:
         return 16 # Default batch size
 
-
 @lru_cache()
 def get_device():
     import torch
@@ -82,12 +82,19 @@ def load_bge_small_en_v1_5():
 @lru_cache()
 def load_onnx_all_MiniLM_l6_v2():
     from crawl4ai.onnx_embedding import DefaultEmbeddingModel
-    model_path = "models/onnx/model.onnx"
-    model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/model.onnx"
-    download_path = os.path.join(__location__, model_path)
+
+    model_path = "models/onnx.tar.gz"
+    model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/onnx.tar.gz"
+    __location__ = os.path.realpath(
+        os.path.join(os.getcwd(), os.path.dirname(__file__)))
+    download_path = os.path.join(__location__, model_path)
+    onnx_dir = os.path.join(__location__, "models/onnx")
+
+    # Create the models directory if it does not exist
+    os.makedirs(os.path.dirname(download_path), exist_ok=True)
+
+    # Download the tar.gz file if it does not exist
     if not os.path.exists(download_path):
-        # Define a download function with a simple progress display
         def download_with_progress(url, filename):
             def reporthook(block_num, block_size, total_size):
                 downloaded = block_num * block_size
@@ -95,12 +102,22 @@ def load_onnx_all_MiniLM_l6_v2():
                 if downloaded < total_size:
                     print(f"\rDownloading: {percentage:.2f}% ({downloaded / (1024 * 1024):.2f} MB of {total_size / (1024 * 1024):.2f} MB)", end='')
                 else:
-                    print("\rDownload complete! ")
+                    print("\rDownload complete!")
 
             urllib.request.urlretrieve(url, filename, reporthook)
 
         download_with_progress(model_url, download_path)
+
+    # Extract the tar.gz file if the onnx directory does not exist
+    if not os.path.exists(onnx_dir):
+        with tarfile.open(download_path, "r:gz") as tar:
+            tar.extractall(path=os.path.join(__location__, "models"))
+
+        # remove the tar.gz file
+        os.remove(download_path)
+
     model = DefaultEmbeddingModel()
     return model
@@ -240,8 +257,8 @@ def download_all_models(remove_existing=False):
     # load_bert_base_uncased()
     # print("[LOG] Downloading BGE Small EN v1.5...")
    # load_bge_small_en_v1_5()
-    print("[LOG] Downloading ONNX model...")
-    load_onnx_all_MiniLM_l6_v2()
+    # print("[LOG] Downloading ONNX model...")
+    # load_onnx_all_MiniLM_l6_v2()
     print("[LOG] Downloading text classifier...")
     _, device = load_text_multilabel_classifier()
     print(f"[LOG] Text classifier loaded on {device}")
@@ -164,6 +164,22 @@ def interactive_extraction(crawler):
     cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
     print_result(result)
 
+def multiple_scrip(crawler):
+    # Passing JavaScript code to interact with the page
+    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
+    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
+    js_code = ["""
+    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
+    loadMoreButton && loadMoreButton.click();
+    """] * 2
+    crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+    result = crawler.run(
+        url="https://www.nbcnews.com/business",
+    )
+    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
+    print_result(result)
+
 def main():
     cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
     cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
@@ -180,6 +196,7 @@ def main():
     add_llm_extraction_strategy(crawler)
     targeted_extraction(crawler)
     interactive_extraction(crawler)
+    multiple_scrip(crawler)
 
     cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")
main.py: 2 changes
@@ -66,7 +66,7 @@ async def read_index(request: Request):
 
     for filename in os.listdir(partials_dir):
         if filename.endswith(".html"):
-            with open(os.path.join(partials_dir, filename), "r") as file:
+            with open(os.path.join(partials_dir, filename), "r", encoding="utf8") as file:
                 partials[filename[:-5]] = file.read()
 
     return templates.TemplateResponse("index.html", {"request": request, **partials})
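Passing encoding="utf8" makes the partial-template reads independent of the platform default encoding (e.g. cp1252 on Windows), which would otherwise raise UnicodeDecodeError on any non-ASCII content in the partials.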
requirements.crawl.txt: 13 additions (new file)
@@ -0,0 +1,13 @@
+aiohttp
+aiosqlite
+bs4
+fastapi
+html2text
+httpx
+pydantic
+python-dotenv
+requests
+rich
+selenium
+uvicorn
+chromedriver-autoinstaller
@@ -1,20 +1,20 @@
-aiohttp==3.9.5
-aiosqlite==0.20.0
-bs4==0.0.2
-fastapi==0.111.0
-html2text==2024.2.26
-httpx==0.27.0
-litellm==1.37.11
-nltk==3.8.1
-pydantic==2.7.1
-python-dotenv==1.0.1
-requests==2.31.0
-rich==13.7.1
-scikit-learn==1.4.2
-selenium==4.20.0
-uvicorn==0.29.0
-transformers==4.40.2
-chromedriver-autoinstaller==0.6.4
-torch==2.3.0
-onnxruntime==1.14.1
-tokenizers==0.13.2
+aiohttp
+aiosqlite
+bs4
+fastapi
+html2text
+httpx
+litellm
+nltk
+pydantic
+python-dotenv
+requests
+rich
+scikit-learn
+selenium
+uvicorn
+transformers
+chromedriver-autoinstaller
+torch
+onnxruntime
+tokenizers
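Dropping the `==` pins lets pip resolve each dependency to the newest compatible release at install time; together with the new crawl-only requirements file above, this is presumably what resolves the Colab installation issues mentioned in the v0.2.2 README notes, at the cost of fully reproducible installs.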
setup.py: 7 changes
@@ -7,11 +7,16 @@ from setuptools.command.install import install
 with open("requirements.txt") as f:
     requirements = f.read().splitlines()
 
+# Read the crawl-only requirements from requirements.crawl.txt
+with open("requirements.crawl.txt") as f:
+    requirements_crawl_only = f.read().splitlines()
+
 # Define the requirements for different environments
 requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
 requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
 requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
 requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
+requirements_crawl_only = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
 
 class CustomInstallCommand(install):
     """Customized setuptools install command to install spacy without dependencies."""
@@ -34,7 +39,7 @@ setup(
     extras_require={
         "all": requirements, # Include all requirements
         "colab": requirements_without_torch, # Exclude torch for Colab
-        "crawl": requirements_without_torch_transformers_nlkt
+        "crawl": requirements_crawl_only, # Include only crawl requirements
     },
     cmdclass={
         'install': CustomInstallCommand,
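With requirements.crawl.txt wired into extras_require, the lighter dependency set becomes installable as an extra. Assuming the published PyPI name matches the repository (crawl4ai), that would look like `pip install crawl4ai[crawl]`, alongside the existing `crawl4ai[all]` and `crawl4ai[colab]` extras shown above. Note that the second `requirements_crawl_only` assignment in this hunk overwrites the list read from requirements.crawl.txt with a filtered copy of requirements.txt, so as committed the new file is read but not actually used.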