diff --git a/.gitignore b/.gitignore index 9aac8182..846ac59a 100644 --- a/.gitignore +++ b/.gitignore @@ -173,4 +173,5 @@ Crawl4AI.egg-info/ requirements0.txt a.txt -*.sh \ No newline at end of file +*.sh +.idea \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 2c1e7927..264d4159 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,43 +1,77 @@ -# Use an official Python runtime as a parent image -FROM python:3.10-slim + +# First stage: Build and install dependencies +FROM python:3.10-slim-bookworm as builder # Set the working directory in the container WORKDIR /usr/src/app -# Copy the current directory contents into the container at /usr/src/app -COPY . . - -# Install dependencies for Chrome and ChromeDriver -RUN apt-get update && apt-get install -y --no-install-recommends \ +# Install build dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ wget \ - xvfb \ - unzip \ curl \ + unzip + +# Install Python dependencies +COPY requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt && \ + pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \ + python -m spacy download en_core_web_sm + +# Download and install ChromeDriver +RUN CHROMEDRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE) && \ + wget -N https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip -P /tmp && \ + unzip /tmp/chromedriver_linux64.zip -d /tmp && \ + mv /tmp/chromedriver /usr/local/bin/chromedriver && \ + chmod +x /usr/local/bin/chromedriver && \ + rm /tmp/chromedriver_linux64.zip + +# Second stage: Create final runtime image +FROM python:3.10-slim-bookworm + +# Set the working directory in the container +WORKDIR /usr/src/app + +# Install runtime dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + wget \ + git \ + xvfb \ gnupg2 \ ca-certificates \ apt-transport-https \ - software-properties-common \ - && mkdir -p /etc/apt/keyrings \ - && curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \ - && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \ - && apt-get update \ - && apt-get install -y google-chrome-stable \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get install -y chromium-chromedriver + software-properties-common && \ + wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - && \ + echo "deb http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \ + apt-get update && \ + apt-get install -y --no-install-recommends google-chrome-stable && \ + rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/google-chrome.list -# Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt -RUN pip install 
spacy torch torchvision torchaudio +# Copy Chromedriver from the builder stage +COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver -# Set display port and dbus env to avoid hanging -ENV DISPLAY=:99 -ENV DBUS_SESSION_BUS_ADDRESS=/dev/null +# Copy installed Python packages from builder stage +COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages +COPY --from=builder /usr/local/bin /usr/local/bin + +# Copy the rest of the application code +COPY . . + +# Set environment to use Chrome and ChromeDriver properly +ENV CHROME_BIN=/usr/bin/google-chrome \ + CHROMEDRIVER=/usr/local/bin/chromedriver \ + DISPLAY=:99 \ + DBUS_SESSION_BUS_ADDRESS=/dev/null \ + PYTHONUNBUFFERED=1 + +# Ensure the PATH environment variable includes the location of the installed packages +ENV PATH /usr/local/bin:$PATH # Make port 80 available to the world outside this container EXPOSE 80 -# Define environment variable -ENV PYTHONUNBUFFERED 1 - # Run uvicorn CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"] + + diff --git a/Dockerfile-version-0 b/Dockerfile-version-0 new file mode 100644 index 00000000..4c86b882 --- /dev/null +++ b/Dockerfile-version-0 @@ -0,0 +1,45 @@ +# Use an official Python runtime as a parent image +FROM python:3.10-slim +# In case you had some weird issues, try this Image +# FROM python:3.10-slim-bookworm as builder + +# Set the working directory in the container +WORKDIR /usr/src/app + +# Copy the current directory contents into the container at /usr/src/app +COPY . . 
+ +# Install dependencies for Chrome and ChromeDriver +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + xvfb \ + unzip \ + curl \ + gnupg2 \ + ca-certificates \ + apt-transport-https \ + software-properties-common \ + && mkdir -p /etc/apt/keyrings \ + && curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \ + && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \ + && apt-get update \ + && apt-get install -y google-chrome-stable \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get install -y chromium-chromedriver + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt +RUN pip install spacy torch torchvision torchaudio + +# Set display port and dbus env to avoid hanging +ENV DISPLAY=:99 +ENV DBUS_SESSION_BUS_ADDRESS=/dev/null + +# Make port 80 available to the world outside this container +EXPOSE 80 + +# Define environment variable +ENV PYTHONUNBUFFERED 1 + +# Run uvicorn +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"] diff --git a/README.md b/README.md index 6871adf6..0ec7b773 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.0 πŸ•·οΈπŸ€– +# Crawl4AI v0.2.2 πŸ•·οΈπŸ€– [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) @@ -10,8 +10,14 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk) -## Recent Changes v0.2.0 +## Recent Changes +### 
v0.2.2 +- Support multiple JS scripts +- Fixed some bugs +- Resolved a few issues related to Colab installation + +### v0.2.0 - πŸš€ 10x faster!! - πŸ“œ Execute custom JavaScript before crawling! - 🀝 Colab friendly! @@ -30,8 +36,6 @@ from crawl4ai import WebCrawler # Create the WebCrawler instance crawler = WebCrawler() - - # Run the crawler with keyword filtering and CSS selector result = crawler.run(url="https://www.nbcnews.com/business") print(result) # {url, html, markdown, extracted_content, metadata} diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index a98402bc..60d5c54f 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -103,12 +103,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): ) # Execute JS code if provided - if self.js_code: + if self.js_code and type(self.js_code) == str: self.driver.execute_script(self.js_code) # Optionally, wait for some condition after executing the JS code WebDriverWait(self.driver, 10).until( lambda driver: driver.execute_script("return document.readyState") == "complete" ) + elif self.js_code and type(self.js_code) == list: + for js in self.js_code: + self.driver.execute_script(js) + WebDriverWait(self.driver, 10).until( + lambda driver: driver.execute_script("return document.readyState") == "complete" + ) html = self.driver.page_source diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 2d164ff0..a24b5fe5 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -188,14 +188,15 @@ class CosineStrategy(ExtractionStrategy): if self.verbose: print(f"[LOG] Loading Extraction Model for {self.device.type} device.") - if False and self.device.type == "cpu": - self.model = load_onnx_all_MiniLM_l6_v2() - self.tokenizer = self.model.tokenizer - self.get_embedding_method = "direct" - else: - self.tokenizer, self.model = load_bge_small_en_v1_5() - self.model.eval() - self.get_embedding_method = "batch" + # if 
False and self.device.type == "cpu": + # self.model = load_onnx_all_MiniLM_l6_v2() + # self.tokenizer = self.model.tokenizer + # self.get_embedding_method = "direct" + # else: + + self.tokenizer, self.model = load_bge_small_en_v1_5() + self.model.eval() + self.get_embedding_method = "batch" self.buffer_embeddings = np.array([]) diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py index 832b4240..7e17f7f9 100644 --- a/crawl4ai/model_loader.py +++ b/crawl4ai/model_loader.py @@ -2,6 +2,7 @@ from functools import lru_cache from pathlib import Path import subprocess, os import shutil +import tarfile from crawl4ai.config import MODEL_REPO_BRANCH import argparse import urllib.request @@ -34,8 +35,7 @@ def calculate_batch_size(device): else: return 32 else: - return 16 # Default batch size - + return 16 # Default batch size @lru_cache() def get_device(): @@ -82,12 +82,19 @@ def load_bge_small_en_v1_5(): @lru_cache() def load_onnx_all_MiniLM_l6_v2(): from crawl4ai.onnx_embedding import DefaultEmbeddingModel - model_path = "models/onnx/model.onnx" - model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/model.onnx" - download_path = os.path.join(__location__, model_path) + model_path = "models/onnx.tar.gz" + model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/onnx.tar.gz" + __location__ = os.path.realpath( + os.path.join(os.getcwd(), os.path.dirname(__file__))) + download_path = os.path.join(__location__, model_path) + onnx_dir = os.path.join(__location__, "models/onnx") + + # Create the models directory if it does not exist + os.makedirs(os.path.dirname(download_path), exist_ok=True) + + # Download the tar.gz file if it does not exist if not os.path.exists(download_path): - # Define a download function with a simple progress display def download_with_progress(url, filename): def reporthook(block_num, block_size, total_size): downloaded = block_num * block_size @@ -95,12 +102,22 @@ def load_onnx_all_MiniLM_l6_v2(): if downloaded < total_size: 
print(f"\rDownloading: {percentage:.2f}% ({downloaded / (1024 * 1024):.2f} MB of {total_size / (1024 * 1024):.2f} MB)", end='') else: - print("\rDownload complete! ") + print("\rDownload complete!") urllib.request.urlretrieve(url, filename, reporthook) download_with_progress(model_url, download_path) + # Extract the tar.gz file if the onnx directory does not exist + if not os.path.exists(onnx_dir): + with tarfile.open(download_path, "r:gz") as tar: + tar.extractall(path=os.path.join(__location__, "models")) + + # remove the tar.gz file + os.remove(download_path) + + + model = DefaultEmbeddingModel() return model @@ -240,8 +257,8 @@ def download_all_models(remove_existing=False): # load_bert_base_uncased() # print("[LOG] Downloading BGE Small EN v1.5...") # load_bge_small_en_v1_5() - print("[LOG] Downloading ONNX model...") - load_onnx_all_MiniLM_l6_v2() + # print("[LOG] Downloading ONNX model...") + # load_onnx_all_MiniLM_l6_v2() print("[LOG] Downloading text classifier...") _, device = load_text_multilabel_classifier() print(f"[LOG] Text classifier loaded on {device}") diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index bfa1dd14..6046c9bb 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -164,6 +164,22 @@ def interactive_extraction(crawler): cprint("[LOG] πŸ“¦ [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") print_result(result) +def multiple_scrip(crawler): + # Passing JavaScript code to interact with the page + cprint("\nπŸ–±οΈ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True) + cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.") + js_code = [""" + const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); + loadMoreButton && loadMoreButton.click(); + """] * 2 + crawler_strategy = 
LocalSeleniumCrawlerStrategy(js_code=js_code) + crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) + result = crawler.run( + url="https://www.nbcnews.com/business", + ) + cprint("[LOG] πŸ“¦ [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") + print_result(result) + def main(): cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]") cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]") @@ -180,6 +196,7 @@ def main(): add_llm_extraction_strategy(crawler) targeted_extraction(crawler) interactive_extraction(crawler) + multiple_scrip(crawler) cprint("\nπŸŽ‰ [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! πŸ•ΈοΈ[/bold green]") diff --git a/main.py b/main.py index 604fff3c..5dca8771 100644 --- a/main.py +++ b/main.py @@ -66,7 +66,7 @@ async def read_index(request: Request): for filename in os.listdir(partials_dir): if filename.endswith(".html"): - with open(os.path.join(partials_dir, filename), "r") as file: + with open(os.path.join(partials_dir, filename), "r", encoding="utf8") as file: partials[filename[:-5]] = file.read() return templates.TemplateResponse("index.html", {"request": request, **partials}) diff --git a/requirements.crawl.txt b/requirements.crawl.txt new file mode 100644 index 00000000..d72800cf --- /dev/null +++ b/requirements.crawl.txt @@ -0,0 +1,13 @@ +aiohttp +aiosqlite +bs4 +fastapi +html2text +httpx +pydantic +python-dotenv +requests +rich +selenium +uvicorn +chromedriver-autoinstaller diff --git a/requirements.txt b/requirements.txt index 1b10fc48..f4fdce65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,20 @@ -aiohttp==3.9.5 -aiosqlite==0.20.0 -bs4==0.0.2 -fastapi==0.111.0 -html2text==2024.2.26 -httpx==0.27.0 -litellm==1.37.11 -nltk==3.8.1 -pydantic==2.7.1 -python-dotenv==1.0.1 
-requests==2.31.0 -rich==13.7.1 -scikit-learn==1.4.2 -selenium==4.20.0 -uvicorn==0.29.0 -transformers==4.40.2 -chromedriver-autoinstaller==0.6.4 -torch==2.3.0 -onnxruntime==1.14.1 -tokenizers==0.13.2 \ No newline at end of file +aiohttp +aiosqlite +bs4 +fastapi +html2text +httpx +litellm +nltk +pydantic +python-dotenv +requests +rich +scikit-learn +selenium +uvicorn +transformers +chromedriver-autoinstaller +torch +onnxruntime +tokenizers diff --git a/setup.py b/setup.py index 9217a32f..8f490469 100644 --- a/setup.py +++ b/setup.py @@ -7,11 +7,16 @@ from setuptools.command.install import install with open("requirements.txt") as f: requirements = f.read().splitlines() +# Read the requirements from requirements.txt +with open("requirements.crawl.txt") as f: + requirements_crawl_only = f.read().splitlines() + # Define the requirements for different environments requirements_without_torch = [req for req in requirements if not req.startswith("torch")] requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")] requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")] requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")] +requirements_crawl_only = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")] class CustomInstallCommand(install): """Customized setuptools install command to install spacy without dependencies.""" @@ -34,7 +39,7 @@ setup( extras_require={ "all": requirements, # Include all requirements "colab": requirements_without_torch, # Exclude torch for Colab - "crawl": requirements_without_torch_transformers_nlkt + "crawl": requirements_crawl_only, # Include only crawl requirements }, cmdclass={ 'install': CustomInstallCommand,