Compare commits
2 Commits
docker-tes
...
format-inl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2f246d19f4 | ||
|
|
413595542a |
BIN
.files/screenshot.png
Normal file
BIN
.files/screenshot.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.5 MiB |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -179,6 +179,3 @@ docs/examples/.chainlit/
|
||||
docs/examples/.chainlit/*
|
||||
.chainlit/config.toml
|
||||
.chainlit/translations/en-US.json
|
||||
|
||||
local/
|
||||
.files/
|
||||
@@ -1,5 +1,10 @@
|
||||
# Changelog
|
||||
|
||||
## [0.2.5] - 2024-06-17
|
||||
### Added
|
||||
- Enhancement (issue #24): Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual equivalents so that their context is preserved for LLMs.
|
||||
|
||||
|
||||
## [0.2.4] - 2024-06-17
|
||||
### Fixed
|
||||
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
|
||||
@@ -1,37 +0,0 @@
|
||||
|
||||
# First stage: Build and install dependencies
|
||||
FROM python:3.10-slim-bookworm as builder
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
curl \
|
||||
unzip
|
||||
|
||||
# Install Python dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt && \
|
||||
pip install --no-cache-dir spacy
|
||||
|
||||
# Copy the rest of the application code
|
||||
COPY . .
|
||||
|
||||
# Set environment to use Chrome and ChromeDriver properly
|
||||
ENV CHROME_BIN=/usr/bin/google-chrome \
|
||||
CHROMEDRIVER=/usr/local/bin/chromedriver \
|
||||
DISPLAY=:99 \
|
||||
DBUS_SESSION_BUS_ADDRESS=/dev/null \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
# Ensure the PATH environment variable includes the location of the installed packages
|
||||
ENV PATH /usr/local/bin:$PATH
|
||||
|
||||
# Make port 80 available to the world outside this container
|
||||
EXPOSE 80
|
||||
|
||||
# Print helloworld when the container launches
|
||||
CMD ["echo", "Hello, World!"]
|
||||
@@ -1,73 +0,0 @@
|
||||
# First stage: Build and install dependencies
|
||||
FROM pytorch/pytorch:latest as builder
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
git \
|
||||
curl \
|
||||
unzip \
|
||||
gnupg \
|
||||
xvfb \
|
||||
ca-certificates \
|
||||
apt-transport-https \
|
||||
software-properties-common && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Python dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt && \
|
||||
pip install --no-cache-dir spacy onnxruntime && \
|
||||
python -m spacy download en_core_web_sm
|
||||
|
||||
# Install Google Chrome and ChromeDriver
|
||||
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
||||
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
|
||||
apt-get update && \
|
||||
apt-get install -y google-chrome-stable && \
|
||||
wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
|
||||
unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
|
||||
|
||||
# Second stage: Create the final image
|
||||
FROM pytorch/pytorch:latest
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Copy Chromedriver and Chrome from the builder stage
|
||||
COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
|
||||
COPY --from=builder /usr/bin/google-chrome /usr/bin/google-chrome
|
||||
|
||||
# Copy installed Python packages from builder stage
|
||||
COPY --from=builder /opt/conda/lib/python3.10/site-packages /opt/conda/lib/python3.10/site-packages
|
||||
COPY --from=builder /opt/conda/bin /opt/conda/bin
|
||||
|
||||
# Copy the rest of the application code
|
||||
COPY . .
|
||||
|
||||
# Set environment to use Chrome and ChromeDriver properly
|
||||
ENV CHROME_BIN=/usr/bin/google-chrome \
|
||||
CHROMEDRIVER=/usr/local/bin/chromedriver \
|
||||
DISPLAY=:99 \
|
||||
DBUS_SESSION_BUS_ADDRESS=/dev/null \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
# pip install -e .[all]
|
||||
RUN pip install --no-cache-dir -e .[all]
|
||||
|
||||
# Ensure the PATH environment variable includes the location of the installed packages
|
||||
ENV PATH /opt/conda/bin:$PATH
|
||||
|
||||
# Make port 80 available to the world outside this container
|
||||
EXPOSE 80
|
||||
|
||||
# Download models by calling the "crawl4ai-download-models" CLI
|
||||
RUN crawl4ai-download-models
|
||||
# RUN python crawl4ai/model_loader.py
|
||||
|
||||
# Run uvicorn
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||
@@ -1,61 +0,0 @@
|
||||
# First stage: Build and install dependencies
|
||||
FROM pytorch/pytorch:latest
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
git \
|
||||
curl \
|
||||
unzip \
|
||||
gnupg \
|
||||
xvfb \
|
||||
ca-certificates \
|
||||
apt-transport-https \
|
||||
software-properties-common && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Python dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt && \
|
||||
pip install --no-cache-dir spacy onnxruntime && \
|
||||
python -m spacy download en_core_web_sm
|
||||
|
||||
# Install Google Chrome and ChromeDriver
|
||||
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
||||
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
|
||||
apt-get update && \
|
||||
apt-get install -y google-chrome-stable && \
|
||||
wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
|
||||
unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
|
||||
|
||||
# Copy the rest of the application code
|
||||
COPY . .
|
||||
|
||||
# Set environment to use Chrome and ChromeDriver properly
|
||||
ENV CHROME_BIN=/usr/bin/google-chrome \
|
||||
CHROMEDRIVER=/usr/local/bin/chromedriver \
|
||||
DISPLAY=:99 \
|
||||
DBUS_SESSION_BUS_ADDRESS=/dev/null \
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
# pip install -e .[all]
|
||||
RUN pip install --no-cache-dir -e .[all]
|
||||
|
||||
# Ensure the PATH environment variable includes the location of the installed packages
|
||||
ENV PATH /opt/conda/bin:$PATH
|
||||
|
||||
# Make port 80 available to the world outside this container
|
||||
EXPOSE 80
|
||||
|
||||
# Download models by calling the "crawl4ai-download-models" CLI
|
||||
RUN crawl4ai-download-models
|
||||
# RUN python crawl4ai/model_loader.py
|
||||
|
||||
# Run uvicorn
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||
|
||||
|
||||
@@ -13,6 +13,9 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
|
||||
|
||||
## Recent Changes
|
||||
|
||||
### v0.2.5
|
||||
- ✨ Maintain the semantic context of inline tags (e.g., ABBR, DEL, INS) for improved LLM-friendliness.
|
||||
|
||||
### v0.2.4
|
||||
- 🐞 Resolved the issue with long URLs (Issue #22).
|
||||
|
||||
@@ -188,7 +191,7 @@ pip install -e .[all]
|
||||
# docker build --platform linux/amd64 -t crawl4ai .
|
||||
# For other users
|
||||
# docker build -t crawl4ai .
|
||||
docker run -d -p 8000:80 --name crawl4ai_container_1 crawl4ai
|
||||
docker run -d -p 8000:80 crawl4ai
|
||||
```
|
||||
|
||||
|
||||
|
||||
@@ -53,6 +53,7 @@ def set_model_device(model):
|
||||
model.to(device)
|
||||
return model, device
|
||||
|
||||
@lru_cache()
|
||||
def get_home_folder():
|
||||
home_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(home_folder, exist_ok=True)
|
||||
@@ -201,7 +202,7 @@ def load_spacy_model():
|
||||
repo_folder = os.path.join(home_folder, "crawl4ai")
|
||||
model_folder = os.path.join(home_folder, name)
|
||||
|
||||
print("[LOG] ⏬ Downloading Spacy model for the first time...")
|
||||
# print("[LOG] ⏬ Downloading Spacy model for the first time...")
|
||||
|
||||
# Remove existing repo folder if it exists
|
||||
if Path(repo_folder).exists():
|
||||
@@ -229,7 +230,7 @@ def load_spacy_model():
|
||||
shutil.rmtree(repo_folder)
|
||||
|
||||
# Print completion message
|
||||
print("[LOG] ✅ Spacy Model downloaded successfully")
|
||||
# print("[LOG] ✅ Spacy Model downloaded successfully")
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"An error occurred while cloning the repository: {e}")
|
||||
except Exception as e:
|
||||
@@ -254,8 +255,8 @@ def download_all_models(remove_existing=False):
|
||||
# Load each model to trigger download
|
||||
# print("[LOG] Downloading BERT Base Uncased...")
|
||||
# load_bert_base_uncased()
|
||||
print("[LOG] Downloading BGE Small EN v1.5...")
|
||||
load_bge_small_en_v1_5()
|
||||
# print("[LOG] Downloading BGE Small EN v1.5...")
|
||||
# load_bge_small_en_v1_5()
|
||||
# print("[LOG] Downloading ONNX model...")
|
||||
# load_onnx_all_MiniLM_l6_v2()
|
||||
print("[LOG] Downloading text classifier...")
|
||||
|
||||
@@ -151,6 +151,38 @@ class CustomHTML2Text(HTML2Text):
|
||||
|
||||
super().handle_tag(tag, attrs, start)
|
||||
|
||||
def replace_inline_tags(soup, tags):
    """Replace inline HTML elements in ``soup`` with plain-text markers.

    Every element whose name appears in ``tags`` is swapped, in place, for a
    string built from its text content (for example ``<b>x</b>`` becomes
    ``**x**``). Tag names that have no dedicated marker are replaced by their
    bare text.

    Args:
        soup: Parsed document exposing ``find_all(name)``; the elements it
            yields must provide ``text``, ``get`` and ``replace_with``
            (presumably a BeautifulSoup tree -- confirm against the caller).
        tags: Iterable of tag names to process, in order.

    Returns:
        The same ``soup`` object that was passed in, after mutation.
    """
    # (prefix, suffix) placed around the element's text for each tag name.
    markers = {
        'b': ('**', '**'),
        'i': ('*', '*'),
        'u': ('__', '__'),
        'span': ('', ''),
        'del': ('~~', '~~'),
        'ins': ('++', '++'),
        'sub': ('~', '~'),
        'sup': ('^^', '^^'),
        'strong': ('**', '**'),
        'em': ('*', '*'),
        'code': ('`', '`'),
        'kbd': ('`', '`'),
        'var': ('_', '_'),
        's': ('~~', '~~'),
        'q': ('"', '"'),
        'cite': ('_', '_'),
        'dfn': ('_', '_'),
        'time': ('', ''),
        'small': ('<small>', '</small>'),
        'mark': ('==', '=='),
    }

    def render(tag_name, tag):
        """Return the textual replacement for a single element."""
        text = tag.text
        if tag_name == 'abbr':
            # Only append the expansion when the element actually carries a
            # title; otherwise the output would end in an empty "()".
            title = tag.get('title', '')
            return f"{text} ({title})" if title else text
        prefix, suffix = markers.get(tag_name, ('', ''))
        return f"{prefix}{text}{suffix}"

    for tag_name in tags:
        for tag in soup.find_all(tag_name):
            tag.replace_with(render(tag_name, tag))

    return soup
|
||||
|
||||
def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
|
||||
try:
|
||||
if not html:
|
||||
@@ -249,6 +281,9 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
|
||||
|
||||
# Replace all "pre" tags with their inner text
|
||||
body = replace_pre_tags_with_text(body)
|
||||
|
||||
# Replace inline tags with their text content
|
||||
body = replace_inline_tags(body, ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'])
|
||||
|
||||
# Recursively remove empty elements, their parent elements, and elements with word count below threshold
|
||||
def remove_empty_and_low_word_count_elements(node, word_count_threshold):
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
<header class="bg-zinc-950 text-lime-500 py-4 flex">
|
||||
|
||||
<div class="mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.4</h1>
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.5</h1>
|
||||
</div>
|
||||
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
|
||||
<span>📊 Total Website Processed</span>
|
||||
|
||||
14
setup.py
14
setup.py
@@ -1,18 +1,8 @@
|
||||
from setuptools import setup, find_packages
|
||||
import os, sys
|
||||
from pathlib import Path
|
||||
import os
|
||||
import subprocess
|
||||
from setuptools.command.install import install
|
||||
|
||||
def get_home_folder():
    """Create the ``.crawl4ai`` directory tree in the user's home and return its path.

    Ensures that ``~/.crawl4ai`` and its ``cache`` and ``models``
    sub-directories exist (already-existing directories are left alone).

    Returns:
        The path of the ``.crawl4ai`` directory as a string.
    """
    root = os.path.join(Path.home(), ".crawl4ai")
    # Root first, then the sub-directories that live underneath it.
    for directory in (root, f"{root}/cache", f"{root}/models"):
        os.makedirs(directory, exist_ok=True)
    return root
|
||||
|
||||
home_folder = get_home_folder()
|
||||
|
||||
# Read the requirements from requirements.txt
|
||||
with open("requirements.txt") as f:
|
||||
requirements = f.read().splitlines()
|
||||
@@ -36,7 +26,7 @@ class CustomInstallCommand(install):
|
||||
|
||||
setup(
|
||||
name="Crawl4AI",
|
||||
version="0.2.4",
|
||||
version="0.2.5",
|
||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||
long_description=open("README.md").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
|
||||
Reference in New Issue
Block a user