Enhancement: Replaced inline HTML tags with textual format for better LLM context handling #45

Enhancement: Replaced inline HTML tags with textual format for better LLM context handling #24
2024-06-17 15:14:56 +08:00 · 2024-06-17 15:14:34 +08:00
11 changed files with 52 additions and 192 deletions
--- a/.files/screenshot.png
+++ b/.files/screenshot.png
--- a/.gitignore
+++ b/.gitignore
@@ -179,6 +179,3 @@ docs/examples/.chainlit/
 docs/examples/.chainlit/*
 .chainlit/config.toml
 .chainlit/translations/en-US.json
-
-local/
-.files/
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog

+## [0.2.5] - 2024-06-17
+### Added
+- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with a textual format for better context handling by LLMs.
+
+
 ## [0.2.4] - 2024-06-17
 ### Fixed
 - Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
--- a/37
+++ b/37
@@ -1,37 +0,0 @@
-
-# First stage: Build and install dependencies
-FROM python:3.10-slim-bookworm as builder
-
-# Set the working directory in the container
-WORKDIR /usr/src/app
-
-# Install build dependencies
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    wget \
-    curl \
-    unzip 
-
-# Install Python dependencies
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt && \
-    pip install --no-cache-dir spacy
-
-# Copy the rest of the application code
-COPY . .
-
-# Set environment to use Chrome and ChromeDriver properly
-ENV CHROME_BIN=/usr/bin/google-chrome \
-    CHROMEDRIVER=/usr/local/bin/chromedriver \
-    DISPLAY=:99 \
-    DBUS_SESSION_BUS_ADDRESS=/dev/null \
-    PYTHONUNBUFFERED=1
-
-# Ensure the PATH environment variable includes the location of the installed packages
-ENV PATH /usr/local/bin:$PATH   
-
-# Make port 80 available to the world outside this container
-EXPOSE 80
-
-# Print helloworld when the container launches
-CMD ["echo", "Hello, World!"]
--- a/73
+++ b/73
@@ -1,73 +0,0 @@
-# First stage: Build and install dependencies
-FROM pytorch/pytorch:latest as builder
-
-# Set the working directory in the container
-WORKDIR /usr/src/app
-
-# Install build dependencies
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    wget \
-    git \
-    curl \
-    unzip \
-    gnupg \
-    xvfb \
-    ca-certificates \
-    apt-transport-https \
-    software-properties-common && \
-    rm -rf /var/lib/apt/lists/*    
-
-# Install Python dependencies
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt && \
-    pip install --no-cache-dir spacy onnxruntime && \
-    python -m spacy download en_core_web_sm
-
-# Install Google Chrome and ChromeDriver
-RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
-    sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
-    apt-get update && \
-    apt-get install -y google-chrome-stable && \
-    wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
-    unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
-
-# Second stage: Create the final image
-FROM pytorch/pytorch:latest
-
-# Set the working directory in the container
-WORKDIR /usr/src/app
-
-# Copy Chromedriver and Chrome from the builder stage
-COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
-COPY --from=builder /usr/bin/google-chrome /usr/bin/google-chrome
-
-# Copy installed Python packages from builder stage
-COPY --from=builder /opt/conda/lib/python3.10/site-packages /opt/conda/lib/python3.10/site-packages
-COPY --from=builder /opt/conda/bin /opt/conda/bin
-
-# Copy the rest of the application code
-COPY . .
-
-# Set environment to use Chrome and ChromeDriver properly
-ENV CHROME_BIN=/usr/bin/google-chrome \
-    CHROMEDRIVER=/usr/local/bin/chromedriver \
-    DISPLAY=:99 \
-    DBUS_SESSION_BUS_ADDRESS=/dev/null \
-    PYTHONUNBUFFERED=1
-
-#  pip install -e .[all]
-RUN pip install --no-cache-dir -e .[all]
-
-# Ensure the PATH environment variable includes the location of the installed packages
-ENV PATH /opt/conda/bin:$PATH   
-
-# Make port 80 available to the world outside this container
-EXPOSE 80
-
-# Download models call cli "crawl4ai-download-models"
-RUN crawl4ai-download-models
-# RUN python crawl4ai/model_loader.py
-
-# Run uvicorn
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
--- a/61
+++ b/61
@@ -1,61 +0,0 @@
-# First stage: Build and install dependencies
-FROM pytorch/pytorch:latest 
-
-# Set the working directory in the container
-WORKDIR /usr/src/app
-
-# Install build dependencies
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    wget \
-    git \
-    curl \
-    unzip \
-    gnupg \
-    xvfb \
-    ca-certificates \
-    apt-transport-https \
-    software-properties-common && \
-    rm -rf /var/lib/apt/lists/*    
-
-# Install Python dependencies
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt && \
-    pip install --no-cache-dir spacy onnxruntime && \
-    python -m spacy download en_core_web_sm
-
-# Install Google Chrome and ChromeDriver
-RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
-    sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
-    apt-get update && \
-    apt-get install -y google-chrome-stable && \
-    wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
-    unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
-
-# Copy the rest of the application code
-COPY . .
-
-# Set environment to use Chrome and ChromeDriver properly
-ENV CHROME_BIN=/usr/bin/google-chrome \
-    CHROMEDRIVER=/usr/local/bin/chromedriver \
-    DISPLAY=:99 \
-    DBUS_SESSION_BUS_ADDRESS=/dev/null \
-    PYTHONUNBUFFERED=1
-
-#  pip install -e .[all]
-RUN pip install --no-cache-dir -e .[all]
-
-# Ensure the PATH environment variable includes the location of the installed packages
-ENV PATH /opt/conda/bin:$PATH   
-
-# Make port 80 available to the world outside this container
-EXPOSE 80
-
-# Download models call cli "crawl4ai-download-models"
-RUN crawl4ai-download-models
-# RUN python crawl4ai/model_loader.py
-
-# Run uvicorn
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
-
-
--- a/README.md
+++ b/README.md
@@ -13,6 +13,9 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information

 ## Recent Changes 

+### v0.2.5
+- ✨ Maintain the semantic context of inline tags (e.g., ABBR, DEL, INS) for improved LLM-friendliness.
+
 ### v0.2.4
 - 🐞 Resolve the issue with the long url. (Issue #22)

@@ -188,7 +191,7 @@ pip install -e .[all]
 # docker build --platform linux/amd64 -t crawl4ai .
 # For other users
 # docker build -t crawl4ai .
-docker run -d -p 8000:80 --name crawl4ai_container_1 crawl4ai
+docker run -d -p 8000:80 crawl4ai
 ```


--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -53,6 +53,7 @@ def set_model_device(model):
    model.to(device)    
    return model, device

+@lru_cache()
 def get_home_folder():
    home_folder = os.path.join(Path.home(), ".crawl4ai")
    os.makedirs(home_folder, exist_ok=True)
@@ -201,7 +202,7 @@ def load_spacy_model():
        repo_folder = os.path.join(home_folder, "crawl4ai")
        model_folder = os.path.join(home_folder, name)

-        print("[LOG] ⏬ Downloading Spacy model for the first time...")
+        # print("[LOG] ⏬ Downloading Spacy model for the first time...")

        # Remove existing repo folder if it exists
        if Path(repo_folder).exists():
@@ -229,7 +230,7 @@ def load_spacy_model():
            shutil.rmtree(repo_folder)

            # Print completion message
-            print("[LOG] ✅ Spacy Model downloaded successfully")
+            # print("[LOG] ✅ Spacy Model downloaded successfully")
        except subprocess.CalledProcessError as e:
            print(f"An error occurred while cloning the repository: {e}")
        except Exception as e:
@@ -254,8 +255,8 @@ def download_all_models(remove_existing=False):
    # Load each model to trigger download
    # print("[LOG] Downloading BERT Base Uncased...")
    # load_bert_base_uncased()
-    print("[LOG] Downloading BGE Small EN v1.5...")
-    load_bge_small_en_v1_5()
+    # print("[LOG] Downloading BGE Small EN v1.5...")
+    # load_bge_small_en_v1_5()
    # print("[LOG] Downloading ONNX model...")
    # load_onnx_all_MiniLM_l6_v2()
    print("[LOG] Downloading text classifier...")
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -151,6 +151,38 @@ class CustomHTML2Text(HTML2Text):

        super().handle_tag(tag, attrs, start)

+def replace_inline_tags(soup, tags):
+    tag_replacements = {
+        'b': lambda tag: f"**{tag.text}**",
+        'i': lambda tag: f"*{tag.text}*",
+        'u': lambda tag: f"__{tag.text}__",
+        'span': lambda tag: f"{tag.text}",
+        'del': lambda tag: f"~~{tag.text}~~",
+        'ins': lambda tag: f"++{tag.text}++",
+        'sub': lambda tag: f"~{tag.text}~",
+        'sup': lambda tag: f"^^{tag.text}^^",
+        'strong': lambda tag: f"**{tag.text}**",
+        'em': lambda tag: f"*{tag.text}*",
+        'code': lambda tag: f"`{tag.text}`",
+        'kbd': lambda tag: f"`{tag.text}`",
+        'var': lambda tag: f"_{tag.text}_",
+        's': lambda tag: f"~~{tag.text}~~",
+        'q': lambda tag: f'"{tag.text}"',
+        'abbr': lambda tag: f"{tag.text} ({tag.get('title', '')})",
+        'cite': lambda tag: f"_{tag.text}_",
+        'dfn': lambda tag: f"_{tag.text}_",
+        'time': lambda tag: f"{tag.text}",
+        'small': lambda tag: f"<small>{tag.text}</small>",
+        'mark': lambda tag: f"=={tag.text}=="
+    }
+
+    for tag_name in tags:
+        for tag in soup.find_all(tag_name):
+            replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag)
+            tag.replace_with(replacement_text)
+
+    return soup
+
 def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
    try:
        if not html:
@@ -249,6 +281,9 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
        
        # Replace all "pre" tags with their inner text
        body = replace_pre_tags_with_text(body)
+        
+        # Replace inline tags with their textual (mostly Markdown-style) equivalents
+        body = replace_inline_tags(body, ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'])

        # Recursively remove empty elements, their parent elements, and elements with word count below threshold
        def remove_empty_and_low_word_count_elements(node, word_count_threshold):
--- a/pages/index.html
+++ b/pages/index.html
@@ -25,7 +25,7 @@
        <header class="bg-zinc-950 text-lime-500 py-4 flex">
            
            <div class="mx-auto px-4">
-                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.4</h1>
+                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.5</h1>
            </div>
            <div class="mx-auto px-4 flex font-bold text-xl gap-2">
                <span>📊 Total Website Processed</span>
--- a/setup.py
+++ b/setup.py
@@ -1,18 +1,8 @@
 from setuptools import setup, find_packages
-import os, sys
-from pathlib import Path
+import os
 import subprocess
 from setuptools.command.install import install

-def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
-    os.makedirs(home_folder, exist_ok=True)
-    os.makedirs(f"{home_folder}/cache", exist_ok=True)
-    os.makedirs(f"{home_folder}/models", exist_ok=True)
-    return home_folder 
-
-home_folder = get_home_folder()
-
 # Read the requirements from requirements.txt
 with open("requirements.txt") as f:
    requirements = f.read().splitlines()
@@ -36,7 +26,7 @@ class CustomInstallCommand(install):

 setup(
    name="Crawl4AI",
-    version="0.2.4",
+    version="0.2.5",
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
    long_description=open("README.md").read(),
    long_description_content_type="text/markdown",
Author	SHA1	Message	Date
unclecode	2f246d19f4	Enhancement: Replaced inline HTML tags with textual format for better LLM context handling #45	2024-06-17 15:14:56 +08:00
unclecode	413595542a	Enhancement: Replaced inline HTML tags with textual format for better LLM context handling #24	2024-06-17 15:14:34 +08:00