chore: Update configuration values for chunk token threshold, overlap rate, and minimum word threshold. Add a new example for LLMExtractionStrategy, and update the Dockerfile and README

2024-06-19 18:32:20 +08:00
parent 3f0e265baf
commit 539263a8ba
11 changed files with 212 additions and 130 deletions
--- a/65
+++ b/65
@@ -1,6 +1,5 @@
 # First stage: Build and install dependencies
-FROM python:3.10-slim-bookworm as builder
+FROM python:3.10-slim-bookworm
 # Set the working directory in the container
 WORKDIR /usr/src/app
@@ -9,51 +8,30 @@ WORKDIR /usr/src/app
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    wget \
    git \
    curl \
-    unzip 
+    unzip \
    gnupg \
    xvfb \
    ca-certificates \
    apt-transport-https \
    software-properties-common && \
    rm -rf /var/lib/apt/lists/*    
 # Install Python dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt && \
-    pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \
+    pip install --no-cache-dir spacy torch onnxruntime uvicorn && \
    python -m spacy download en_core_web_sm
    # pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \
-# Download and install ChromeDriver
+# Install Google Chrome and ChromeDriver
-RUN CHROMEDRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE) && \
+RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
-    wget -N https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip -P /tmp && \
+    sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
    unzip /tmp/chromedriver_linux64.zip -d /tmp && \
    mv /tmp/chromedriver /usr/local/bin/chromedriver && \
    chmod +x /usr/local/bin/chromedriver && \
    rm /tmp/chromedriver_linux64.zip
 # Second stage: Create final runtime image
 FROM python:3.10-slim-bookworm
 # Set the working directory in the container
 WORKDIR /usr/src/app
 # Install runtime dependencies
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    wget \
    git \
    xvfb \
    gnupg2 \
    ca-certificates \
    apt-transport-https \
    software-properties-common && \
    wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - && \
    echo "deb http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
    apt-get update && \
-    apt-get install -y --no-install-recommends google-chrome-stable && \
+    apt-get install -y google-chrome-stable && \
-    rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/google-chrome.list
+    wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
-
+    unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
 # Copy Chromedriver from the builder stage
 COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
 # Copy installed Python packages from builder stage
 COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
 COPY --from=builder /usr/local/bin /usr/local/bin
 # Copy the rest of the application code
 COPY . .
@@ -65,12 +43,19 @@ ENV CHROME_BIN=/usr/bin/google-chrome \
    DBUS_SESSION_BUS_ADDRESS=/dev/null \
    PYTHONUNBUFFERED=1
 #  pip install -e .[all]
 RUN pip install --no-cache-dir -e .[all]
 # Ensure the PATH environment variable includes the location of the installed packages
-ENV PATH /usr/local/bin:$PATH   
+ENV PATH /opt/conda/bin:$PATH   
 # Make port 80 available to the world outside this container
 EXPOSE 80
 # Download models by calling the CLI command "crawl4ai-download-models"
 RUN crawl4ai-download-models
 # RUN python crawl4ai/model_loader.py
 # Run uvicorn
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
--- a/45
+++ b/45
@@ -1,45 +0,0 @@
 # Use an official Python runtime as a parent image
 FROM python:3.10-slim
 # In case you had some weird issues, try this Image
 # FROM python:3.10-slim-bookworm as builder
 # Set the working directory in the container
 WORKDIR /usr/src/app
 # Copy the current directory contents into the container at /usr/src/app
 COPY . .
 # Install dependencies for Chrome and ChromeDriver
 RUN apt-get update && apt-get install -y --no-install-recommends \
    wget \
    xvfb \
    unzip \
    curl \
    gnupg2 \
    ca-certificates \
    apt-transport-https \
    software-properties-common \
    && mkdir -p /etc/apt/keyrings \
    && curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
    && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
    && apt-get update \
    && apt-get install -y google-chrome-stable \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get install -y chromium-chromedriver
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 RUN pip install spacy torch torchvision torchaudio
 # Set display port and dbus env to avoid hanging
 ENV DISPLAY=:99
 ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
 # Make port 80 available to the world outside this container
 EXPOSE 80
 # Define environment variable
 ENV PYTHONUNBUFFERED 1
 # Run uvicorn
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
  - 🟡 on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
 - 📄 Added an example in [`quickstart.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py) in the example folder under the docs.
 - ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness.
 - 🐳 Updated Dockerfile to ensure compatibility across multiple platforms (Hopefully!).
 ### v0.2.4
 - 🐞 Resolve the issue with the long url. (Issue #22)
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -21,7 +21,9 @@ PROVIDER_MODELS = {
 # Chunk token threshold
-CHUNK_TOKEN_THRESHOLD = 1000
+CHUNK_TOKEN_THRESHOLD = 500
 OVERLAP_RATE = 0.1
 WORD_TOKEN_RATE = 1.3
 # Threshold for the minimum number of words in an HTML tag for it to be considered
-MIN_WORD_THRESHOLD = 5
+MIN_WORD_THRESHOLD = 1
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -79,8 +79,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        self.options.headless = True
        if kwargs.get("user_agent"):
            self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
        else:
            # Set user agent
            user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
            self.options.add_argument(f"--user-agent={user_agent}")          
        self.options.add_argument("--no-sandbox")
-        self.options.add_argument("--headless")
+        self.options.headless = kwargs.get("headless", True)
        if self.options.headless:
            self.options.add_argument("--headless")
        # self.options.add_argument("--disable-dev-shm-usage")
        self.options.add_argument("--disable-gpu")
        # self.options.add_argument("--disable-extensions")
@@ -112,10 +119,19 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        # chromedriver_autoinstaller.install()
        import chromedriver_autoinstaller
-        self.service = Service(chromedriver_autoinstaller.install())
+        crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
        chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver(crawl4ai_folder, False)
        # self.service = Service(chromedriver_autoinstaller.install())
        self.service = Service(chromedriver_path)
        self.service.log_path = "NUL"
        self.driver = webdriver.Chrome(service=self.service, options=self.options)
        self.driver = self.execute_hook('on_driver_created', self.driver)
        if kwargs.get("cookies"):
            for cookie in kwargs.get("cookies"):
                self.driver.add_cookie(cookie)
    def set_hook(self, hook_type: str, hook: Callable):
        if hook_type in self.hooks:
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -3,12 +3,12 @@ from typing import Any, List, Dict, Optional, Union
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import json, time
 # from optimum.intel import IPEXModel
-from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
+from .prompts import *
 from .config import *
 from .utils import *
 from functools import partial
 from .model_loader import *
-
+import math
 import numpy as np
 class ExtractionStrategy(ABC):
@@ -55,7 +55,9 @@ class NoExtractionStrategy(ExtractionStrategy):
        return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
 class LLMExtractionStrategy(ExtractionStrategy):
-    def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, **kwargs):
+    def __init__(self, 
                 provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, 
                 instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs):
        """
        Initialize the strategy with clustering parameters.
@@ -67,6 +69,13 @@ class LLMExtractionStrategy(ExtractionStrategy):
        self.provider = provider
        self.api_token = api_token or PROVIDER_MODELS.get(provider, None) or os.getenv("OPENAI_API_KEY")
        self.instruction = instruction
        self.extract_type = extraction_type
        self.schema = schema
        self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
        self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
        self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
        self.verbose = kwargs.get("verbose", False)
        if not self.api_token:
@@ -81,10 +90,15 @@ class LLMExtractionStrategy(ExtractionStrategy):
            "HTML": escape_json_string(sanitize_html(html)),
        }
        prompt_with_variables = PROMPT_EXTRACT_BLOCKS
        if self.instruction:
            variable_values["REQUEST"] = self.instruction
            prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
        if self.extract_type == "schema":
            variable_values["SCHEMA"] = json.dumps(self.schema)
            prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
        prompt_with_variables = PROMPT_EXTRACT_BLOCKS if not self.instruction else PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
        for variable in variable_values:
            prompt_with_variables = prompt_with_variables.replace(
                "{" + variable + "}", variable_values[variable]
@@ -112,32 +126,62 @@ class LLMExtractionStrategy(ExtractionStrategy):
            print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
        return blocks
-    def _merge(self, documents):
+    def _merge(self, documents, chunk_token_threshold, overlap):
        chunks = []
        sections = []
        total_tokens = 0
        # Calculate the total tokens across all documents
        for document in documents:
            total_tokens += len(document.split(' ')) * self.word_token_rate
        # Calculate the number of sections needed
        num_sections = math.floor(total_tokens / chunk_token_threshold)
        if num_sections < 1:
            num_sections = 1  # Ensure there is at least one section
        adjusted_chunk_threshold = total_tokens / num_sections
        total_token_so_far = 0
        current_chunk = []
        for document in documents:
-            if total_token_so_far < CHUNK_TOKEN_THRESHOLD:
+            tokens = document.split(' ')
-                chunk = document.split(' ')
+            token_count = len(tokens) * self.word_token_rate
                total_token_so_far += len(chunk) * 1.3
                chunks.append(document)
            else:
                sections.append('\n\n'.join(chunks))
                chunks = [document]
                total_token_so_far = len(document.split(' ')) * 1.3 
        if chunks:
            sections.append('\n\n'.join(chunks))
-        return sections       
+            if total_token_so_far + token_count <= adjusted_chunk_threshold:
                current_chunk.extend(tokens)
                total_token_so_far += token_count
            else:
                # Ensure to handle the last section properly
                if len(sections) == num_sections - 1:
                    current_chunk.extend(tokens)
                    continue
                # Add overlap if specified
                if overlap > 0 and current_chunk:
                    overlap_tokens = current_chunk[-overlap:]
                    current_chunk.extend(overlap_tokens)
                sections.append(' '.join(current_chunk))
                current_chunk = tokens
                total_token_so_far = token_count
        # Add the last chunk
        if current_chunk:
            sections.append(' '.join(current_chunk))
        return sections
    def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
        """
        Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
        """
-        merged_sections = self._merge(sections)
+        merged_sections = self._merge(
            sections, self.chunk_token_threshold,
            overlap= int(self.chunk_token_threshold * self.overlap_rate)
        )
        extracted_content = []
        if self.provider.startswith("groq/"):
            # Sequential processing with a delay
--- a/crawl4ai/prompts.py
+++ b/crawl4ai/prompts.py
@@ -164,4 +164,35 @@ Please provide your output within <blocks> tags, like this:
 **Make sure to follow the user instruction to extract blocks aligin with the instruction.**
-Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
+Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
 PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION = """Here is the content from the URL:
 <url>{URL}</url>
 <url_content>
 {HTML}
 </url_content>
 The user has made the following request for what information to extract from the above content:
 <user_request>
 {REQUEST}
 </user_request>
 <schema_block>
 {SCHEMA}
 </schema_block>
 Please carefully read the URL content and the user's request. If the user provided a desired JSON schema in the <schema_block> above, extract the requested information from the URL content according to that schema. If no schema was provided, infer an appropriate JSON schema based on the user's request that will best capture the key information they are looking for.
 Extraction instructions:
 Return the extracted information as a list of JSON objects, with each object in the list corresponding to a block of content from the URL, in the same order as it appears on the page. Wrap the entire JSON list in <blocks> tags.
 Quality Reflection:
 Before outputting your final answer, double check that the JSON you are returning is complete, containing all the information requested by the user, and is valid JSON that could be parsed by json.loads() with no errors or omissions. The outputted JSON objects should fully match the schema, either provided or inferred.
 Quality Score:
 After reflecting, score the quality and completeness of the JSON data you are about to return on a scale of 1 to 5. Write the score inside <score> tags.
 Result
 Output the final list of JSON objects, wrapped in <blocks> tags."""
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -42,7 +42,7 @@ class WebCrawler:
    def warmup(self):
        print("[LOG] 🌤️  Warming up the WebCrawler")
        result = self.run(
-            url='https://crawl4ai.uccode.io/',
+            url='https://google.com/',
            word_count_threshold=5,
            extraction_strategy= NoExtractionStrategy(),
            bypass_cache=False,
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -0,0 +1,40 @@
 import os
 import time
 from crawl4ai.web_crawler import WebCrawler
 from crawl4ai.chunking_strategy import *
 from crawl4ai.extraction_strategy import *
 from crawl4ai.crawler_strategy import *
 url = r'https://openai.com/api/pricing/'
 crawler = WebCrawler()
 crawler.warmup()
 from pydantic import BaseModel, Field
 class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
 result = crawler.run(
    url=url,
    word_count_threshold=1,
    extraction_strategy= LLMExtractionStrategy(
        provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), 
        schema=OpenAIModelFee.model_json_schema(),
        extraction_type="schema",
        instruction="From the crawled content, extract all mentioned model names along with their "\
            "fees for input and output tokens. Make sure not to miss anything in the entire content. "\
            'One extracted model JSON format should look like this: '\
            '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
    ),
    bypass_cache=True,
 )
 model_fees = json.loads(result.extracted_content)
 print(len(model_fees))
 with open(".data/data.json", "w") as f:
    f.write(result.extracted_content)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,21 +1,22 @@
-aiohttp
+numpy==1.25.0
-aiosqlite
+aiohttp==3.9.5
-bs4
+aiosqlite==0.20.0
-fastapi
+beautifulsoup4==4.12.3
-html2text
+fastapi==0.111.0
-httpx
+html2text==2024.2.26
-litellm
+httpx==0.27.0
-nltk
+litellm==1.40.17
-pydantic
+nltk==3.8.1
-python-dotenv
+pydantic==2.7.4
-requests
+python-dotenv==1.0.1
-rich
+requests==2.32.3
-scikit-learn
+rich==13.7.1
-selenium
+scikit-learn==1.5.0
-uvicorn
+selenium==4.21.0
-transformers
+uvicorn==0.30.1
-chromedriver-autoinstaller
+transformers==4.41.2
-torch
+chromedriver-autoinstaller==0.6.4
-onnxruntime
+torch==2.3.1
-tokenizers
+onnxruntime==1.18.0
-pillow
+tokenizers==0.19.1
 pillow==10.3.0
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,15 @@
 from setuptools import setup, find_packages
 import os
 import sys
 from pathlib import Path
 import subprocess
 from setuptools.command.install import install
 # Create the .crawl4ai folder in the user's home directory if it doesn't exist
 crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
 os.makedirs(crawl4ai_folder, exist_ok=True)
 os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True)
 # Read the requirements from requirements.txt
 with open("requirements.txt") as f:
    requirements = f.read().splitlines()