Compare commits


1 Commit

Author: Unclecode
SHA1: 6ddccc144c
Message: chore: Bump version to 0.2.2 in setup.py
Date: 2024-05-19 16:19:40 +00:00
39 changed files with 237 additions and 1921 deletions

.gitignore vendored

@@ -173,12 +173,4 @@ Crawl4AI.egg-info/
requirements0.txt
a.txt
*.sh
.idea
docs/examples/.chainlit/
docs/examples/.chainlit/*
.chainlit/config.toml
.chainlit/translations/en-US.json
local/
.files/
*.sh


@@ -1,5 +1 @@
# Changelog
## [0.2.4] - 2024-06-17
### Fixed
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
# Changelog
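As a rough illustration of the fix noted in the changelog above (a sketch, not the project's exact code), hashing the URL gives a fixed-length cache file name so very long URLs no longer exceed filesystem name limits:

```python
import hashlib
import os
from pathlib import Path

def cache_path_for(url: str) -> str:
    # MD5 of the URL yields a fixed-length name regardless of URL length.
    url_hash = hashlib.md5(url.encode()).hexdigest()
    cache_dir = os.path.join(Path.home(), ".crawl4ai", "cache")
    os.makedirs(cache_dir, exist_ok=True)
    return os.path.join(cache_dir, url_hash)
```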


@@ -1,77 +1,43 @@
# First stage: Build and install dependencies
FROM python:3.10-slim-bookworm as builder
# Use an official Python runtime as a parent image
FROM python:3.10-slim
# Set the working directory in the container
WORKDIR /usr/src/app
# Install build dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
# Copy the current directory contents into the container at /usr/src/app
COPY . .
# Install dependencies for Chrome and ChromeDriver
RUN apt-get update && apt-get install -y --no-install-recommends \
wget \
curl \
unzip
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \
python -m spacy download en_core_web_sm
# Download and install ChromeDriver
RUN CHROMEDRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE) && \
wget -N https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip -P /tmp && \
unzip /tmp/chromedriver_linux64.zip -d /tmp && \
mv /tmp/chromedriver /usr/local/bin/chromedriver && \
chmod +x /usr/local/bin/chromedriver && \
rm /tmp/chromedriver_linux64.zip
# Second stage: Create final runtime image
FROM python:3.10-slim-bookworm
# Set the working directory in the container
WORKDIR /usr/src/app
# Install runtime dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
git \
xvfb \
unzip \
curl \
gnupg2 \
ca-certificates \
apt-transport-https \
software-properties-common && \
wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - && \
echo "deb http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
apt-get update && \
apt-get install -y --no-install-recommends google-chrome-stable && \
rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/google-chrome.list
software-properties-common \
&& mkdir -p /etc/apt/keyrings \
&& curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
&& echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
&& apt-get update \
&& apt-get install -y google-chrome-stable \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get install -y chromium-chromedriver
# Copy Chromedriver from the builder stage
COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install spacy torch torchvision torchaudio
# Copy installed Python packages from builder stage
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
# Copy the rest of the application code
COPY . .
# Set environment to use Chrome and ChromeDriver properly
ENV CHROME_BIN=/usr/bin/google-chrome \
CHROMEDRIVER=/usr/local/bin/chromedriver \
DISPLAY=:99 \
DBUS_SESSION_BUS_ADDRESS=/dev/null \
PYTHONUNBUFFERED=1
# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH /usr/local/bin:$PATH
# Set display port and dbus env to avoid hanging
ENV DISPLAY=:99
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
# Make port 80 available to the world outside this container
EXPOSE 80
# Define environment variable
ENV PYTHONUNBUFFERED 1
# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]


@@ -1,45 +0,0 @@
# Use an official Python runtime as a parent image
FROM python:3.10-slim
# In case you had some weird issues, try this Image
# FROM python:3.10-slim-bookworm as builder
# Set the working directory in the container
WORKDIR /usr/src/app
# Copy the current directory contents into the container at /usr/src/app
COPY . .
# Install dependencies for Chrome and ChromeDriver
RUN apt-get update && apt-get install -y --no-install-recommends \
wget \
xvfb \
unzip \
curl \
gnupg2 \
ca-certificates \
apt-transport-https \
software-properties-common \
&& mkdir -p /etc/apt/keyrings \
&& curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
&& echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
&& apt-get update \
&& apt-get install -y google-chrome-stable \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get install -y chromium-chromedriver
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install spacy torch torchvision torchaudio
# Set display port and dbus env to avoid hanging
ENV DISPLAY=:99
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
# Make port 80 available to the world outside this container
EXPOSE 80
# Define environment variable
ENV PYTHONUNBUFFERED 1
# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]


@@ -1,37 +0,0 @@
# First stage: Build and install dependencies
FROM python:3.10-slim-bookworm as builder
# Set the working directory in the container
WORKDIR /usr/src/app
# Install build dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
curl \
unzip
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
pip install --no-cache-dir spacy
# Copy the rest of the application code
COPY . .
# Set environment to use Chrome and ChromeDriver properly
ENV CHROME_BIN=/usr/bin/google-chrome \
CHROMEDRIVER=/usr/local/bin/chromedriver \
DISPLAY=:99 \
DBUS_SESSION_BUS_ADDRESS=/dev/null \
PYTHONUNBUFFERED=1
# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH /usr/local/bin:$PATH
# Make port 80 available to the world outside this container
EXPOSE 80
# Print helloworld when the container launches
CMD ["echo", "Hello, World!"]


@@ -1,73 +0,0 @@
# First stage: Build and install dependencies
FROM pytorch/pytorch:latest as builder
# Set the working directory in the container
WORKDIR /usr/src/app
# Install build dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
git \
curl \
unzip \
gnupg \
xvfb \
ca-certificates \
apt-transport-https \
software-properties-common && \
rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
pip install --no-cache-dir spacy onnxruntime && \
python -m spacy download en_core_web_sm
# Install Google Chrome and ChromeDriver
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
apt-get update && \
apt-get install -y google-chrome-stable && \
wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
# Second stage: Create the final image
FROM pytorch/pytorch:latest
# Set the working directory in the container
WORKDIR /usr/src/app
# Copy Chromedriver and Chrome from the builder stage
COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
COPY --from=builder /usr/bin/google-chrome /usr/bin/google-chrome
# Copy installed Python packages from builder stage
COPY --from=builder /opt/conda/lib/python3.10/site-packages /opt/conda/lib/python3.10/site-packages
COPY --from=builder /opt/conda/bin /opt/conda/bin
# Copy the rest of the application code
COPY . .
# Set environment to use Chrome and ChromeDriver properly
ENV CHROME_BIN=/usr/bin/google-chrome \
CHROMEDRIVER=/usr/local/bin/chromedriver \
DISPLAY=:99 \
DBUS_SESSION_BUS_ADDRESS=/dev/null \
PYTHONUNBUFFERED=1
# pip install -e .[all]
RUN pip install --no-cache-dir -e .[all]
# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH /opt/conda/bin:$PATH
# Make port 80 available to the world outside this container
EXPOSE 80
# Download models call cli "crawl4ai-download-models"
RUN crawl4ai-download-models
# RUN python crawl4ai/model_loader.py
# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]


@@ -1,61 +0,0 @@
# First stage: Build and install dependencies
FROM pytorch/pytorch:latest
# Set the working directory in the container
WORKDIR /usr/src/app
# Install build dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
git \
curl \
unzip \
gnupg \
xvfb \
ca-certificates \
apt-transport-https \
software-properties-common && \
rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && \
pip install --no-cache-dir spacy onnxruntime && \
python -m spacy download en_core_web_sm
# Install Google Chrome and ChromeDriver
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
apt-get update && \
apt-get install -y google-chrome-stable && \
wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
# Copy the rest of the application code
COPY . .
# Set environment to use Chrome and ChromeDriver properly
ENV CHROME_BIN=/usr/bin/google-chrome \
CHROMEDRIVER=/usr/local/bin/chromedriver \
DISPLAY=:99 \
DBUS_SESSION_BUS_ADDRESS=/dev/null \
PYTHONUNBUFFERED=1
# pip install -e .[all]
RUN pip install --no-cache-dir -e .[all]
# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH /opt/conda/bin:$PATH
# Make port 80 available to the world outside this container
EXPOSE 80
# Download models call cli "crawl4ai-download-models"
RUN crawl4ai-download-models
# RUN python crawl4ai/model_loader.py
# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]

README.md

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.3 🕷️🤖
# Crawl4AI v0.2.0 🕷️🤖
[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
@@ -8,27 +8,10 @@
Crawl4AI has one clear task: to simplify crawling and extract useful information from web pages, making it accessible for large language models (LLMs) and AI applications. 🆓🌐
- Use as REST API: Check [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
- Use as Python library: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
## Recent Changes
## Recent Changes v0.2.0
### v0.2.4
- 🐞 Resolve the issue with the long url. (Issue #22)
### v0.2.3
- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
- 🔗 Extract all external and internal links. Check `result.links`
- 📚 Extract metadata from the page. Check `result.metadata`
- 🕵️ Support `user_agent` parameter to set the user agent for the HTTP requests.
- 🖼️ Take [screenshots](#taking-screenshots) of the page.
### v0.2.2
- Support multiple JS scripts
- Fixed some bugs
- Resolved a few issues related to Colab installation
### v0.2.0
- 🚀 10x faster!!
- 📜 Execute custom JavaScript before crawling!
- 🤝 Colab friendly!
@@ -39,27 +22,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
## Power and Simplicity of Crawl4AI 🚀
The easiest way! If you don't want to install any library, you can use the REST API on my server. Keep in mind that this is just a simple server; I may improve its capacity if there is demand. You can find all REST API examples in this Colab notebook. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
```python
import requests
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"screenshot": True
}
response = requests.post("https://crawl4ai.com/crawl", json=data) # or use localhost if you run it locally
response_data = response.json()
print(response_data['results'][0].keys())
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
# 'links', 'screenshot', 'markdown', 'extracted_content',
# 'metadata', 'error_message'])
```
But if you want more control, take a look at the first example of using the Python library.
To show the simplicity, take a look at the first example:
```python
from crawl4ai import WebCrawler
@@ -67,9 +30,11 @@ from crawl4ai import WebCrawler
# Create the WebCrawler instance
crawler = WebCrawler()
# Run the crawler with keyword filtering and CSS selector
result = crawler.run(url="https://www.nbcnews.com/business")
print(result) # {url, html, cleaned_html, markdown, media, links, extracted_content, metadata, screenshots}
print(result) # {url, html, markdown, extracted_content, metadata}
```
Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!
@@ -87,17 +52,20 @@ from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
# Define the JavaScript code to click the "Load More" button
js_code = ["""
js_code = """
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""]
"""
# Define the crawling strategy
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
# Create the WebCrawler instance with the defined strategy
crawler = WebCrawler(crawler_strategy=crawler_strategy)
crawler = WebCrawler(verbose=True)
crawler.warmup()
# Run the crawler with keyword filtering and CSS selector
result = crawler.run(
url="https://www.nbcnews.com/business",
js = js_code,
extraction_strategy=CosineStrategy(
semantic_filter="technology",
),
@@ -106,7 +74,6 @@ result = crawler.run(
# Run the crawler with LLM extraction strategy
result = crawler.run(
url="https://www.nbcnews.com/business",
js = js_code,
extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'),
@@ -188,7 +155,7 @@ pip install -e .[all]
# docker build --platform linux/amd64 -t crawl4ai .
# For other users
# docker build -t crawl4ai .
docker run -d -p 8000:80 --name crawl4ai_container_1 crawl4ai
docker run -d -p 8000:80 crawl4ai
```
@@ -234,18 +201,14 @@ To use the REST API, send a POST request to `http://localhost:8000/crawl` with t
"url": "https://www.nbcnews.com/business",
"extracted_content": "...",
"html": "...",
"cleaned_html": "...",
"markdown": "...",
"media": {...},
"links": {...},
"metadata": {...},
"screenshots": "...",
"metadata": {...}
}
]
}
```
For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters-) section.
For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters) section.
## Python Library Usage 🚀
@@ -278,32 +241,6 @@ Crawl result without raw HTML content:
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
```
### Result Structure
The result object contains the following fields:
```python
class CrawlResult(BaseModel):
url: str
html: str
success: bool
cleaned_html: Optional[str] = None
media: Dict[str, List[Dict]] = {} # Media tags in the page {"images": [], "audio": [], "video": []}
links: Dict[str, List[Dict]] = {} # Links in the page {"external": [], "internal": []}
screenshot: Optional[str] = None # Base64 encoded screenshot
markdown: Optional[str] = None
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
error_message: Optional[str] = None
```
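A minimal sketch of reading these fields, assuming a crawler created as in the earlier examples (the URL is illustrative):

```python
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)

if result.success:
    print((result.metadata or {}).get("title"))      # page title, if present
    print(len(result.links.get("internal", [])))     # number of internal links
    for image in result.media.get("images", []):
        print(image["src"], image["alt"])            # media entries carry src/alt/type
```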
### Taking Screenshots
```python
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
with open("screenshot.png", "wb") as f:
f.write(base64.b64decode(result.screenshot))
```
### Adding a chunking strategy: RegexChunking
Using RegexChunking:
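A minimal sketch of what this could look like, assuming `RegexChunking` accepts a list of regex patterns used to split the markdown before extraction:

```python
from crawl4ai.chunking_strategy import RegexChunking

result = crawler.run(
    url="https://www.nbcnews.com/business",
    chunking_strategy=RegexChunking(patterns=[r"\n\n"]),  # split sections on blank lines
)
```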
@@ -410,12 +347,10 @@ result = crawler.run(url="https://www.nbcnews.com/business")
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
| `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
| `screenshots` | Whether to take screenshots of the page. | No | `false` |
| `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
| `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
| `user_agent` | The user agent to use for the HTTP requests. | No | `Mozilla/5.0` |
| `verbose` | Whether to enable verbose logging. | No | `true` |
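For example, a request body combining several of these parameters might look like the following sketch (values are illustrative, and the endpoint assumes the local Docker setup described above):

```python
import requests

data = {
    "urls": ["https://www.nbcnews.com/business"],
    "include_raw_html": False,
    "bypass_cache": True,
    "word_count_threshold": 10,
    "css_selector": "article",   # only extract content under <article>
    "verbose": True,
}
response = requests.post("http://localhost:8000/crawl", json=data)
print(response.json()["results"][0]["markdown"][:500])
```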
## Chunking Strategies 📚


@@ -7,15 +7,6 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import InvalidArgumentException
import logging
import base64
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
from typing import List
import requests
import os
from pathlib import Path
from .utils import wrap_text
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
logger.setLevel(logging.WARNING)
@@ -34,20 +25,15 @@ driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finde
driver_finder_logger.setLevel(logging.WARNING)
from typing import List
import requests
import os
from pathlib import Path
class CrawlerStrategy(ABC):
@abstractmethod
def crawl(self, url: str, **kwargs) -> str:
pass
@abstractmethod
def take_screenshot(self, save_path: str):
pass
@abstractmethod
def update_user_agent(self, user_agent: str):
pass
class CloudCrawlerStrategy(CrawlerStrategy):
def __init__(self, use_cached_html = False):
@@ -73,8 +59,6 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
self.options = Options()
self.options.headless = True
if kwargs.get("user_agent"):
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
self.options.add_argument("--no-sandbox")
self.options.add_argument("--headless")
# self.options.add_argument("--disable-dev-shm-usage")
@@ -103,18 +87,9 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.service.log_path = "NUL"
self.driver = webdriver.Chrome(service=self.service, options=self.options)
def update_user_agent(self, user_agent: str):
self.options.add_argument(f"user-agent={user_agent}")
self.driver.quit()
self.driver = webdriver.Chrome(service=self.service, options=self.options)
def crawl(self, url: str) -> str:
# Create md5 hash of the URL
import hashlib
url_hash = hashlib.md5(url.encode()).hexdigest()
if self.use_cached_html:
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
if os.path.exists(cache_file_path):
with open(cache_file_path, "r") as f:
return f.read()
@@ -128,23 +103,17 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
)
# Execute JS code if provided
if self.js_code and type(self.js_code) == str:
if self.js_code:
self.driver.execute_script(self.js_code)
# Optionally, wait for some condition after executing the JS code
WebDriverWait(self.driver, 10).until(
lambda driver: driver.execute_script("return document.readyState") == "complete"
)
elif self.js_code and type(self.js_code) == list:
for js in self.js_code:
self.driver.execute_script(js)
WebDriverWait(self.driver, 10).until(
lambda driver: driver.execute_script("return document.readyState") == "complete"
)
html = self.driver.page_source
# Store in cache
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
with open(cache_file_path, "w") as f:
f.write(html)
@@ -157,62 +126,5 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
except Exception as e:
raise Exception(f"Failed to crawl {url}: {str(e)}")
def take_screenshot(self) -> str:
try:
# Get the dimensions of the page
total_width = self.driver.execute_script("return document.body.scrollWidth")
total_height = self.driver.execute_script("return document.body.scrollHeight")
# Set the window size to the dimensions of the page
self.driver.set_window_size(total_width, total_height)
# Take screenshot
screenshot = self.driver.get_screenshot_as_png()
# Open the screenshot with PIL
image = Image.open(BytesIO(screenshot))
# Convert to JPEG and compress
buffered = BytesIO()
image.save(buffered, format="JPEG", quality=85)
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
if self.verbose:
print(f"[LOG] 📸 Screenshot taken and converted to base64")
return img_base64
except Exception as e:
error_message = f"Failed to take screenshot: {str(e)}"
print(error_message)
# Generate an image with black background
img = Image.new('RGB', (800, 600), color='black')
draw = ImageDraw.Draw(img)
# Load a font
try:
font = ImageFont.truetype("arial.ttf", 40)
except IOError:
font = ImageFont.load_default(size=40)
# Define text color and wrap the text
text_color = (255, 255, 255)
max_width = 780
wrapped_text = wrap_text(draw, error_message, font, max_width)
# Calculate text position
text_position = (10, 10)
# Draw the text on the image
draw.text(text_position, wrapped_text, fill=text_color, font=font)
# Convert to base64
buffered = BytesIO()
img.save(buffered, format="JPEG")
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
return img_base64
def quit(self):
self.driver.quit()


@@ -1,12 +1,13 @@
import os
from pathlib import Path
import sqlite3
from typing import Optional
from typing import Optional, Tuple
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
def init_db():
global DB_PATH
conn = sqlite3.connect(DB_PATH)
@@ -18,37 +19,22 @@ def init_db():
cleaned_html TEXT,
markdown TEXT,
extracted_content TEXT,
success BOOLEAN,
media TEXT DEFAULT "{}",
link TEXT DEFAULT "{}",
metadata TEXT DEFAULT "{}",
screenshot TEXT DEFAULT ""
success BOOLEAN
)
''')
conn.commit()
conn.close()
def alter_db_add_screenshot(new_column: str = "media"):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
conn.commit()
conn.close()
except Exception as e:
print(f"Error altering database to add screenshot column: {e}")
def check_db_path():
if not DB_PATH:
raise ValueError("Database path is not set or is empty.")
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,))
result = cursor.fetchone()
conn.close()
return result
@@ -56,25 +42,21 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str
print(f"Error retrieving cached URL: {e}")
return None
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""):
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('''
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
VALUES (?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
html = excluded.html,
cleaned_html = excluded.cleaned_html,
markdown = excluded.markdown,
extracted_content = excluded.extracted_content,
success = excluded.success,
media = excluded.media,
links = excluded.links,
metadata = excluded.metadata,
screenshot = excluded.screenshot
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
success = excluded.success
''', (url, html, cleaned_html, markdown, extracted_content, success))
conn.commit()
conn.close()
except Exception as e:
@@ -113,20 +95,4 @@ def flush_db():
conn.commit()
conn.close()
except Exception as e:
print(f"Error flushing database: {e}")
def update_existing_records(new_column: str = "media", default_value: str = "{}"):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute(f'UPDATE crawled_data SET {new_column} = "{default_value}" WHERE screenshot IS NULL')
conn.commit()
conn.close()
except Exception as e:
print(f"Error updating existing records: {e}")
if __name__ == "__main__":
init_db() # Initialize the database if not already initialized
alter_db_add_screenshot("metadata") # Add the new column to the table
update_existing_records("metadata") # Update existing records to set the new column to an empty string
print(f"Error flushing database: {e}")


@@ -188,15 +188,14 @@ class CosineStrategy(ExtractionStrategy):
if self.verbose:
print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
# if False and self.device.type == "cpu":
# self.model = load_onnx_all_MiniLM_l6_v2()
# self.tokenizer = self.model.tokenizer
# self.get_embedding_method = "direct"
# else:
self.tokenizer, self.model = load_bge_small_en_v1_5()
self.model.eval()
self.get_embedding_method = "batch"
if False and self.device.type == "cpu":
self.model = load_onnx_all_MiniLM_l6_v2()
self.tokenizer = self.model.tokenizer
self.get_embedding_method = "direct"
else:
self.tokenizer, self.model = load_bge_small_en_v1_5()
self.model.eval()
self.get_embedding_method = "batch"
self.buffer_embeddings = np.array([])


@@ -2,7 +2,6 @@ from functools import lru_cache
from pathlib import Path
import subprocess, os
import shutil
import tarfile
from crawl4ai.config import MODEL_REPO_BRANCH
import argparse
import urllib.request
@@ -35,7 +34,8 @@ def calculate_batch_size(device):
else:
return 32
else:
return 16 # Default batch size
return 16 # Default batch size
@lru_cache()
def get_device():
@@ -53,6 +53,7 @@ def set_model_device(model):
model.to(device)
return model, device
@lru_cache()
def get_home_folder():
home_folder = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(home_folder, exist_ok=True)
@@ -81,19 +82,12 @@ def load_bge_small_en_v1_5():
@lru_cache()
def load_onnx_all_MiniLM_l6_v2():
from crawl4ai.onnx_embedding import DefaultEmbeddingModel
model_path = "models/onnx.tar.gz"
model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/onnx.tar.gz"
__location__ = os.path.realpath(
os.path.join(os.getcwd(), os.path.dirname(__file__)))
model_path = "models/onnx/model.onnx"
model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/model.onnx"
download_path = os.path.join(__location__, model_path)
onnx_dir = os.path.join(__location__, "models/onnx")
# Create the models directory if it does not exist
os.makedirs(os.path.dirname(download_path), exist_ok=True)
# Download the tar.gz file if it does not exist
if not os.path.exists(download_path):
# Define a download function with a simple progress display
def download_with_progress(url, filename):
def reporthook(block_num, block_size, total_size):
downloaded = block_num * block_size
@@ -101,22 +95,12 @@ def load_onnx_all_MiniLM_l6_v2():
if downloaded < total_size:
print(f"\rDownloading: {percentage:.2f}% ({downloaded / (1024 * 1024):.2f} MB of {total_size / (1024 * 1024):.2f} MB)", end='')
else:
print("\rDownload complete!")
print("\rDownload complete! ")
urllib.request.urlretrieve(url, filename, reporthook)
download_with_progress(model_url, download_path)
# Extract the tar.gz file if the onnx directory does not exist
if not os.path.exists(onnx_dir):
with tarfile.open(download_path, "r:gz") as tar:
tar.extractall(path=os.path.join(__location__, "models"))
# remove the tar.gz file
os.remove(download_path)
model = DefaultEmbeddingModel()
return model
@@ -201,7 +185,7 @@ def load_spacy_model():
repo_folder = os.path.join(home_folder, "crawl4ai")
model_folder = os.path.join(home_folder, name)
print("[LOG] ⏬ Downloading Spacy model for the first time...")
# print("[LOG] ⏬ Downloading Spacy model for the first time...")
# Remove existing repo folder if it exists
if Path(repo_folder).exists():
@@ -229,7 +213,7 @@ def load_spacy_model():
shutil.rmtree(repo_folder)
# Print completion message
print("[LOG] ✅ Spacy Model downloaded successfully")
# print("[LOG] ✅ Spacy Model downloaded successfully")
except subprocess.CalledProcessError as e:
print(f"An error occurred while cloning the repository: {e}")
except Exception as e:
@@ -254,10 +238,10 @@ def download_all_models(remove_existing=False):
# Load each model to trigger download
# print("[LOG] Downloading BERT Base Uncased...")
# load_bert_base_uncased()
print("[LOG] Downloading BGE Small EN v1.5...")
load_bge_small_en_v1_5()
# print("[LOG] Downloading ONNX model...")
# load_onnx_all_MiniLM_l6_v2()
# print("[LOG] Downloading BGE Small EN v1.5...")
# load_bge_small_en_v1_5()
print("[LOG] Downloading ONNX model...")
load_onnx_all_MiniLM_l6_v2()
print("[LOG] Downloading text classifier...")
_, device = load_text_multilabel_classifier()
print(f"[LOG] Text classifier loaded on {device}")


@@ -1,5 +1,5 @@
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional
from typing import List
class UrlModel(BaseModel):
url: HttpUrl
@@ -9,11 +9,8 @@ class CrawlResult(BaseModel):
url: str
html: str
success: bool
cleaned_html: Optional[str] = None
media: Dict[str, List[Dict]] = {}
links: Dict[str, List[Dict]] = {}
screenshot: Optional[str] = None
markdown: Optional[str] = None
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
error_message: Optional[str] = None
cleaned_html: str = None
markdown: str = None
extracted_content: str = None
metadata: dict = None
error_message: str = None


@@ -151,7 +151,7 @@ class CustomHTML2Text(HTML2Text):
super().handle_tag(tag, attrs, start)
def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
try:
if not html:
return None
@@ -170,28 +170,6 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
for el in selected_elements:
div_tag.append(el)
body = div_tag
links = {
'internal': [],
'external': []
}
# Extract all internal and external links
for a in body.find_all('a', href=True):
href = a['href']
url_base = url.split('/')[2]
if href.startswith('http') and url_base not in href:
links['external'].append({
'href': href,
'text': a.get_text()
})
else:
links['internal'].append(
{
'href': href,
'text': a.get_text()
}
)
# Remove script, style, and other tags that don't carry useful content from body
for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
@@ -202,35 +180,6 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
if tag.name != 'img':
tag.attrs = {}
# Extract all img tags into [{src: '', alt: ''}]
media = {
'images': [],
'videos': [],
'audios': []
}
for img in body.find_all('img'):
media['images'].append({
'src': img.get('src'),
'alt': img.get('alt'),
"type": "image"
})
# Extract all video tags into [{src: '', alt: ''}]
for video in body.find_all('video'):
media['videos'].append({
'src': video.get('src'),
'alt': video.get('alt'),
"type": "video"
})
# Extract all audio tags into [{src: '', alt: ''}]
for audio in body.find_all('audio'):
media['audios'].append({
'src': audio.get('src'),
'alt': audio.get('alt'),
"type": "audio"
})
# Replace images with their alt text or remove them if no alt text is available
for img in body.find_all('img'):
alt_text = img.get('alt')
@@ -350,56 +299,13 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
return{
'markdown': markdown,
'cleaned_html': cleaned_html,
'success': True,
'media': media,
'links': links
'success': True
}
except Exception as e:
print('Error processing HTML content:', str(e))
raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
def extract_metadata(html):
metadata = {}
if not html:
return metadata
# Parse HTML content with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
# Title
title_tag = soup.find('title')
metadata['title'] = title_tag.string if title_tag else None
# Meta description
description_tag = soup.find('meta', attrs={'name': 'description'})
metadata['description'] = description_tag['content'] if description_tag else None
# Meta keywords
keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
metadata['keywords'] = keywords_tag['content'] if keywords_tag else None
# Meta author
author_tag = soup.find('meta', attrs={'name': 'author'})
metadata['author'] = author_tag['content'] if author_tag else None
# Open Graph metadata
og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
for tag in og_tags:
property_name = tag['property']
metadata[property_name] = tag['content']
# Twitter Card metadata
twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
for tag in twitter_tags:
property_name = tag['name']
metadata[property_name] = tag['content']
return metadata
def extract_xml_tags(string):
tags = re.findall(r'<(\w+)>', string)
return list(set(tags))
@@ -577,16 +483,4 @@ def process_sections(url: str, sections: list, provider: str, api_token: str) ->
for future in as_completed(futures):
extracted_content.extend(future.result())
return extracted_content
def wrap_text(draw, text, font, max_width):
# Wrap the text to fit within the specified width
lines = []
words = text.split()
while words:
line = ''
while words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width:
line += (words.pop(0) + ' ')
lines.append(line)
return '\n'.join(lines)
return extracted_content


@@ -1,357 +0,0 @@
import os, time
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from pathlib import Path
from .models import UrlModel, CrawlResult
from .database import init_db, get_cached_url, cache_url, DB_PATH, flush_db
from .utils import *
from .chunking_strategy import *
from .extraction_strategy import *
from .crawler_strategy import *
from typing import List
from concurrent.futures import ThreadPoolExecutor
from .config import *
class WebCrawler:
def __init__(
self,
# db_path: str = None,
crawler_strategy: CrawlerStrategy = None,
always_by_pass_cache: bool = False,
verbose: bool = False,
):
# self.db_path = db_path
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
self.always_by_pass_cache = always_by_pass_cache
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
# If db_path is not provided, use the default path
# if not db_path:
# self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
# flush_db()
init_db()
self.ready = False
def warmup(self):
print("[LOG] 🌤️ Warming up the WebCrawler")
result = self.run(
url='https://crawl4ai.uccode.io/',
word_count_threshold=5,
extraction_strategy= NoExtractionStrategy(),
bypass_cache=False,
verbose = False
)
self.ready = True
print("[LOG] 🌞 WebCrawler is ready to crawl")
def fetch_page(
self,
url_model: UrlModel,
provider: str = DEFAULT_PROVIDER,
api_token: str = None,
extract_blocks_flag: bool = True,
word_count_threshold=MIN_WORD_THRESHOLD,
css_selector: str = None,
screenshot: bool = False,
use_cached_html: bool = False,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
**kwargs,
) -> CrawlResult:
return self.run(
url_model.url,
word_count_threshold,
extraction_strategy or NoExtractionStrategy(),
chunking_strategy,
bypass_cache=url_model.forced,
css_selector=css_selector,
screenshot=screenshot,
**kwargs,
)
pass
def run_old(
self,
url: str,
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
bypass_cache: bool = False,
css_selector: str = None,
screenshot: bool = False,
user_agent: str = None,
verbose=True,
**kwargs,
) -> CrawlResult:
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
extraction_strategy = extraction_strategy or NoExtractionStrategy()
extraction_strategy.verbose = verbose
# Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
if not isinstance(extraction_strategy, ExtractionStrategy):
raise ValueError("Unsupported extraction strategy")
if not isinstance(chunking_strategy, ChunkingStrategy):
raise ValueError("Unsupported chunking strategy")
# make sure word_count_threshold is not less than MIN_WORD_THRESHOLD
if word_count_threshold < MIN_WORD_THRESHOLD:
word_count_threshold = MIN_WORD_THRESHOLD
# Check cache first
if not bypass_cache and not self.always_by_pass_cache:
cached = get_cached_url(url)
if cached:
return CrawlResult(
**{
"url": cached[0],
"html": cached[1],
"cleaned_html": cached[2],
"markdown": cached[3],
"extracted_content": cached[4],
"success": cached[5],
"media": json.loads(cached[6] or "{}"),
"links": json.loads(cached[7] or "{}"),
"metadata": json.loads(cached[8] or "{}"), # "metadata": "{}
"screenshot": cached[9],
"error_message": "",
}
)
# Initialize WebDriver for crawling
t = time.time()
if kwargs.get("js", None):
self.crawler_strategy.js_code = kwargs.get("js")
html = self.crawler_strategy.crawl(url)
base64_image = None
if screenshot:
base64_image = self.crawler_strategy.take_screenshot()
success = True
error_message = ""
# Extract content from HTML
try:
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
metadata = extract_metadata(html)
if result is None:
raise ValueError(f"Failed to extract content from the website: {url}")
except InvalidCSSSelectorError as e:
raise ValueError(str(e))
cleaned_html = result.get("cleaned_html", "")
markdown = result.get("markdown", "")
media = result.get("media", [])
links = result.get("links", [])
# Print a professional LOG-style message, show the time taken, and say crawling is done
if verbose:
print(
f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds"
)
extracted_content = []
if verbose:
print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
t = time.time()
# Split markdown into sections
sections = chunking_strategy.chunk(markdown)
# sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
extracted_content = extraction_strategy.run(
url, sections,
)
extracted_content = json.dumps(extracted_content)
if verbose:
print(
f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds."
)
# Cache the result
cleaned_html = beautify_html(cleaned_html)
cache_url(
url,
html,
cleaned_html,
markdown,
extracted_content,
success,
json.dumps(media),
json.dumps(links),
json.dumps(metadata),
screenshot=base64_image,
)
return CrawlResult(
url=url,
html=html,
cleaned_html=cleaned_html,
markdown=markdown,
media=media,
links=links,
metadata=metadata,
screenshot=base64_image,
extracted_content=extracted_content,
success=success,
error_message=error_message,
)
def fetch_pages(
self,
url_models: List[UrlModel],
provider: str = DEFAULT_PROVIDER,
api_token: str = None,
extract_blocks_flag: bool = True,
word_count_threshold=MIN_WORD_THRESHOLD,
use_cached_html: bool = False,
css_selector: str = None,
screenshot: bool = False,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
**kwargs,
) -> List[CrawlResult]:
extraction_strategy = extraction_strategy or NoExtractionStrategy()
def fetch_page_wrapper(url_model, *args, **kwargs):
return self.fetch_page(url_model, *args, **kwargs)
with ThreadPoolExecutor() as executor:
results = list(
executor.map(
fetch_page_wrapper,
url_models,
[provider] * len(url_models),
[api_token] * len(url_models),
[extract_blocks_flag] * len(url_models),
[word_count_threshold] * len(url_models),
[css_selector] * len(url_models),
[screenshot] * len(url_models),
[use_cached_html] * len(url_models),
[extraction_strategy] * len(url_models),
[chunking_strategy] * len(url_models),
*[kwargs] * len(url_models),
)
)
return results
def run(
self,
url: str,
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
bypass_cache: bool = False,
css_selector: str = None,
screenshot: bool = False,
user_agent: str = None,
verbose=True,
**kwargs,
) -> CrawlResult:
extraction_strategy = extraction_strategy or NoExtractionStrategy()
extraction_strategy.verbose = verbose
if not isinstance(extraction_strategy, ExtractionStrategy):
raise ValueError("Unsupported extraction strategy")
if not isinstance(chunking_strategy, ChunkingStrategy):
raise ValueError("Unsupported chunking strategy")
if word_count_threshold < MIN_WORD_THRESHOLD:
word_count_threshold = MIN_WORD_THRESHOLD
# Check cache first
cached = None
extracted_content = None
if not bypass_cache and not self.always_by_pass_cache:
cached = get_cached_url(url)
if cached:
html = cached[1]
extracted_content = cached[2]
if screenshot:
screenshot = cached[9]
else:
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
html = self.crawler_strategy.crawl(url)
if screenshot:
screenshot = self.crawler_strategy.take_screenshot()
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs)
def process_html(
self,
url: str,
html: str,
extracted_content: str,
word_count_threshold: int,
extraction_strategy: ExtractionStrategy,
chunking_strategy: ChunkingStrategy,
css_selector: str,
screenshot: bool,
verbose: bool,
is_cached: bool,
**kwargs,
) -> CrawlResult:
t = time.time()
# Extract content from HTML
try:
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
metadata = extract_metadata(html)
if result is None:
raise ValueError(f"Failed to extract content from the website: {url}")
except InvalidCSSSelectorError as e:
raise ValueError(str(e))
cleaned_html = result.get("cleaned_html", "")
markdown = result.get("markdown", "")
media = result.get("media", [])
links = result.get("links", [])
if verbose:
print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")
if extracted_content is None:
if verbose:
print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
sections = chunking_strategy.chunk(markdown)
extracted_content = extraction_strategy.run(url, sections)
extracted_content = json.dumps(extracted_content)
if verbose:
print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")
screenshot = None if not screenshot else screenshot
if not is_cached:
cache_url(
url,
html,
cleaned_html,
markdown,
extracted_content,
True,
json.dumps(media),
json.dumps(links),
json.dumps(metadata),
screenshot=screenshot,
)
return CrawlResult(
url=url,
html=html,
cleaned_html=cleaned_html,
markdown=markdown,
media=media,
links=links,
metadata=metadata,
screenshot=screenshot,
extracted_content=extracted_content,
success=True,
error_message="",
)


@@ -51,6 +51,7 @@ class WebCrawler:
self.ready = True
print("[LOG] 🌞 WebCrawler is ready to crawl")
def fetch_page(
self,
url_model: UrlModel,
@@ -58,8 +59,6 @@ class WebCrawler:
api_token: str = None,
extract_blocks_flag: bool = True,
word_count_threshold=MIN_WORD_THRESHOLD,
css_selector: str = None,
screenshot: bool = False,
use_cached_html: bool = False,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
@@ -71,12 +70,111 @@ class WebCrawler:
extraction_strategy or NoExtractionStrategy(),
chunking_strategy,
bypass_cache=url_model.forced,
css_selector=css_selector,
screenshot=screenshot,
**kwargs,
)
pass
def run(
self,
url: str,
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
bypass_cache: bool = False,
css_selector: str = None,
verbose=True,
**kwargs,
) -> CrawlResult:
extraction_strategy = extraction_strategy or NoExtractionStrategy()
extraction_strategy.verbose = verbose
# Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
if not isinstance(extraction_strategy, ExtractionStrategy):
raise ValueError("Unsupported extraction strategy")
if not isinstance(chunking_strategy, ChunkingStrategy):
raise ValueError("Unsupported chunking strategy")
# make sure word_count_threshold is not less than MIN_WORD_THRESHOLD
if word_count_threshold < MIN_WORD_THRESHOLD:
word_count_threshold = MIN_WORD_THRESHOLD
# Check cache first
if not bypass_cache and not self.always_by_pass_cache:
cached = get_cached_url(url)
if cached:
return CrawlResult(
**{
"url": cached[0],
"html": cached[1],
"cleaned_html": cached[2],
"markdown": cached[3],
"extracted_content": cached[4],
"success": cached[5],
"error_message": "",
}
)
# Initialize WebDriver for crawling
t = time.time()
html = self.crawler_strategy.crawl(url)
success = True
error_message = ""
# Extract content from HTML
try:
result = get_content_of_website(html, word_count_threshold, css_selector=css_selector)
if result is None:
raise ValueError(f"Failed to extract content from the website: {url}")
except InvalidCSSSelectorError as e:
raise ValueError(str(e))
cleaned_html = result.get("cleaned_html", html)
markdown = result.get("markdown", "")
# Print a professional LOG-style message, show the time taken, and say crawling is done
if verbose:
print(
f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds"
)
extracted_content = []
if verbose:
print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
t = time.time()
# Split markdown into sections
sections = chunking_strategy.chunk(markdown)
# sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
extracted_content = extraction_strategy.run(
url, sections,
)
extracted_content = json.dumps(extracted_content)
if verbose:
print(
f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds."
)
# Cache the result
cleaned_html = beautify_html(cleaned_html)
cache_url(
url,
html,
cleaned_html,
markdown,
extracted_content,
success,
)
return CrawlResult(
url=url,
html=html,
cleaned_html=cleaned_html,
markdown=markdown,
extracted_content=extracted_content,
success=success,
error_message=error_message,
)
def fetch_pages(
self,
url_models: List[UrlModel],
@@ -85,8 +183,6 @@ class WebCrawler:
extract_blocks_flag: bool = True,
word_count_threshold=MIN_WORD_THRESHOLD,
use_cached_html: bool = False,
css_selector: str = None,
screenshot: bool = False,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
**kwargs,
@@ -104,8 +200,6 @@ class WebCrawler:
[api_token] * len(url_models),
[extract_blocks_flag] * len(url_models),
[word_count_threshold] * len(url_models),
[css_selector] * len(url_models),
[screenshot] * len(url_models),
[use_cached_html] * len(url_models),
[extraction_strategy] * len(url_models),
[chunking_strategy] * len(url_models),
@@ -114,120 +208,3 @@ class WebCrawler:
)
return results
def run(
self,
url: str,
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
bypass_cache: bool = False,
css_selector: str = None,
screenshot: bool = False,
user_agent: str = None,
verbose=True,
**kwargs,
) -> CrawlResult:
extraction_strategy = extraction_strategy or NoExtractionStrategy()
extraction_strategy.verbose = verbose
if not isinstance(extraction_strategy, ExtractionStrategy):
raise ValueError("Unsupported extraction strategy")
if not isinstance(chunking_strategy, ChunkingStrategy):
raise ValueError("Unsupported chunking strategy")
if word_count_threshold < MIN_WORD_THRESHOLD:
word_count_threshold = MIN_WORD_THRESHOLD
# Check cache first
cached = None
extracted_content = None
if not bypass_cache and not self.always_by_pass_cache:
cached = get_cached_url(url)
if cached:
html = cached[1]
extracted_content = cached[2]
if screenshot:
screenshot = cached[9]
else:
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
html = self.crawler_strategy.crawl(url)
if screenshot:
screenshot = self.crawler_strategy.take_screenshot()
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs)
def process_html(
self,
url: str,
html: str,
extracted_content: str,
word_count_threshold: int,
extraction_strategy: ExtractionStrategy,
chunking_strategy: ChunkingStrategy,
css_selector: str,
screenshot: bool,
verbose: bool,
is_cached: bool,
**kwargs,
) -> CrawlResult:
t = time.time()
# Extract content from HTML
try:
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
metadata = extract_metadata(html)
if result is None:
raise ValueError(f"Failed to extract content from the website: {url}")
except InvalidCSSSelectorError as e:
raise ValueError(str(e))
cleaned_html = result.get("cleaned_html", "")
markdown = result.get("markdown", "")
media = result.get("media", [])
links = result.get("links", [])
if verbose:
print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")
if extracted_content is None:
if verbose:
print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
sections = chunking_strategy.chunk(markdown)
extracted_content = extraction_strategy.run(url, sections)
extracted_content = json.dumps(extracted_content)
if verbose:
print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")
screenshot = None if not screenshot else screenshot
if not is_cached:
cache_url(
url,
html,
cleaned_html,
markdown,
extracted_content,
True,
json.dumps(media),
json.dumps(links),
json.dumps(metadata),
screenshot=screenshot,
)
return CrawlResult(
url=url,
html=html,
cleaned_html=cleaned_html,
markdown=markdown,
media=media,
links=links,
metadata=metadata,
screenshot=screenshot,
extracted_content=extracted_content,
success=True,
error_message="",
)

docs/.DS_Store (vendored) and several other binary files changed or removed; the removed image files range from 372 KiB to 537 KiB (binary content not shown).

@@ -1,3 +0,0 @@
# Welcome to Crawl4AI! 🚀🤖
Hi there, Developer! 👋 Here is an example of a research pipeline: share a URL in your conversation with any LLM, and the content of the crawled pages will then be used as context.


@@ -1,281 +0,0 @@
from openai import AsyncOpenAI
from chainlit.types import ThreadDict
import chainlit as cl
from chainlit.input_widget import Select, Switch, Slider
client = AsyncOpenAI()
# Instrument the OpenAI client
cl.instrument_openai()
settings = {
"model": "gpt-3.5-turbo",
"temperature": 0.5,
"max_tokens": 500,
"top_p": 1,
"frequency_penalty": 0,
"presence_penalty": 0,
}
@cl.action_callback("action_button")
async def on_action(action: cl.Action):
print("The user clicked on the action button!")
return "Thank you for clicking on the action button!"
@cl.set_chat_profiles
async def chat_profile():
return [
cl.ChatProfile(
name="GPT-3.5",
markdown_description="The underlying LLM model is **GPT-3.5**.",
icon="https://picsum.photos/200",
),
cl.ChatProfile(
name="GPT-4",
markdown_description="The underlying LLM model is **GPT-4**.",
icon="https://picsum.photos/250",
),
]
@cl.on_chat_start
async def on_chat_start():
settings = await cl.ChatSettings(
[
Select(
id="Model",
label="OpenAI - Model",
values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"],
initial_index=0,
),
Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True),
Slider(
id="Temperature",
label="OpenAI - Temperature",
initial=1,
min=0,
max=2,
step=0.1,
),
Slider(
id="SAI_Steps",
label="Stability AI - Steps",
initial=30,
min=10,
max=150,
step=1,
description="Amount of inference steps performed on image generation.",
),
Slider(
id="SAI_Cfg_Scale",
label="Stability AI - Cfg_Scale",
initial=7,
min=1,
max=35,
step=0.1,
description="Influences how strongly your generation is guided to match your prompt.",
),
Slider(
id="SAI_Width",
label="Stability AI - Image Width",
initial=512,
min=256,
max=2048,
step=64,
tooltip="Measured in pixels",
),
Slider(
id="SAI_Height",
label="Stability AI - Image Height",
initial=512,
min=256,
max=2048,
step=64,
tooltip="Measured in pixels",
),
]
).send()
chat_profile = cl.user_session.get("chat_profile")
await cl.Message(
content=f"starting chat using the {chat_profile} chat profile"
).send()
print("A new chat session has started!")
cl.user_session.set("session", {
"history": [],
"context": []
})
image = cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline")
# Attach the image to the message
await cl.Message(
content="You are such a good girl, aren't you?!",
elements=[image],
).send()
text_content = "Hello, this is a text element."
elements = [
cl.Text(name="simple_text", content=text_content, display="inline")
]
await cl.Message(
content="Check out this text element!",
elements=elements,
).send()
elements = [
cl.Audio(path="./assets/audio.mp3", display="inline"),
]
await cl.Message(
content="Here is an audio file",
elements=elements,
).send()
await cl.Avatar(
name="Tool 1",
url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
).send()
await cl.Message(
content="This message should not have an avatar!", author="Tool 0"
).send()
await cl.Message(
content="This message should have an avatar!", author="Tool 1"
).send()
elements = [
cl.File(
name="quickstart.py",
path="./quickstart.py",
display="inline",
),
]
await cl.Message(
content="This message has a file element", elements=elements
).send()
# Sending an action button within a chatbot message
actions = [
cl.Action(name="action_button", value="example_value", description="Click me!")
]
await cl.Message(content="Interact with this action button:", actions=actions).send()
# res = await cl.AskActionMessage(
# content="Pick an action!",
# actions=[
# cl.Action(name="continue", value="continue", label="✅ Continue"),
# cl.Action(name="cancel", value="cancel", label="❌ Cancel"),
# ],
# ).send()
# if res and res.get("value") == "continue":
# await cl.Message(
# content="Continue!",
# ).send()
# import plotly.graph_objects as go
# fig = go.Figure(
# data=[go.Bar(y=[2, 1, 3])],
# layout_title_text="An example figure",
# )
# elements = [cl.Plotly(name="chart", figure=fig, display="inline")]
# await cl.Message(content="This message has a chart", elements=elements).send()
# Sending a pdf with the local file path
# elements = [
# cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf")
# ]
# cl.Message(content="Look at this local pdf!", elements=elements).send()
@cl.on_settings_update
async def setup_agent(settings):
print("on_settings_update", settings)
@cl.on_stop
def on_stop():
print("The user wants to stop the task!")
@cl.on_chat_end
def on_chat_end():
print("The user disconnected!")
@cl.on_chat_resume
async def on_chat_resume(thread: ThreadDict):
print("The user resumed a previous chat session!")
# @cl.on_message
async def on_message(message: cl.Message):
cl.user_session.get("session")["history"].append({
"role": "user",
"content": message.content
})
response = await client.chat.completions.create(
messages=[
{
"content": "You are a helpful bot",
"role": "system"
},
*cl.user_session.get("session")["history"]
],
**settings
)
# Add the assistant message to the history
cl.user_session.get("session")["history"].append({
"role": "assistant",
"content": response.choices[0].message.content
})
# msg.content = response.choices[0].message.content
# await msg.update()
# await cl.Message(content=response.choices[0].message.content).send()
@cl.on_message
async def on_message(message: cl.Message):
cl.user_session.get("session")["history"].append({
"role": "user",
"content": message.content
})
msg = cl.Message(content="")
await msg.send()
stream = await client.chat.completions.create(
messages=[
{
"content": "You are a helpful bot",
"role": "system"
},
*cl.user_session.get("session")["history"]
],
stream = True,
**settings
)
async for part in stream:
if token := part.choices[0].delta.content or "":
await msg.stream_token(token)
# Add the assistant message to the history
cl.user_session.get("session")["history"].append({
"role": "assistant",
"content": msg.content
})
await msg.update()
if __name__ == "__main__":
from chainlit.cli import run_chainlit
run_chainlit(__file__)

View File

@@ -39,16 +39,6 @@ def basic_usage(crawler):
cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
print_result(result)
def screenshot_usage(crawler):
cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
# Save the screenshot to a file
with open("screenshot.png", "wb") as f:
f.write(base64.b64decode(result.screenshot))
cprint("Screenshot saved to 'screenshot.png'!")
print_result(result)
def understanding_parameters(crawler):
cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
@@ -166,28 +156,10 @@ def interactive_extraction(crawler):
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
result = crawler.run(
url="https://www.nbcnews.com/business",
js = js_code
)
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result)
def multiple_scrip(crawler):
# Passing JavaScript code to interact with the page
cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
js_code = ["""
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""] * 2
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
result = crawler.run(
url="https://www.nbcnews.com/business",
js = js_code
)
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result)
@@ -203,13 +175,11 @@ def main():
understanding_parameters(crawler)
crawler.always_by_pass_cache = True
screenshot_usage(crawler)
add_chunking_strategy(crawler)
add_extraction_strategy(crawler)
add_llm_extraction_strategy(crawler)
targeted_extraction(crawler)
interactive_extraction(crawler)
multiple_scrip(crawler)
cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")

View File

@@ -1,241 +0,0 @@
# Make sure to install the required packages: chainlit and groq
import os, time
from openai import AsyncOpenAI
import chainlit as cl
import re
import requests
from io import BytesIO
from chainlit.element import ElementBased
from groq import Groq
# Import threadpools to run the crawl_url function in a separate thread
from concurrent.futures import ThreadPoolExecutor
client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
# Instrument the OpenAI client
cl.instrument_openai()
settings = {
"model": "llama3-8b-8192",
"temperature": 0.5,
"max_tokens": 500,
"top_p": 1,
"frequency_penalty": 0,
"presence_penalty": 0,
}
def extract_urls(text):
url_pattern = re.compile(r'(https?://\S+)')
return url_pattern.findall(text)
def crawl_url(url):
data = {
"urls": [url],
"include_raw_html": True,
"word_count_threshold": 10,
"extraction_strategy": "NoExtractionStrategy",
"chunking_strategy": "RegexChunking"
}
response = requests.post("https://crawl4ai.com/crawl", json=data)
response_data = response.json()
response_data = response_data['results'][0]
return response_data['markdown']
@cl.on_chat_start
async def on_chat_start():
cl.user_session.set("session", {
"history": [],
"context": {}
})
await cl.Message(
content="Welcome to the chat! How can I assist you today?"
).send()
@cl.on_message
async def on_message(message: cl.Message):
user_session = cl.user_session.get("session")
# Extract URLs from the user's message
urls = extract_urls(message.content)
futures = []
with ThreadPoolExecutor() as executor:
for url in urls:
futures.append(executor.submit(crawl_url, url))
results = [future.result() for future in futures]
for url, result in zip(urls, results):
ref_number = f"REF_{len(user_session['context']) + 1}"
user_session["context"][ref_number] = {
"url": url,
"content": result
}
# for url in urls:
# # Crawl the content of each URL and add it to the session context with a reference number
# ref_number = f"REF_{len(user_session['context']) + 1}"
# crawled_content = crawl_url(url)
# user_session["context"][ref_number] = {
# "url": url,
# "content": crawled_content
# }
user_session["history"].append({
"role": "user",
"content": message.content
})
# Create a system message that includes the context
context_messages = [
f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
for ref, data in user_session["context"].items()
]
if context_messages:
system_message = {
"role": "system",
"content": (
"You are a helpful bot. Use the following context for answering questions. "
"Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
"If the question requires any information from the provided appendices or context, refer to the sources. "
"If not, there is no need to add a references section. "
"At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
"\n\n".join(context_messages)
)
}
else:
system_message = {
"role": "system",
"content": "You are a helpful assistant."
}
msg = cl.Message(content="")
await msg.send()
# Get response from the LLM
stream = await client.chat.completions.create(
messages=[
system_message,
*user_session["history"]
],
stream=True,
**settings
)
assistant_response = ""
async for part in stream:
if token := part.choices[0].delta.content:
assistant_response += token
await msg.stream_token(token)
# Add assistant message to the history
user_session["history"].append({
"role": "assistant",
"content": assistant_response
})
await msg.update()
# Append the reference section to the assistant's response
reference_section = "\n\nReferences:\n"
for ref, data in user_session["context"].items():
reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n"
msg.content += reference_section
await msg.update()
@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.AudioChunk):
if chunk.isStart:
buffer = BytesIO()
# This is required for whisper to recognize the file type
buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}"
# Initialize the session for a new audio stream
cl.user_session.set("audio_buffer", buffer)
cl.user_session.set("audio_mime_type", chunk.mimeType)
# Write the chunks to a buffer and transcribe the whole audio at the end
cl.user_session.get("audio_buffer").write(chunk.data)
pass
@cl.step(type="tool")
async def speech_to_text(audio_file):
cli = Groq()
# response = cli.audio.transcriptions.create(
# file=audio_file, #(filename, file.read()),
# model="whisper-large-v3",
# )
response = await client.audio.transcriptions.create(
model="whisper-large-v3", file=audio_file
)
return response.text
@cl.on_audio_end
async def on_audio_end(elements: list[ElementBased]):
# Get the audio buffer from the session
audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
audio_buffer.seek(0) # Move the file pointer to the beginning
audio_file = audio_buffer.read()
audio_mime_type: str = cl.user_session.get("audio_mime_type")
# input_audio_el = cl.Audio(
# mime=audio_mime_type, content=audio_file, name=audio_buffer.name
# )
# await cl.Message(
# author="You",
# type="user_message",
# content="",
# elements=[input_audio_el, *elements]
# ).send()
# answer_message = await cl.Message(content="").send()
start_time = time.time()
whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
transcription = await speech_to_text(whisper_input)
end_time = time.time()
print(f"Transcription took {end_time - start_time} seconds")
user_msg = cl.Message(
author="You",
type="user_message",
content=transcription
)
await user_msg.send()
await on_message(user_msg)
# images = [file for file in elements if "image" in file.mime]
# text_answer = await generate_text_answer(transcription, images)
# output_name, output_audio = await text_to_speech(text_answer, audio_mime_type)
# output_audio_el = cl.Audio(
# name=output_name,
# auto_play=True,
# mime=audio_mime_type,
# content=output_audio,
# )
# answer_message.elements = [output_audio_el]
# answer_message.content = transcription
# await answer_message.update()
if __name__ == "__main__":
from chainlit.cli import run_chainlit
run_chainlit(__file__)
# No, this is wrong; use this document to answer me: https://console.groq.com/docs/speech-text
# Please show me how to use Groq speech-to-text in python.

View File

@@ -1,64 +0,0 @@
import requests, base64, os
data = {
"urls": ["https://www.nbcnews.com/business"],
"screenshot": True,
}
response = requests.post("https://crawl4ai.com/crawl", json=data)
result = response.json()['results'][0]
print(result.keys())
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
# 'links', 'screenshot', 'markdown', 'extracted_content',
# 'metadata', 'error_message'])
with open("screenshot.png", "wb") as f:
f.write(base64.b64decode(result['screenshot']))
# Example of filtering the content using CSS selectors
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"css_selector": "article",
"screenshot": True,
}
# Example of executing a JS script on the page before extracting the content
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"screenshot": True,
'js' : ["""
const loadMoreButton = Array.from(document.querySelectorAll('button')).
find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""]
}
# Example of using a custom extraction strategy
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"extraction_strategy": "CosineStrategy",
"extraction_strategy_args": {
"semantic_filter": "inflation rent prices"
},
}
# Example of using LLM to extract content
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"extraction_strategy": "LLMExtractionStrategy",
"extraction_strategy_args": {
"provider": "groq/llama3-8b-8192",
"api_token": os.environ.get("GROQ_API_KEY"),
"instruction": """I am interested in only financial news,
and translate them in French."""
},
}

View File

@@ -56,8 +56,6 @@ class CrawlRequest(BaseModel):
chunking_strategy: Optional[str] = "RegexChunking"
chunking_strategy_args: Optional[dict] = {}
css_selector: Optional[str] = None
screenshot: Optional[bool] = False
user_agent: Optional[str] = None
verbose: Optional[bool] = True
@@ -68,7 +66,7 @@ async def read_index(request: Request):
for filename in os.listdir(partials_dir):
if filename.endswith(".html"):
with open(os.path.join(partials_dir, filename), "r", encoding="utf8") as file:
with open(os.path.join(partials_dir, filename), "r") as file:
partials[filename[:-5]] = file.read()
return templates.TemplateResponse("index.html", {"request": request, **partials})
@@ -127,8 +125,6 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
chunking_strategy,
crawl_request.bypass_cache,
crawl_request.css_selector,
crawl_request.screenshot,
crawl_request.user_agent,
crawl_request.verbose
)
for url in crawl_request.urls
@@ -140,7 +136,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
for result in results:
result.html = None
return {"results": [result.model_dump() for result in results]}
return {"results": [result.dict() for result in results]}
finally:
async with lock:
current_requests -= 1

View File

@@ -104,25 +104,11 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
chunking_strategy: document.getElementById("chunking-strategy-select").value,
chunking_strategy_args: {},
css_selector: document.getElementById("css-selector").value,
screenshot: document.getElementById("screenshot-checkbox").checked,
// instruction: document.getElementById("instruction").value,
// semantic_filter: document.getElementById("semantic_filter").value,
verbose: true,
};
// import requests
// data = {
// "urls": [
// "https://www.nbcnews.com/business"
// ],
// "word_count_threshold": 10,
// "extraction_strategy": "NoExtractionStrategy",
// }
// response = requests.post("https://crawl4ai.com/crawl", json=data) # OR localhost if you run it locally
// print(response.json())
// save api token to local storage
localStorage.setItem("api_token", document.getElementById("token-input").value);
@@ -138,61 +124,25 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
document.getElementById("markdown-result").textContent = result.markdown;
document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
if (result.screenshot){
const imgElement = document.createElement("img");
// Set the src attribute with the base64 data
imgElement.src = `data:image/png;base64,${result.screenshot}`;
document.getElementById("screenshot-result").innerHTML = "";
document.getElementById("screenshot-result").appendChild(imgElement);
}
// Update code examples dynamically
const extractionStrategy = data.extraction_strategy;
const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
// REMOVE API TOKEN FROM CODE EXAMPLES
data.extraction_strategy_args.api_token = "your_api_token";
if (data.extraction_strategy === "NoExtractionStrategy") {
delete data.extraction_strategy_args;
delete data.extrac_blocks;
}
if (data.chunking_strategy === "RegexChunking") {
delete data.chunking_strategy_args;
}
delete data.verbose;
if (data.css_selector === "") {
delete data.css_selector;
}
if (!data.bypass_cache) {
delete data.bypass_cache;
}
if (!data.extract_blocks) {
delete data.extract_blocks;
}
if (!data.include_raw_html) {
delete data.include_raw_html;
}
document.getElementById(
"curl-code"
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
...data,
api_token: isLLMExtraction ? "your_api_token" : undefined,
}, null, 2)}' https://crawl4ai.com/crawl`;
}, null, 2)}' http://localhost:8000/crawl`;
document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
null,
2
)}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR localhost if you run it locally \nprint(response.json())`;
)}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data) # OR localhost if you run it locally \nprint(response.json())`;
document.getElementById(
"nodejs-code"
@@ -200,7 +150,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
null,
2
)};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR localhost if you run it locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
)};\n\naxios.post("http://localhost:8000/crawl", data) // OR localhost if you run it locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
document.getElementById(
"library-code"

View File

@@ -25,7 +25,7 @@
<header class="bg-zinc-950 text-lime-500 py-4 flex">
<div class="mx-auto px-4">
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.4</h1>
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.2</h1>
</div>
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
<span>📊 Total Website Processed</span>

View File

@@ -50,20 +50,6 @@ crawler.warmup()</code></pre>
<div>
<pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
</div>
<!-- Step 3.5 Screenshot -->
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
📸
<strong>Let's take a screenshot of the page!</strong>
</div>
<div>
<pre><code class="language-python">result = crawler.run(
url="https://www.nbcnews.com/business",
screenshot=True
)
with open("screenshot.png", "wb") as f:
f.write(base64.b64decode(result.screenshot))</code></pre>
</div>
<!-- Step 4 -->
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
@@ -153,13 +139,13 @@ with open("screenshot.png", "wb") as f:
</div>
<div class="">Using JavaScript to click 'Load More' button:</div>
<div>
<pre><code class="language-python">js_code = ["""
<pre><code class="language-python">js_code = """
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""]
crawler = WebCrawler(verbos=crawler_strategy, always_by_pass_cache=True)
result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)</code></pre>
<div class="">Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.</div>
"""
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
</div>
<!-- Conclusion -->

View File

@@ -1,4 +1,4 @@
<section class="try-it py-8 px-16 pb-20 bg-zinc-900 overflow-hidden">
<section class="try-it py-8 px-16 pb-20 bg-zinc-900">
<div class="container mx-auto ">
<h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
<div class="flex gap-4">
@@ -20,7 +20,6 @@
id="threshold"
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
>
<option value="1">1</option>
<option value="5">5</option>
<option value="10" selected>10</option>
<option value="15">15</option>
@@ -125,11 +124,7 @@
<label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
</div>
<div class="flex items-center gap-2">
<input type="checkbox" id="screenshot-checkbox" checked />
<label for="screenshot-checkbox" class="text-lime-500 font-bold">Screenshot</label>
</div>
<div class="flex items-center gap-2 hidden">
<input type="checkbox" id="extract-blocks-checkbox" />
<input type="checkbox" id="extract-blocks-checkbox" checked />
<label for="extract-blocks-checkbox" class="text-lime-500 font-bold">Extract Blocks</label>
</div>
<button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">Crawl</button>
@@ -139,7 +134,7 @@
<div id="loading" class="hidden">
<p class="text-white">Loading... Please wait.</p>
</div>
<div id="result" class="flex-1 overflow-x-auto">
<div id="result" class="flex-1">
<div class="tab-buttons flex gap-2">
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
JSON
@@ -153,23 +148,15 @@
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
Markdown
</button>
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
Medias
</button>
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="screenshot">
Screenshot
</button>
</div>
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
<pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
<pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
<pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
<pre class="hidden h-full flex"><code id="screenshot-result"></code></pre>
</div>
</div>
<div id="code_help" class="flex-1 overflow-x-auto">
<div id="code_help" class="flex-1">
<div class="tab-buttons flex gap-2">
<button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
cURL

View File

@@ -1,13 +0,0 @@
aiohttp
aiosqlite
bs4
fastapi
html2text
httpx
pydantic
python-dotenv
requests
rich
selenium
uvicorn
chromedriver-autoinstaller

View File

@@ -1,21 +1,20 @@
aiohttp
aiosqlite
bs4
fastapi
html2text
httpx
litellm
nltk
pydantic
python-dotenv
requests
rich
scikit-learn
selenium
uvicorn
transformers
chromedriver-autoinstaller
torch
onnxruntime
tokenizers
pillow
aiohttp==3.9.5
aiosqlite==0.20.0
bs4==0.0.2
fastapi==0.111.0
html2text==2024.2.26
httpx==0.27.0
litellm==1.37.11
nltk==3.8.1
pydantic==2.7.1
python-dotenv==1.0.1
requests==2.31.0
rich==13.7.1
scikit-learn==1.4.2
selenium==4.20.0
uvicorn==0.29.0
transformers==4.40.2
chromedriver-autoinstaller==0.6.4
torch==2.3.0
onnxruntime==1.14.1
tokenizers==0.13.2

View File

@@ -1,32 +1,17 @@
from setuptools import setup, find_packages
import os, sys
from pathlib import Path
import os
import subprocess
from setuptools.command.install import install
def get_home_folder():
home_folder = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(home_folder, exist_ok=True)
os.makedirs(f"{home_folder}/cache", exist_ok=True)
os.makedirs(f"{home_folder}/models", exist_ok=True)
return home_folder
home_folder = get_home_folder()
# Read the requirements from requirements.txt
with open("requirements.txt") as f:
requirements = f.read().splitlines()
# Read the requirements from requirements.txt
with open("requirements.crawl.txt") as f:
requirements_crawl_only = f.read().splitlines()
# Define the requirements for different environments
requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
requirements_crawl_only = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
class CustomInstallCommand(install):
"""Customized setuptools install command to install spacy without dependencies."""
@@ -36,7 +21,7 @@ class CustomInstallCommand(install):
setup(
name="Crawl4AI",
version="0.2.4",
version="0.2.2",
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
long_description=open("README.md").read(),
long_description_content_type="text/markdown",
@@ -49,7 +34,7 @@ setup(
extras_require={
"all": requirements, # Include all requirements
"colab": requirements_without_torch, # Exclude torch for Colab
"crawl": requirements_crawl_only, # Include only crawl requirements
"crawl": requirements_without_torch_transformers_nlkt
},
cmdclass={
'install': CustomInstallCommand,