chore: Add custom headers to LocalSeleniumCrawlerStrategy

chore: Add hooks for customizing the LocalSeleniumCrawlerStrategy
Fix typo in README
2024-06-17 15:50:03 +08:00 · 2024-06-17 15:37:18 +08:00 · 2024-06-17 15:15:37 +08:00 · 2024-06-17 14:47:58 +08:00 · 2024-06-17 14:44:01 +08:00 · 2024-06-10 23:03:32 +08:00
46 changed files with 63523 additions and 313 deletions
--- a/.files/screenshot.png
+++ b/.files/screenshot.png
--- a/.gitignore
+++ b/.gitignore
@@ -172,3 +172,10 @@ Crawl4AI.egg-info/

 requirements0.txt
 a.txt
+
+*.sh
+.idea
+docs/examples/.chainlit/
+docs/examples/.chainlit/*
+.chainlit/config.toml
+.chainlit/translations/en-US.json
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,31 +1,5 @@
 # Changelog

-All notable changes to this project will be documented in this file.
-
-## [Unreleased]
-
-### Added
- 🔧 Separate Crawl and Extract JSON Semantic Chunk: Enhancing flexibility and efficiency in large-scale web crawling tasks.
- 🔍 Colab Integration: Exploring integration with Google Colab for easy experimentation in a collaborative notebook environment.
- 🎯 XPath and CSS Selector Support: Adding support for selective retrieval of specific elements from web pages.
- 📷 Image Captioning: Incorporating image captioning capabilities to extract meaningful descriptions from images.
- 💾 Embedding Data Generation and Storage: Developing functionalities to generate and store embedding data for each crawled website.
- 🔍 Semantic Search Engine: Building a semantic search engine that fetches content, performs vector search similarity, and generates labeled chunk data based on user queries and URLs.
-
-### Changed
- None
-
-### Deprecated
- None
-
-### Removed
- None
-
+## [0.2.4] - 2024-06-17
 ### Fixed
- None
-
-### Security
- None
-
-## [1.0.0] - YYYY-MM-DD
- Initial release
+- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
--- a/87
+++ b/87
@@ -1,40 +1,77 @@
-# Use an official Python runtime as a parent image
-FROM python:3.10-slim
+
+# First stage: Build and install dependencies
+FROM python:3.10-slim-bookworm as builder

 # Set the working directory in the container
 WORKDIR /usr/src/app

-# Copy the current directory contents into the container at /usr/src/app
-COPY . .
-
-# Install any needed packages specified in requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Install dependencies for Chrome and ChromeDriver
-RUN apt-get update && apt-get install -y --no-install-recommends \
+# Install build dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
    wget \
-    xvfb \
-    unzip \
    curl \
+    unzip 
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \
+    python -m spacy download en_core_web_sm
+
+# Download and install ChromeDriver
+RUN CHROMEDRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE) && \
+    wget -N https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip -P /tmp && \
+    unzip /tmp/chromedriver_linux64.zip -d /tmp && \
+    mv /tmp/chromedriver /usr/local/bin/chromedriver && \
+    chmod +x /usr/local/bin/chromedriver && \
+    rm /tmp/chromedriver_linux64.zip
+
+# Second stage: Create final runtime image
+FROM python:3.10-slim-bookworm
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Install runtime dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    wget \
+    git \
+    xvfb \
    gnupg2 \
    ca-certificates \
    apt-transport-https \
-    software-properties-common \
-    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
-    && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
-    && apt-get update \
-    && apt-get install -y google-chrome-stable \
-    && rm -rf /var/lib/apt/lists/*
+    software-properties-common && \
+    wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - && \
+    echo "deb http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends google-chrome-stable && \
+    rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/google-chrome.list

-# Set display port and dbus env to avoid hanging
-ENV DISPLAY=:99
-ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
+# Copy Chromedriver from the builder stage
+COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver
+
+# Copy installed Python packages from builder stage
+COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+
+# Copy the rest of the application code
+COPY . .
+
+# Set environment to use Chrome and ChromeDriver properly
+ENV CHROME_BIN=/usr/bin/google-chrome \
+    CHROMEDRIVER=/usr/local/bin/chromedriver \
+    DISPLAY=:99 \
+    DBUS_SESSION_BUS_ADDRESS=/dev/null \
+    PYTHONUNBUFFERED=1
+
+# Ensure the PATH environment variable includes the location of the installed packages
+ENV PATH /usr/local/bin:$PATH   

 # Make port 80 available to the world outside this container
 EXPOSE 80

-# Define environment variable
-ENV PYTHONUNBUFFERED 1
-
 # Run uvicorn
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
+
+
--- a/45
+++ b/45
@@ -0,0 +1,45 @@
+# Use an official Python runtime as a parent image
+FROM python:3.10-slim
+# In case you had some weird issues, try this Image
+# FROM python:3.10-slim-bookworm as builder
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Copy the current directory contents into the container at /usr/src/app
+COPY . .
+
+# Install dependencies for Chrome and ChromeDriver
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    wget \
+    xvfb \
+    unzip \
+    curl \
+    gnupg2 \
+    ca-certificates \
+    apt-transport-https \
+    software-properties-common \
+    && mkdir -p /etc/apt/keyrings \
+    && curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
+    && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
+    && apt-get update \
+    && apt-get install -y google-chrome-stable \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get install -y chromium-chromedriver
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install spacy torch torchvision torchaudio
+
+# Set display port and dbus env to avoid hanging
+ENV DISPLAY=:99
+ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
+
+# Make port 80 available to the world outside this container
+EXPOSE 80
+
+# Define environment variable
+ENV PYTHONUNBUFFERED 1
+
+# Run uvicorn
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
--- a/44
+++ b/44
@@ -0,0 +1,44 @@
+# Use an official Python runtime as a parent image
+FROM python:3.10-slim
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Copy the current directory contents into the container at /usr/src/app
+COPY . .
+
+# Install any needed packages specified in requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Install dependencies for Chrome and ChromeDriver
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    wget \
+    xvfb \
+    unzip \
+    curl \
+    gnupg2 \
+    ca-certificates \
+    apt-transport-https \
+    software-properties-common \
+    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
+    && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
+    && apt-get update \
+    && apt-get install -y google-chrome-stable \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt install chromium-chromedriver -y
+
+# Install spacy library using pip
+RUN pip install spacy
+
+# Set display port and dbus env to avoid hanging
+ENV DISPLAY=:99
+ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
+
+# Make port 80 available to the world outside this container
+EXPOSE 80
+
+# Define environment variable
+ENV PYTHONUNBUFFERED 1
+
+# Run uvicorn
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Crawl4AI 🕷️🤖
+# Crawl4AI v0.2.3 🕷️🤖

 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
 [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
@@ -8,10 +8,27 @@

 Crawl4AI has one clear task: to simplify crawling and extract useful information from web pages, making it accessible for large language models (LLMs) and AI applications. 🆓🌐

-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
+- Use as REST API: Check  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
+- Use as Python library: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)

-## Recent Changes
+## Recent Changes 

+### v0.2.4
+- 🐞 Resolve the issue with the long url. (Issue #22)
+
+### v0.2.3
+- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
+- 🔗 Extract all external and internal links. Check `result.links`
+- 📚 Extract metadata from the page. Check `result.metadata`
+- 🕵️ Support `user_agent` parameter to set the user agent for the HTTP requests.
+- 🖼️ Take [screenshots](#taking-screenshots) of the page.
+
+### v0.2.2
+- Support multiple JS scripts
+- Fixed some bugs
+- Resolved a few issues related to Colab installation
+
+### v0.2.0
 - 🚀 10x faster!!
 - 📜 Execute custom JavaScript before crawling!
 - 🤝 Colab friendly!
@@ -22,17 +39,37 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information

 ## Power and Simplicity of Crawl4AI 🚀

-To show the simplicity take a look at the first example:
+The easiest way! If you don't want to install any library, you can use the REST API on my server. But remember, this is just a simple server; I may improve its capacity if I see there is demand. You can find all examples of the REST API in this Colab notebook. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
+
+```python
+import requests
+
+data = {
+  "urls": [
+    "https://www.nbcnews.com/business"
+  ],
+  "screenshot": True
+}
+
+response = requests.post("https://crawl4ai.com/crawl", json=data) # Or use your localhost URL if you run the server locally
+response_data = response.json()
+print(response_data['results'][0].keys())
+# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media', 
+# 'links', 'screenshot', 'markdown', 'extracted_content', 
+# 'metadata', 'error_message'])
+```
+
+If you need more control, take a look at the first example of using the Python library.

 ```python
 from crawl4ai import WebCrawler

 # Create the WebCrawler instance 
-crawler = WebCrawler()
+crawler = WebCrawler() 

 # Run the crawler with keyword filtering and CSS selector
 result = crawler.run(url="https://www.nbcnews.com/business")
-print(result) # {url, html, markdown, extracted_content, metadata}
+print(result) # {url, html, cleaned_html, markdown, media, links, extracted_content, metadata, screenshot}
 ```

 Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!
@@ -50,20 +87,17 @@ from crawl4ai.extraction_strategy import *
 from crawl4ai.crawler_strategy import *

 # Define the JavaScript code to click the "Load More" button
-js_code = """
+js_code = ["""
 const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
 loadMoreButton && loadMoreButton.click();
-"""
-
-# Define the crawling strategy
-crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-
-# Create the WebCrawler instance with the defined strategy
-crawler = WebCrawler(crawler_strategy=crawler_strategy)
+"""]

+crawler = WebCrawler(verbose=True)
+crawler.warmup()
 # Run the crawler with keyword filtering and CSS selector
 result = crawler.run(
    url="https://www.nbcnews.com/business",
+    js = js_code,
    extraction_strategy=CosineStrategy(
        semantic_filter="technology",
    ),
@@ -72,6 +106,7 @@ result = crawler.run(
 # Run the crawler with LLM extraction strategy
 result = crawler.run(
    url="https://www.nbcnews.com/business",
+    js = js_code,
    extraction_strategy=LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv('OPENAI_API_KEY'),
@@ -134,7 +169,7 @@ source venv/bin/activate
 pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
 ```

-    💡 Better to run the following CLI-command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once.
+💡 Better to run the following CLI-command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once.

    crawl4ai-download-models

@@ -149,21 +184,22 @@ pip install -e .[all]

 3. Use docker to run the local server:
 ```bash
-docker build -t crawl4ai .
 # For Mac users
 # docker build --platform linux/amd64 -t crawl4ai .
+# For other users
+# docker build -t crawl4ai .
 docker run -d -p 8000:80 crawl4ai
 ```

-For more information about how to run Crawl4AI as a local server, please refer to the [GitHub repository](https://github.com/unclecode/crawl4ai).
+

 ## Using the Local server ot REST API 🌐

-You can also use Crawl4AI through the REST API. This method allows you to send HTTP requests to the Crawl4AI server and receive structured data in response. The base URL for the API is `https://crawl4ai.com/crawl`. If you run the local server, you can use `http://localhost:8000/crawl`. (Port is dependent on your docker configuration)
+You can also use Crawl4AI through the REST API. This method allows you to send HTTP requests to the Crawl4AI server and receive structured data in response. The base URL for the API is `https://crawl4ai.com/crawl` (available now; it currently runs on a CPU server and would be faster on a GPU). If you run the local server, you can use `http://localhost:8000/crawl`. (Port is dependent on your docker configuration)

 ### Example Usage

-To use the REST API, send a POST request to `https://crawl4ai.com/crawl` with the following parameters in the request body.
+To use the REST API, send a POST request to `http://localhost:8000/crawl` with the following parameters in the request body.

 **Example Request:**
 ```json
@@ -198,14 +234,18 @@ To use the REST API, send a POST request to `https://crawl4ai.com/crawl` with th
            "url": "https://www.nbcnews.com/business",
            "extracted_content": "...",
            "html": "...",
+            "cleaned_html": "...",
            "markdown": "...",
-            "metadata": {...}
+            "media": {...},
+            "links": {...},
+            "metadata": {...},
+            "screenshot": "..."
        }
    ]
 }
 ```

-For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters) section.
+For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters-) section.


 ## Python Library Usage 🚀
@@ -238,6 +278,32 @@ Crawl result without raw HTML content:
 result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
 ```

+### Result Structure
+
+The result object contains the following fields:
+```python
+class CrawlResult(BaseModel):
+    url: str
+    html: str
+    success: bool
+    cleaned_html: Optional[str] = None
+    media: Dict[str, List[Dict]] = {} # Media tags in the page {"images": [], "audio": [], "video": []}
+    links: Dict[str, List[Dict]] = {} # Links in the page {"external": [], "internal": []}
+    screenshot: Optional[str] = None # Base64 encoded screenshot
+    markdown: Optional[str] = None
+    extracted_content: Optional[str] = None
+    metadata: Optional[dict] = None
+    error_message: Optional[str] = None
+```
+
+### Taking Screenshots
+
+```python
+result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
+with open("screenshot.png", "wb") as f:
+    f.write(base64.b64decode(result.screenshot))
+```
+
 ### Adding a chunking strategy: RegexChunking

 Using RegexChunking:
@@ -344,10 +410,12 @@ result = crawler.run(url="https://www.nbcnews.com/business")
 | `urls`                | A list of URLs to crawl and extract data from.                                                        | Yes      | -                   |
 | `include_raw_html`    | Whether to include the raw HTML content in the response.                                              | No       | `false`             |
 | `bypass_cache`        | Whether to force a fresh crawl even if the URL has been previously crawled.                           | No       | `false`             |
+| `screenshot`          | Whether to take screenshots of the page.                                                              | No       | `false`             |
 | `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5).    | No       | `5`                 |
 | `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy").                    | No       | `NoExtractionStrategy`    |
 | `chunking_strategy`   | The strategy to use for chunking the text before processing (e.g., "RegexChunking").                  | No       | `RegexChunking`     |
 | `css_selector`        | The CSS selector to target specific parts of the HTML for extraction.                                 | No       | `None`              |
+| `user_agent`          | The user agent to use for the HTTP requests.                                                          | No       | `Mozilla/5.0`       |
 | `verbose`             | Whether to enable verbose logging.                                                                    | No       | `true`              |

 ## Chunking Strategies 📚
--- a/crawl4ai/chunking_strategy.py
+++ b/crawl4ai/chunking_strategy.py
@@ -16,7 +16,7 @@ class ChunkingStrategy(ABC):
    
 # Regex-based chunking
 class RegexChunking(ChunkingStrategy):
-    def __init__(self, patterns=None):
+    def __init__(self, patterns=None, **kwargs):
        if patterns is None:
            patterns = [r'\n\n']  # Default split pattern
        self.patterns = patterns
@@ -32,7 +32,7 @@ class RegexChunking(ChunkingStrategy):
    
 # NLP-based sentence chunking 
 class NlpSentenceChunking(ChunkingStrategy):
-    def __init__(self):
+    def __init__(self, **kwargs):
        load_nltk_punkt()
        pass

@@ -52,7 +52,7 @@ class NlpSentenceChunking(ChunkingStrategy):
 # Topic-based segmentation using TextTiling
 class TopicSegmentationChunking(ChunkingStrategy):
    
-    def __init__(self, num_keywords=3):
+    def __init__(self, num_keywords=3, **kwargs):
        import nltk as nl
        self.tokenizer = nl.toknize.TextTilingTokenizer()
        self.num_keywords = num_keywords
@@ -82,7 +82,7 @@ class TopicSegmentationChunking(ChunkingStrategy):
    
 # Fixed-length word chunks
 class FixedLengthWordChunking(ChunkingStrategy):
-    def __init__(self, chunk_size=100):
+    def __init__(self, chunk_size=100, **kwargs):
        self.chunk_size = chunk_size

    def chunk(self, text: str) -> list:
@@ -91,7 +91,7 @@ class FixedLengthWordChunking(ChunkingStrategy):
    
 # Sliding window chunking
 class SlidingWindowChunking(ChunkingStrategy):
-    def __init__(self, window_size=100, step=50):
+    def __init__(self, window_size=100, step=50, **kwargs):
        self.window_size = window_size
        self.step = step

--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -6,16 +6,52 @@ from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import InvalidArgumentException
-
-from typing import List
+import logging
+import base64
+from PIL import Image, ImageDraw, ImageFont
+from io import BytesIO
+from typing import List, Callable
 import requests
 import os
 from pathlib import Path
+from .utils import wrap_text
+
+logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
+logger.setLevel(logging.WARNING)
+
+logger_driver = logging.getLogger('selenium.webdriver.common.service')
+logger_driver.setLevel(logging.WARNING)
+
+urllib3_logger = logging.getLogger('urllib3.connectionpool')
+urllib3_logger.setLevel(logging.WARNING)
+
+# Disable http.client logging
+http_client_logger = logging.getLogger('http.client')
+http_client_logger.setLevel(logging.WARNING)
+
+# Disable driver_finder and service logging
+driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finder')
+driver_finder_logger.setLevel(logging.WARNING)
+
+
+

 class CrawlerStrategy(ABC):
    @abstractmethod
    def crawl(self, url: str, **kwargs) -> str:
        pass
+    
+    @abstractmethod
+    def take_screenshot(self, save_path: str):
+        pass
+    
+    @abstractmethod
+    def update_user_agent(self, user_agent: str):
+        pass
+    
+    @abstractmethod
+    def set_hook(self, hook_type: str, hook: Callable):
+        pass

 class CloudCrawlerStrategy(CrawlerStrategy):
    def __init__(self, use_cached_html = False):
@@ -36,51 +72,125 @@ class CloudCrawlerStrategy(CrawlerStrategy):
        return html

 class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
-    def __init__(self, use_cached_html=False, js_code=None):
+    def __init__(self, use_cached_html=False, js_code=None, **kwargs):
        super().__init__()
        print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
        self.options = Options()
        self.options.headless = True
+        if kwargs.get("user_agent"):
+            self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
        self.options.add_argument("--no-sandbox")
-        self.options.add_argument("--disable-dev-shm-usage")
-        self.options.add_argument("--disable-gpu")
-        self.options.add_argument("--disable-extensions")
        self.options.add_argument("--headless")
+        # self.options.add_argument("--disable-dev-shm-usage")
+        self.options.add_argument("--disable-gpu")
+        # self.options.add_argument("--disable-extensions")
+        # self.options.add_argument("--disable-infobars")
+        # self.options.add_argument("--disable-logging")
+        # self.options.add_argument("--disable-popup-blocking")
+        # self.options.add_argument("--disable-translate")
+        # self.options.add_argument("--disable-default-apps")
+        # self.options.add_argument("--disable-background-networking")
+        # self.options.add_argument("--disable-sync")
+        # self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
+        # self.options.add_argument("--disable-browser-side-navigation")
+        # self.options.add_argument("--dns-prefetch-disable")
+        # self.options.add_argument("--disable-web-security")
+        self.options.add_argument("--log-level=3")
+        self.use_cached_html = use_cached_html
        self.use_cached_html = use_cached_html
        self.js_code = js_code
+        self.verbose = kwargs.get("verbose", False)
+        
+        # Hooks
+        self.hooks = {
+            'on_driver_created': None,
+            'before_get_url': None,
+            'after_get_url': None,
+            'before_return_html': None
+        }

        # chromedriver_autoinstaller.install()
        import chromedriver_autoinstaller
        self.service = Service(chromedriver_autoinstaller.install())
+        self.service.log_path = "NUL"
        self.driver = webdriver.Chrome(service=self.service, options=self.options)

+    def set_hook(self, hook_type: str, hook: Callable):
+        if hook_type in self.hooks:
+            self.hooks[hook_type] = hook
+        else:
+            raise ValueError(f"Invalid hook type: {hook_type}")
+    
+    def execute_hook(self, hook_type: str, *args):
+        hook = self.hooks.get(hook_type)
+        if hook:
+            result = hook(*args)
+            if result is not None:
+                if isinstance(result, webdriver.Chrome):
+                    return result
+                else:
+                    raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
+        # If the hook returns None or there is no hook, return self.driver
+        return self.driver
+
+    def update_user_agent(self, user_agent: str):
+        self.options.add_argument(f"user-agent={user_agent}")
+        self.driver.quit()
+        self.driver = webdriver.Chrome(service=self.service, options=self.options)
+        self.driver = self.execute_hook('on_driver_created', self.driver)
+
+    def set_custom_headers(self, headers: dict):
+        # Enable Network domain for sending headers
+        self.driver.execute_cdp_cmd('Network.enable', {})
+        # Set extra HTTP headers
+        self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
+
+
    def crawl(self, url: str) -> str:
+        # Create md5 hash of the URL
+        import hashlib
+        url_hash = hashlib.md5(url.encode()).hexdigest()
+        
        if self.use_cached_html:
-            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
+            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
            if os.path.exists(cache_file_path):
                with open(cache_file_path, "r") as f:
                    return f.read()

        try:
+            self.driver = self.execute_hook('before_get_url', self.driver)
+            if self.verbose:
+                print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
            self.driver.get(url)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
            )
+            self.driver = self.execute_hook('after_get_url', self.driver)
            
            # Execute JS code if provided
-            if self.js_code:
+            if self.js_code and type(self.js_code) == str:
                self.driver.execute_script(self.js_code)
                # Optionally, wait for some condition after executing the JS code
                WebDriverWait(self.driver, 10).until(
                    lambda driver: driver.execute_script("return document.readyState") == "complete"
                )
+            elif self.js_code and type(self.js_code) == list:
+                for js in self.js_code:
+                    self.driver.execute_script(js)
+                    WebDriverWait(self.driver, 10).until(
+                        lambda driver: driver.execute_script("return document.readyState") == "complete"
+                    )
            
            html = self.driver.page_source
+            self.driver = self.execute_hook('before_return_html', self.driver, html)
            
            # Store in cache
-            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
+            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
            with open(cache_file_path, "w") as f:
                f.write(html)
+                
+            if self.verbose:
+                print(f"[LOG] ✅ Crawled {url} successfully!")
            
            return html
        except InvalidArgumentException:
@@ -88,5 +198,72 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        except Exception as e:
            raise Exception(f"Failed to crawl {url}: {str(e)}")

+    def take_screenshot(self) -> str:
+        """Capture the current page and return it as a base64-encoded JPEG string.
+
+        The window is first resized to the page's scroll width and height. If any
+        step fails, the error is printed and a black 800x600 JPEG containing the
+        error text is returned in place of the screenshot.
+        """
+        try:
+            # Get the dimensions of the page
+            total_width = self.driver.execute_script("return document.body.scrollWidth")
+            total_height = self.driver.execute_script("return document.body.scrollHeight")
+
+            # Set the window size to the dimensions of the page
+            self.driver.set_window_size(total_width, total_height)
+
+            # Take screenshot
+            screenshot = self.driver.get_screenshot_as_png()
+
+            # Open the screenshot with PIL; JPEG has no alpha channel, so an RGBA
+            # PNG must be converted to RGB or image.save() below raises OSError
+            image = Image.open(BytesIO(screenshot)).convert("RGB")
+
+            # Convert to JPEG and compress
+            buffered = BytesIO()
+            image.save(buffered, format="JPEG", quality=85)
+            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+
+            if self.verbose:
+                print(f"[LOG] 📸 Screenshot taken and converted to base64")
+
+            return img_base64
+
+        except Exception as e:
+            error_message = f"Failed to take screenshot: {str(e)}"
+            print(error_message)
+
+            # Generate an image with black background
+            img = Image.new('RGB', (800, 600), color='black')
+            draw = ImageDraw.Draw(img)
+            
+            # Load a font
+            try:
+                font = ImageFont.truetype("arial.ttf", 40)
+            except IOError:
+                try:
+                    font = ImageFont.load_default(size=40)
+                except TypeError:  # Pillow < 10.1: load_default() takes no size argument
+                    font = ImageFont.load_default()
+
+            # Define text color and wrap the text
+            text_color = (255, 255, 255)
+            max_width = 780
+            wrapped_text = wrap_text(draw, error_message, font, max_width)
+
+            # Calculate text position
+            text_position = (10, 10)
+            
+            # Draw the text on the image
+            draw.text(text_position, wrapped_text, fill=text_color, font=font)
+            
+            # Convert to base64
+            buffered = BytesIO()
+            img.save(buffered, format="JPEG")
+            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+
+            return img_base64
+
    def quit(self):
        self.driver.quit()
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -1,13 +1,12 @@
 import os
 from pathlib import Path
 import sqlite3
-from typing import Optional
 from typing import Optional, Tuple

 DB_PATH = os.path.join(Path.home(), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
-        
+
 def init_db():
+    # Runs the table DDL below against DB_PATH. Column names here must match the ones
+    # queried by get_cached_url() and written by cache_url() (media, links, metadata, screenshot).
    global DB_PATH
    conn = sqlite3.connect(DB_PATH)
@@ -19,22 +18,37 @@ def init_db():
            cleaned_html TEXT,
            markdown TEXT,
            extracted_content TEXT,
-            success BOOLEAN
+            success BOOLEAN,
+            media TEXT DEFAULT "{}",
+            links TEXT DEFAULT "{}",
+            metadata TEXT DEFAULT "{}",
+            screenshot TEXT DEFAULT ""
        )
    ''')
    conn.commit()
    conn.close()

-def check_db_path():
-    if not DB_PATH:
-        raise ValueError("Database path is not set or is empty.")
-
-def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
+def alter_db_add_screenshot(new_column: str = "media"):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
-        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,))
+        cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
+        conn.commit()
+        conn.close()
+    except Exception as e:
+        print(f"Error altering database to add screenshot column: {e}")
+
+def check_db_path():
+    if not DB_PATH:
+        raise ValueError("Database path is not set or is empty.")
+
+def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
+    check_db_path()
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        cursor = conn.cursor()
+        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
        result = cursor.fetchone()
        conn.close()
        return result
@@ -42,21 +56,25 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
        print(f"Error retrieving cached URL: {e}")
        return None

-def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
+def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('''
-            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
-            VALUES (?, ?, ?, ?, ?, ?)
+            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
                markdown = excluded.markdown,
                extracted_content = excluded.extracted_content,
-                success = excluded.success
-        ''', (url, html, cleaned_html, markdown, extracted_content, success))
+                success = excluded.success,
+                media = excluded.media,      
+                links = excluded.links,    
+                metadata = excluded.metadata,      
+                screenshot = excluded.screenshot
+        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
        conn.commit()
        conn.close()
    except Exception as e:
@@ -95,4 +113,20 @@ def flush_db():
        conn.commit()
        conn.close()
    except Exception as e:
-        print(f"Error flushing database: {e}")
+        print(f"Error flushing database: {e}")
+
+def update_existing_records(new_column: str = "media", default_value: str = "{}"):
+    """
+    Backfill ``new_column`` in ``crawled_data`` for rows that have no value yet.
+
+    A row counts as unset when the column is NULL or an empty string; the empty
+    string is what ``alter_db_add_screenshot`` installs as the column default.
+
+    :param new_column: Name of the column to backfill. It is interpolated into the
+        SQL text (identifiers cannot be bound), so it must come from trusted code.
+    :param default_value: Value written into unset rows; passed as a bound parameter.
+    :return: None. Errors are printed, not raised.
+    """
+    check_db_path()
+    conn = None
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        cursor = conn.cursor()
+        # Filter on the column being updated (not a hard-coded one) and bind the value
+        # instead of splicing it into the statement.
+        cursor.execute(
+            f"UPDATE crawled_data SET {new_column} = ? "
+            f"WHERE {new_column} IS NULL OR {new_column} = ''",
+            (default_value,),
+        )
+        conn.commit()
+    except Exception as e:
+        print(f"Error updating existing records: {e}")
+    finally:
+        # Release the connection even when the UPDATE fails.
+        if conn is not None:
+            conn.close()
+
+if __name__ == "__main__":
+    # One-off manual migration, executed only when this module is run as a script.
+    init_db()  # Initialize the database if not already initialized
+    alter_db_add_screenshot("metadata")  # Despite the function's name, this adds the column passed in -- here "metadata"
+    update_existing_records("metadata")  # Backfill "metadata" using update_existing_records' default value ("{}")
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -46,6 +46,7 @@ class ExtractionStrategy(ABC):
            for future in as_completed(futures):
                extracted_content.extend(future.result())
        return extracted_content    
+    
 class NoExtractionStrategy(ExtractionStrategy):
    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        return [{"index": 0, "content": html}]
@@ -141,7 +142,8 @@ class LLMExtractionStrategy(ExtractionStrategy):
        if self.provider.startswith("groq/"):
            # Sequential processing with a delay
            for ix, section in enumerate(merged_sections):
-                extracted_content.extend(self.extract(ix, url, section))
+                extract_func = partial(self.extract, url)
+                extracted_content.extend(extract_func(ix, section))
                time.sleep(0.5)  # 500 ms delay between each processing
        else:
            # Parallel processing using ThreadPoolExecutor
@@ -156,7 +158,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
        return extracted_content        
  
 class CosineStrategy(ExtractionStrategy):
-    def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'BAAI/bge-small-en-v1.5', **kwargs):
+    def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
        """
        Initialize the strategy with clustering parameters.

@@ -173,48 +175,97 @@ class CosineStrategy(ExtractionStrategy):
        self.max_dist = max_dist
        self.linkage_method = linkage_method
        self.top_k = top_k
+        self.sim_threshold = sim_threshold
        self.timer = time.time()
        self.verbose = kwargs.get("verbose", False)
        
        self.buffer_embeddings = np.array([])
+        self.get_embedding_method = "direct"
+        
+        self.device = get_device()
+        self.default_batch_size = calculate_batch_size(self.device)

-        if model_name == "bert-base-uncased":
-            self.tokenizer, self.model = load_bert_base_uncased()
-        elif model_name == "BAAI/bge-small-en-v1.5":
-            self.tokenizer, self.model = load_bge_small_en_v1_5()
+        if self.verbose:
+            print(f"[LOG] Loading Extraction Model for {self.device.type} device.")

-        self.nlp = load_text_multilabel_classifier()
+        # if False and self.device.type == "cpu":
+        #     self.model = load_onnx_all_MiniLM_l6_v2()
+        #     self.tokenizer = self.model.tokenizer
+        #     self.get_embedding_method = "direct"
+        # else:
+
+        self.tokenizer, self.model = load_bge_small_en_v1_5()
+        self.model.eval()  
+        self.get_embedding_method = "batch"
+        
+        self.buffer_embeddings = np.array([])
+
+        # if model_name == "bert-base-uncased":
+        #     self.tokenizer, self.model = load_bert_base_uncased()
+        #     self.model.eval()  # Ensure the model is in evaluation mode
+        #     self.get_embedding_method = "batch"
+        # elif model_name == "BAAI/bge-small-en-v1.5":
+        #     self.tokenizer, self.model = load_bge_small_en_v1_5()
+        #     self.model.eval()  # Ensure the model is in evaluation mode
+        #     self.get_embedding_method = "batch"
+        # elif model_name == "sentence-transformers/all-MiniLM-L6-v2":
+        #     self.model = load_onnx_all_MiniLM_l6_v2()
+        #     self.tokenizer = self.model.tokenizer
+        #     self.get_embedding_method = "direct"
+       
+        
+        if self.verbose:
+            print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")
+            
+        self.nlp, self.device = load_text_multilabel_classifier()
+        # self.default_batch_size = 16 if self.device.type == 'cpu' else 64
        
        if self.verbose:
            print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")

-    def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, threshold: float = 0.5) -> List[str]:
+    def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]:
        """
-        Filter documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
+        Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.

        :param documents: List of text chunks (documents).
        :param semantic_filter: A string containing the keywords for filtering.
        :param threshold: Cosine similarity threshold for filtering documents.
-        :return: Filtered list of documents.
+        :param at_least_k: Minimum number of documents to return.
+        :return: List of filtered documents, ensuring at least `at_least_k` documents.
        """
-        from sklearn.metrics.pairwise import cosine_similarity
+        
        if not semantic_filter:
            return documents
+        
+        if len(documents) < at_least_k:
+            at_least_k = len(documents) // 2
+        
+        from sklearn.metrics.pairwise import cosine_similarity
+        
        # Compute embedding for the keyword filter
        query_embedding = self.get_embeddings([semantic_filter])[0]
        
-        # Compute embeddings for the docu  ments
+        # Compute embeddings for the documents
        document_embeddings = self.get_embeddings(documents)
        
        # Calculate cosine similarity between the query embedding and document embeddings
        similarities = cosine_similarity([query_embedding], document_embeddings).flatten()
        
        # Filter documents based on the similarity threshold
-        filtered_docs = [doc for doc, sim in zip(documents, similarities) if sim >= threshold]
+        filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold]
        
-        return filtered_docs
-
-    def get_embeddings(self, sentences: List[str], bypass_buffer=True):
+        # If the number of filtered documents is less than at_least_k, sort remaining documents by similarity
+        if len(filtered_docs) < at_least_k:
+            remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold]
+            remaining_docs.sort(key=lambda x: x[1], reverse=True)
+            filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)])
+        
+        # Extract the document texts from the tuples
+        filtered_docs = [doc for doc, _ in filtered_docs]
+        
+        return filtered_docs[:at_least_k]
+    
+    def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False):
        """
        Get BERT embeddings for a list of sentences.

@@ -224,19 +275,42 @@ class CosineStrategy(ExtractionStrategy):
        # if self.buffer_embeddings.any() and not bypass_buffer:
        #     return self.buffer_embeddings
        
-        import torch 
-        # Tokenize sentences and convert to tensor
-        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
-        # Compute token embeddings
-        with torch.no_grad():
-            model_output = self.model(**encoded_input)
+        if self.device.type in [ "cpu", "gpu", "cuda", "mps"]:
+            import torch 
+            # Tokenize sentences and convert to tensor
+            if batch_size is None:
+                batch_size = self.default_batch_size
+                        
+            all_embeddings = []
+            for i in range(0, len(sentences), batch_size):
+                batch_sentences = sentences[i:i + batch_size]
+                encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
+                encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}
+                
+                # Ensure no gradients are calculated
+                with torch.no_grad():
+                    model_output = self.model(**encoded_input)
+                
+                # Get embeddings from the last hidden state (mean pooling)
+                embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
+                all_embeddings.append(embeddings)
            
-        # Get embeddings from the last hidden state (mean pooling)
-        embeddings = model_output.last_hidden_state.mean(1)
-        self.buffer_embeddings = embeddings.numpy()
-        return embeddings.numpy()
+            self.buffer_embeddings = np.vstack(all_embeddings)
+        elif self.device.type == "cpu":      
+            # self.buffer_embeddings = self.model(sentences)
+            if batch_size is None:
+                batch_size = self.default_batch_size
+                
+            all_embeddings = []
+            for i in range(0, len(sentences), batch_size):
+                batch_sentences = sentences[i:i + batch_size]
+                embeddings = self.model(batch_sentences)
+                all_embeddings.append(embeddings)
+                
+            self.buffer_embeddings = np.vstack(all_embeddings)
+        return self.buffer_embeddings

-    def hierarchical_clustering(self, sentences: List[str]):
+    def hierarchical_clustering(self, sentences: List[str], embeddings = None):
        """
        Perform hierarchical clustering on sentences and return cluster labels.

@@ -247,7 +321,7 @@ class CosineStrategy(ExtractionStrategy):
        from scipy.cluster.hierarchy import linkage, fcluster
        from scipy.spatial.distance import pdist
        self.timer = time.time()
-        embeddings = self.get_embeddings(sentences, bypass_buffer=False)
+        embeddings = self.get_embeddings(sentences, bypass_buffer=True)
        # print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds")
        # Compute pairwise cosine distances
        distance_matrix = pdist(embeddings, 'cosine')
@@ -311,20 +385,33 @@ class CosineStrategy(ExtractionStrategy):
        # Convert filtered clusters to a sorted list of dictionaries
        cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
        
-        labels = self.nlp([cluster['content'] for cluster in cluster_list])
+        if self.verbose:
+            print(f"[LOG] 🚀 Assign tags using {self.device}")
        
-        for cluster, label in zip(cluster_list, labels):
-            cluster['tags'] = label
+        if self.device.type in ["gpu", "cuda", "mps"]:
+            labels = self.nlp([cluster['content'] for cluster in cluster_list])
+            
+            for cluster, label in zip(cluster_list, labels):
+                cluster['tags'] = label
+        elif self.device == "cpu":
+            # Process the text with the loaded model
+            texts = [cluster['content'] for cluster in cluster_list]
+            # Batch process texts
+            docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])

-        # Process the text with the loaded model
-        # for cluster in  cluster_list:
-        #     cluster['tags'] = self.nlp(cluster['content'])[0]['label']
-            # doc = self.nlp(cluster['content'])
-            # tok_k = self.top_k
-            # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
-            # cluster['tags'] = [cat for cat, _ in top_categories]
+            for doc, cluster in zip(docs, cluster_list):
+                tok_k = self.top_k
+                top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
+                cluster['tags'] = [cat for cat, _ in top_categories]
+                            
+            # for cluster in  cluster_list:
+            #     doc = self.nlp(cluster['content'])
+            #     tok_k = self.top_k
+            #     top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
+            #     cluster['tags'] = [cat for cat, _ in top_categories]
        
-        # print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
+        if self.verbose:
+            print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
        
        return cluster_list

@@ -463,4 +550,4 @@ class ContentSummarizationStrategy(ExtractionStrategy):

        # Sort summaries by the original section index to maintain order
        summaries.sort(key=lambda x: x[0])
-        return [summary for _, summary in summaries]
+        return [summary for _, summary in summaries]
--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -2,9 +2,58 @@ from functools import lru_cache
 from pathlib import Path
 import subprocess, os
 import shutil
+import tarfile
 from crawl4ai.config import MODEL_REPO_BRANCH
 import argparse
+import urllib.request
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

+@lru_cache()
+def get_available_memory(device):
+    """
+    Return a memory figure, in bytes, used to size batches for ``device``.
+
+    :param device: Object exposing a ``type`` attribute (presumably a ``torch.device`` -- confirm against callers).
+    :return: The CUDA device's ``total_memory`` for ``cuda``, a fixed 48 GiB figure for ``mps``, and 0 otherwise.
+    """
+    import torch
+    if device.type == 'cuda':
+        # This is the device's total memory, not the amount currently free.
+        return torch.cuda.get_device_properties(device).total_memory
+    elif device.type == 'mps':      
+        return 48 * 1024 ** 3  # Hard-coded 48 GiB assumption for MPS; nothing is queried from the system
+    else:
+        return 0
+
+@lru_cache()
+def calculate_batch_size(device):
+    """
+    Choose an embedding batch size for ``device`` from its reported memory.
+
+    :param device: Object exposing a ``type`` attribute ('cpu', 'cuda', 'mps', ...).
+    :return: 16 for CPU and unrecognised device types; 32, 64, 128 or 256 for
+        'cuda'/'mps' depending on the value returned by get_available_memory().
+    """
+    gib = 1024 ** 3
+    memory_bytes = get_available_memory(device)
+
+    if device.type not in ('cuda', 'mps'):
+        # CPU and any unknown device type share the same conservative default.
+        return 16
+
+    # (minimum memory in bytes, batch size), checked from the largest tier down:
+    # >= 31 GiB -> 256, >= 15 GiB -> 128, >= 8 GiB -> 64, anything smaller -> 32.
+    tiers = ((31 * gib, 256), (15 * gib, 128), (8 * gib, 64))
+    for minimum, size in tiers:
+        if memory_bytes >= minimum:
+            return size
+    return 32
+    
+@lru_cache()
+def get_device():
+    """
+    Select the torch device to run models on.
+
+    :return: ``torch.device('cuda')`` when CUDA is available, otherwise
+        ``torch.device('mps')`` when the MPS backend is available, otherwise
+        ``torch.device('cpu')``.
+    """
+    import torch
+    if torch.cuda.is_available():
+        return torch.device('cuda')
+    if torch.backends.mps.is_available():
+        return torch.device('mps')
+    return torch.device('cpu')
+    
+def set_model_device(model):
+    """
+    Move ``model`` onto the device picked by get_device().
+
+    :param model: Object with a ``to(device)`` method.
+    :return: Tuple ``(model, device)`` -- the same model object that was passed in, and the device used.
+    """
+    target = get_device()
+    model.to(target)
+    return model, target
+
+@lru_cache()
 def get_home_folder():
    home_folder = os.path.join(Path.home(), ".crawl4ai")
    os.makedirs(home_folder, exist_ok=True)
@@ -17,6 +66,8 @@ def load_bert_base_uncased():
    from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
    model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
+    model.eval()
+    model, device = set_model_device(model)
    return tokenizer, model

@lru_cache()
@@ -25,17 +76,62 @@ def load_bge_small_en_v1_5():
    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
    model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
    model.eval()
+    model, device = set_model_device(model)
    return tokenizer, model

+@lru_cache()
+def load_onnx_all_MiniLM_l6_v2():
+    """
+    Fetch (when needed), unpack and load the ONNX all-MiniLM-L6-v2 embedding model.
+
+    The archive is downloaded to ``models/onnx.tar.gz`` next to this module, extracted
+    into ``models/`` and then deleted. Nothing is downloaded when ``models/onnx``
+    already exists.
+
+    :return: A ``DefaultEmbeddingModel`` instance.
+    """
+    from crawl4ai.onnx_embedding import DefaultEmbeddingModel
+
+    model_path = "models/onnx.tar.gz"
+    model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/onnx.tar.gz"
+    __location__ = os.path.realpath(
+        os.path.join(os.getcwd(), os.path.dirname(__file__)))
+    download_path = os.path.join(__location__, model_path)
+    models_dir = os.path.join(__location__, "models")
+    onnx_dir = os.path.join(__location__, "models/onnx")
+
+    def download_with_progress(url, filename):
+        def reporthook(block_num, block_size, total_size):
+            downloaded = block_num * block_size
+            if total_size <= 0:
+                # The server did not report a size, so no percentage can be shown.
+                print(f"\rDownloading: {downloaded / (1024 * 1024):.2f} MB", end='')
+            elif downloaded < total_size:
+                percentage = 100 * downloaded / total_size
+                print(f"\rDownloading: {percentage:.2f}% ({downloaded / (1024 * 1024):.2f} MB of {total_size / (1024 * 1024):.2f} MB)", end='')
+
+        # Download to a temporary name so an interrupted transfer is never
+        # mistaken for a complete archive on the next run.
+        partial_path = filename + ".part"
+        urllib.request.urlretrieve(url, partial_path, reporthook)
+        os.replace(partial_path, filename)
+        print("\rDownload complete!")
+
+    # Create the models directory if it does not exist
+    os.makedirs(os.path.dirname(download_path), exist_ok=True)
+
+    # Only touch the network when the extracted model is missing; the archive is
+    # deleted after extraction, so its absence alone does not mean "not installed".
+    if not os.path.exists(onnx_dir):
+        if not os.path.exists(download_path):
+            download_with_progress(model_url, download_path)
+
+        with tarfile.open(download_path, "r:gz") as tar:
+            # The archive comes from the network: use the "data" extraction filter
+            # where the running Python provides it, to reject members that would
+            # escape the target directory.
+            if hasattr(tarfile, "data_filter"):
+                tar.extractall(path=models_dir, filter="data")
+            else:
+                tar.extractall(path=models_dir)
+
+        # remove the tar.gz file
+        os.remove(download_path)
+
+    model = DefaultEmbeddingModel()
+    return model
+
@lru_cache()
 def load_text_classifier():
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from transformers import pipeline
+    import torch

    tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
    model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
+    model.eval()
+    model, device = set_model_device(model)
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
-
    return pipe

@lru_cache()
@@ -45,20 +141,21 @@ def load_text_multilabel_classifier():
    from scipy.special import expit
    import torch

-    MODEL = "cardiffnlp/tweet-topic-21-multi"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
-    model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
-    class_mapping = model.config.id2label
-
    # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
-        device = torch.device("cpu")
+        return load_spacy_model(), torch.device("cpu")

-    model.to(device)
+
+    MODEL = "cardiffnlp/tweet-topic-21-multi"
+    tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
+    model.eval()
+    model, device = set_model_device(model)
+    class_mapping = model.config.id2label

    def _classifier(texts, threshold=0.5, max_length=64):
        tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
@@ -78,7 +175,7 @@ def load_text_multilabel_classifier():

        return batch_labels

-    return _classifier
+    return _classifier, device

@lru_cache()
 def load_nltk_punkt():
@@ -89,6 +186,58 @@ def load_nltk_punkt():
        nltk.download('punkt')
    return nltk.data.find('tokenizers/punkt')

+
+@lru_cache()
+def load_spacy_model():
+    """
+    Load the spaCy model stored under ``<home folder>/models/reuters``.
+
+    When that folder is missing or empty, the project repository is cloned at
+    ``MODEL_REPO_BRANCH``, its ``models/reuters`` folder is copied into place and
+    the clone is removed. Download errors are printed; ``spacy.load`` is still
+    attempted afterwards and will raise if the model could not be installed.
+
+    :return: The object returned by ``spacy.load`` for the model folder.
+    """
+    import spacy
+    name = "models/reuters"
+    home_folder = get_home_folder()
+    model_folder = os.path.join(home_folder, name)
+
+    # Download only when the model directory is missing or empty
+    if not (Path(model_folder).exists() and any(Path(model_folder).iterdir())):
+        repo_url = "https://github.com/unclecode/crawl4ai.git"
+        branch = MODEL_REPO_BRANCH
+        repo_folder = os.path.join(home_folder, "crawl4ai")
+
+        # print("[LOG] ⏬ Downloading Spacy model for the first time...")
+
+        # Clear leftovers independently: a stale clone may exist without a model
+        # folder, and an empty model folder may exist without a clone. Each would
+        # otherwise break the rmtree/copytree calls.
+        for stale_path in (repo_folder, model_folder):
+            if Path(stale_path).exists():
+                shutil.rmtree(stale_path)
+
+        try:
+            # Clone the repository
+            subprocess.run(
+                ["git", "clone", "-b", branch, repo_url, repo_folder],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                check=True
+            )
+
+            # Create the models directory if it doesn't exist
+            models_folder = os.path.join(home_folder, "models")
+            os.makedirs(models_folder, exist_ok=True)
+
+            # Copy the reuters model folder to the models directory
+            source_folder = os.path.join(repo_folder, "models/reuters")
+            shutil.copytree(source_folder, model_folder)
+
+            # Remove the cloned repository
+            shutil.rmtree(repo_folder)
+
+            # print("[LOG] ✅ Spacy Model downloaded successfully")
+        except subprocess.CalledProcessError as e:
+            print(f"An error occurred while cloning the repository: {e}")
+        except Exception as e:
+            print(f"An error occurred: {e}")
+
+    return spacy.load(model_folder)
+
 def download_all_models(remove_existing=False):
    """Download all models required for Crawl4AI."""
    if remove_existing:
@@ -104,12 +253,15 @@ def download_all_models(remove_existing=False):
        print("[LOG] Existing models removed.")

    # Load each model to trigger download
-    print("[LOG] Downloading BERT Base Uncased...")
-    load_bert_base_uncased()
-    print("[LOG] Downloading BGE Small EN v1.5...")
-    load_bge_small_en_v1_5()
+    # print("[LOG] Downloading BERT Base Uncased...")
+    # load_bert_base_uncased()
+    # print("[LOG] Downloading BGE Small EN v1.5...")
+    # load_bge_small_en_v1_5()
+    # print("[LOG] Downloading ONNX model...")
+    # load_onnx_all_MiniLM_l6_v2()
    print("[LOG] Downloading text classifier...")
-    load_text_multilabel_classifier
+    _, device = load_text_multilabel_classifier()
+    print(f"[LOG] Text classifier loaded on {device}")
    print("[LOG] Downloading custom NLTK Punkt model...")
    load_nltk_punkt()
    print("[LOG] ✅ All models downloaded successfully.")
@@ -124,4 +276,4 @@ def main():
    download_all_models(remove_existing=args.remove_existing)

 if __name__ == "__main__":
-    main()
+    main()
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -1,5 +1,5 @@
 from pydantic import BaseModel, HttpUrl
-from typing import List
+from typing import List, Dict, Optional

 class UrlModel(BaseModel):
    url: HttpUrl
@@ -9,8 +9,11 @@ class CrawlResult(BaseModel):
    url: str
    html: str
    success: bool
-    cleaned_html: str = None
-    markdown: str = None
-    extracted_content: str = None
-    metadata: dict = None
-    error_message: str = None
+    cleaned_html: Optional[str] = None
+    media: Dict[str, List[Dict]] = {}
+    links: Dict[str, List[Dict]] = {}
+    screenshot: Optional[str] = None
+    markdown: Optional[str] = None
+    extracted_content: Optional[str] = None
+    metadata: Optional[dict] = None
+    error_message: Optional[str] = None
--- a/crawl4ai/models/onnx/config.json
+++ b/crawl4ai/models/onnx/config.json
@@ -0,0 +1,25 @@
+{
+  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.27.4",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}
--- a/crawl4ai/models/onnx/model.onnx
+++ b/crawl4ai/models/onnx/model.onnx
--- a/crawl4ai/models/onnx/special_tokens_map.json
+++ b/crawl4ai/models/onnx/special_tokens_map.json
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
--- a/crawl4ai/models/onnx/tokenizer.json
+++ b/crawl4ai/models/onnx/tokenizer.json
--- a/crawl4ai/models/onnx/tokenizer_config.json
+++ b/crawl4ai/models/onnx/tokenizer_config.json
@@ -0,0 +1,15 @@
+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": "/Users/hammad/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/7dbbc90392e2f80f3d3c277d6e90027e55de9125/special_tokens_map.json",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
--- a/crawl4ai/models/onnx/vocab.txt
+++ b/crawl4ai/models/onnx/vocab.txt
--- a/crawl4ai/onnx_embedding.py
+++ b/crawl4ai/onnx_embedding.py
@@ -0,0 +1,50 @@
+# A dependency-light way to run the onnx model
+
+
+import numpy as np
+from typing import List
+import os
+
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
+
+def normalize(v):
+    """
+    Scale every row of a 2-D array to unit L2 length.
+
+    :param v: 2-D numpy array, one vector per row.
+    :return: Array of the same shape; rows whose norm is zero are divided by 1e-12 instead, so they stay zero.
+    """
+    lengths = np.linalg.norm(v, axis=1, keepdims=True)
+    # Replace zero lengths so the division below never divides by zero.
+    lengths[lengths == 0] = 1e-12
+    return v / lengths
+
+# Sample implementation of the default sentence-transformers model using ONNX
+class DefaultEmbeddingModel():
+    """
+    Sentence embedder backed by the ONNX model files in ``models/onnx`` next to this module.
+
+    Calling an instance with a list of strings returns one L2-normalised
+    float32 vector per string.
+    """
+
+    def __init__(self):
+        from tokenizers import Tokenizer
+        import onnxruntime as ort
+        # max_seq_length = 256, for some reason sentence-transformers uses 256 even though the HF config has a max length of 128
+        # https://github.com/UKPLab/sentence-transformers/blob/3e1929fddef16df94f8bc6e3b10598a98f46e62d/docs/_static/html/models_en_sentence_embeddings.html#LL480
+        tokenizer_file = os.path.join(__location__, "models/onnx/tokenizer.json")
+        self.tokenizer = Tokenizer.from_file(tokenizer_file)
+        # Every encoding is truncated and padded to exactly 256 tokens.
+        self.tokenizer.enable_truncation(max_length=256)
+        self.tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=256)
+        model_file = os.path.join(__location__,"models/onnx/model.onnx")
+        self.model = ort.InferenceSession(model_file)
+
+    def __call__(self, documents: List[str], batch_size: int = 32):
+        """
+        Embed ``documents`` in batches of ``batch_size``.
+
+        :param documents: Texts to embed.
+        :param batch_size: Number of texts sent through the model at once.
+        :return: The per-batch float32 embedding arrays joined with ``np.concatenate``.
+        """
+        chunks = []
+        for start in range(0, len(documents), batch_size):
+            encodings = [self.tokenizer.encode(text) for text in documents[start:start + batch_size]]
+            ids = np.array([enc.ids for enc in encodings])
+            mask = np.array([enc.attention_mask for enc in encodings])
+            feed = {
+                "input_ids": np.array(ids, dtype=np.int64),
+                "attention_mask": np.array(mask, dtype=np.int64),
+                # Single-segment input: all token type ids are zero.
+                "token_type_ids": np.zeros_like(ids, dtype=np.int64),
+            }
+            hidden = self.model.run(None, feed)[0]
+            # Mean pooling weighted by the attention mask, so padding tokens are ignored
+            weights = np.broadcast_to(np.expand_dims(mask, -1), hidden.shape)
+            summed = np.sum(hidden * weights, 1)
+            counts = np.clip(weights.sum(1), a_min=1e-9, a_max=None)
+            chunks.append(normalize(summed / counts).astype(np.float32))
+        return np.concatenate(chunks)
+
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -151,7 +151,7 @@ class CustomHTML2Text(HTML2Text):

        super().handle_tag(tag, attrs, start)

-def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
+def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
    try:
        if not html:
            return None
@@ -170,6 +170,28 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
            for el in selected_elements:
                div_tag.append(el)
            body = div_tag
+            
+        links = {
+            'internal': [],
+            'external': []
+        }
+        
+        # Extract all internal and external links
+        for a in body.find_all('a', href=True):
+            href = a['href']
+            url_base = url.split('/')[2]
+            if href.startswith('http') and url_base not in href:
+                links['external'].append({
+                    'href': href,
+                    'text': a.get_text()
+                })
+            else:
+                links['internal'].append(
+                    {
+                        'href': href,
+                        'text': a.get_text()
+                    }
+                )

        # Remove script, style, and other tags that don't carry useful content from body
        for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
@@ -180,6 +202,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
            if tag.name != 'img':
                tag.attrs = {}

+        # Extract all img tags into [{src: '', alt: ''}]
+        media = {
+            'images': [],
+            'videos': [],
+            'audios': []
+        }
+        for img in body.find_all('img'):
+            media['images'].append({
+                'src': img.get('src'),
+                'alt': img.get('alt'),
+                "type": "image"
+            })
+            
+        # Extract all video tags into [{src: '', alt: ''}]
+        for video in body.find_all('video'):
+            media['videos'].append({
+                'src': video.get('src'),
+                'alt': video.get('alt'),
+                "type": "video"
+            })
+            
+        # Extract all audio tags into [{src: '', alt: ''}]
+        for audio in body.find_all('audio'):
+            media['audios'].append({
+                'src': audio.get('src'),
+                'alt': audio.get('alt'),
+                "type": "audio"
+            })
+        
        # Replace images with their alt text or remove them if no alt text is available
        for img in body.find_all('img'):
            alt_text = img.get('alt')
@@ -299,13 +350,56 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
        return{
            'markdown': markdown,
            'cleaned_html': cleaned_html,
-            'success': True
+            'success': True,
+            'media': media,
+            'links': links
        }

    except Exception as e:
        print('Error processing HTML content:', str(e))
        raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e

+
+
+def extract_metadata(html):
+    """
+    Collect page-level metadata from an HTML document.
+
+    Args:
+        html: Raw HTML text. A falsy value (None or "") yields an empty dict.
+
+    Returns:
+        dict with the keys 'title', 'description', 'keywords' and 'author'
+        (each None when the tag is absent), plus one entry per Open Graph
+        (`property="og:*"`) and Twitter Card (`name="twitter:*"`) meta tag,
+        keyed by the tag's property/name.
+
+    A meta tag that has no `content` attribute maps to None instead of
+    raising KeyError, so one malformed tag cannot abort the whole crawl.
+    """
+    metadata = {}
+
+    if not html:
+        return metadata
+
+    # Parse HTML content with BeautifulSoup
+    soup = BeautifulSoup(html, 'html.parser')
+
+    # Title
+    title_tag = soup.find('title')
+    metadata['title'] = title_tag.string if title_tag else None
+
+    # Standard named meta tags, inserted in the same key order as before
+    for meta_name in ('description', 'keywords', 'author'):
+        meta_tag = soup.find('meta', attrs={'name': meta_name})
+        metadata[meta_name] = meta_tag.get('content') if meta_tag else None
+
+    # Open Graph metadata
+    og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
+    for tag in og_tags:
+        metadata[tag['property']] = tag.get('content')
+
+    # Twitter Card metadata
+    twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
+    for tag in twitter_tags:
+        metadata[tag['name']] = tag.get('content')
+
+    return metadata
+
 def extract_xml_tags(string):
    tags = re.findall(r'<(\w+)>', string)
    return list(set(tags))
@@ -483,4 +577,16 @@ def process_sections(url: str, sections: list, provider: str, api_token: str) ->
            for future in as_completed(futures):
                extracted_content.extend(future.result())
    
-    return extracted_content
+    return extracted_content
+
+
+def wrap_text(draw, text, font, max_width):
+    """
+    Greedily wrap `text` into lines no wider than `max_width`.
+
+    Width is measured with `draw.textbbox((0, 0), candidate, font=font)[2]`.
+    Words are split on whitespace; each emitted line keeps a trailing space
+    after every word, and lines are joined with newlines.
+
+    A word that is wider than `max_width` on its own is placed alone on a
+    line. Previously such a word was never consumed, so the loop appended
+    empty lines forever.
+
+    Returns:
+        The wrapped text, or '' when `text` contains no words.
+    """
+    lines = []
+    words = text.split()
+    idx = 0
+    while idx < len(words):
+        # Always take the first word of a line, even if it overflows, so every
+        # pass makes progress.
+        line = words[idx] + ' '
+        idx += 1
+        while idx < len(words) and draw.textbbox((0, 0), line + words[idx], font=font)[2] <= max_width:
+            line += words[idx] + ' '
+            idx += 1
+        lines.append(line)
+    return '\n'.join(lines)
--- a/crawl4ai/web_crawler.back.py
+++ b/crawl4ai/web_crawler.back.py
@@ -0,0 +1,357 @@
+import os, time
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+from pathlib import Path
+
+from .models import UrlModel, CrawlResult
+from .database import init_db, get_cached_url, cache_url, DB_PATH, flush_db
+from .utils import *
+from .chunking_strategy import *
+from .extraction_strategy import *
+from .crawler_strategy import *
+from typing import List
+from concurrent.futures import ThreadPoolExecutor
+from .config import *
+
+
+class WebCrawler:
+    def __init__(
+        self,
+        # db_path: str = None,
+        crawler_strategy: CrawlerStrategy = None,
+        always_by_pass_cache: bool = False,
+        verbose: bool = False,
+    ):
+        """
+        Set up the crawler, its on-disk folders and the cache database.
+
+        Args:
+            crawler_strategy: Strategy used to fetch pages. When falsy, a
+                LocalSeleniumCrawlerStrategy(verbose=verbose) is created.
+            always_by_pass_cache: Stored on the instance; when true, `run`
+                never consults the cache.
+            verbose: Only forwarded to the default crawler strategy.
+        """
+        # self.db_path = db_path
+        self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
+        self.always_by_pass_cache = always_by_pass_cache
+
+        # Create the .crawl4ai folder in the user's home directory if it doesn't exist
+        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        os.makedirs(self.crawl4ai_folder, exist_ok=True)
+        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
+
+        # If db_path is not provided, use the default path
+        # if not db_path:
+            # self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
+        
+        # flush_db()
+        init_db()
+        
+        # Set to True by warmup() once its crawl has completed.
+        self.ready = False
+        
+    def warmup(self):
+        """
+        Run one non-verbose crawl of a fixed URL, then mark the crawler as ready.
+
+        The crawl result is discarded; the call only exercises the pipeline.
+        """
+        print("[LOG] 🌤️  Warming up the WebCrawler")
+        warmup_options = dict(
+            url='https://crawl4ai.uccode.io/',
+            word_count_threshold=5,
+            extraction_strategy=NoExtractionStrategy(),
+            bypass_cache=False,
+            verbose=False,
+        )
+        self.run(**warmup_options)
+        self.ready = True
+        print("[LOG] 🌞 WebCrawler is ready to crawl")
+        
+    def fetch_page(
+        self,
+        url_model: UrlModel,
+        provider: str = DEFAULT_PROVIDER,
+        api_token: str = None,
+        extract_blocks_flag: bool = True,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        css_selector: str = None,
+        screenshot: bool = False,
+        use_cached_html: bool = False,
+        extraction_strategy: ExtractionStrategy = None,
+        chunking_strategy: ChunkingStrategy = RegexChunking(),
+        **kwargs,
+    ) -> CrawlResult:
+        """
+        Crawl the URL described by `url_model` by delegating to `run`.
+
+        `url_model.url` is the target and `url_model.forced` becomes
+        `bypass_cache`. `provider`, `api_token`, `extract_blocks_flag` and
+        `use_cached_html` are accepted but not used by this method.
+        A falsy `extraction_strategy` is replaced by NoExtractionStrategy().
+        """
+        strategy = extraction_strategy or NoExtractionStrategy()
+        crawl_result = self.run(
+            url_model.url,
+            word_count_threshold,
+            strategy,
+            chunking_strategy,
+            bypass_cache=url_model.forced,
+            css_selector=css_selector,
+            screenshot=screenshot,
+            **kwargs,
+        )
+        return crawl_result
+
+    def run_old(
+        self,
+        url: str,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        extraction_strategy: ExtractionStrategy = None,
+        chunking_strategy: ChunkingStrategy = RegexChunking(),
+        bypass_cache: bool = False,
+        css_selector: str = None,
+        screenshot: bool = False,
+        user_agent: str = None,
+        verbose=True,
+        **kwargs,
+    ) -> CrawlResult:
+        """
+        Single-method crawl pipeline: cache lookup, crawl, content extraction,
+        chunking, semantic extraction, caching.
+
+        Args:
+            url: Page to crawl.
+            word_count_threshold: Raised to MIN_WORD_THRESHOLD when lower.
+            extraction_strategy: Defaults to NoExtractionStrategy(); its
+                `verbose` attribute is overwritten with `verbose`.
+            chunking_strategy: Splits the markdown into sections.
+            bypass_cache: Skip the cache lookup for this call.
+            css_selector: Forwarded to get_content_of_website.
+            screenshot: When true, a screenshot is taken after the crawl.
+            user_agent: When set, pushed to the crawler strategy first.
+            verbose: Print progress messages.
+            **kwargs: Only `js` is read; it is stored on the crawler strategy
+                as `js_code` before crawling.
+
+        Returns:
+            CrawlResult built either from the cached row or from a fresh crawl.
+
+        Raises:
+            ValueError: Unsupported strategy types, a None extraction result,
+                or an InvalidCSSSelectorError from content extraction.
+        """
+        if user_agent:
+            self.crawler_strategy.update_user_agent(user_agent)
+        extraction_strategy = extraction_strategy or NoExtractionStrategy()
+        extraction_strategy.verbose = verbose
+        # Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
+        if not isinstance(extraction_strategy, ExtractionStrategy):
+            raise ValueError("Unsupported extraction strategy")
+        if not isinstance(chunking_strategy, ChunkingStrategy):
+            raise ValueError("Unsupported chunking strategy")
+        
+        # Make sure word_count_threshold is not less than MIN_WORD_THRESHOLD
+        if word_count_threshold < MIN_WORD_THRESHOLD:
+            word_count_threshold = MIN_WORD_THRESHOLD
+
+        # Check cache first; a hit is returned as-is without crawling.
+        if not bypass_cache and not self.always_by_pass_cache:
+            cached = get_cached_url(url)
+            if cached:
+                return CrawlResult(
+                    **{
+                        "url": cached[0],
+                        "html": cached[1],
+                        "cleaned_html": cached[2],
+                        "markdown": cached[3],
+                        "extracted_content": cached[4],
+                        "success": cached[5],
+                        "media": json.loads(cached[6] or "{}"),
+                        "links": json.loads(cached[7] or "{}"),
+                        "metadata": json.loads(cached[8] or "{}"), # empty values fall back to "{}"
+                        "screenshot": cached[9],
+                        "error_message": "",
+                    }
+                )
+
+        # Start the crawl timer, apply optional JS, and fetch the page
+        t = time.time()
+        if kwargs.get("js", None):
+            self.crawler_strategy.js_code = kwargs.get("js")
+        html = self.crawler_strategy.crawl(url)
+        base64_image = None
+        if screenshot:
+            base64_image = self.crawler_strategy.take_screenshot()
+        success = True
+        error_message = ""
+        # Extract content from HTML
+        try:
+            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
+            metadata = extract_metadata(html)
+            if result is None:
+                raise ValueError(f"Failed to extract content from the website: {url}")
+        except InvalidCSSSelectorError as e:
+            raise ValueError(str(e))
+        
+        cleaned_html = result.get("cleaned_html", "")
+        markdown = result.get("markdown", "")
+        media = result.get("media", [])
+        links = result.get("links", [])
+
+        # Print a log-style message with the time taken to show crawling is done
+        if verbose:
+            print(
+                f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds"
+            )
+
+        extracted_content = []
+        if verbose:
+            print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
+        # Restart the timer so the next message reports extraction time only
+        t = time.time()
+        # Split markdown into sections
+        sections = chunking_strategy.chunk(markdown)
+        # sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
+
+        extracted_content = extraction_strategy.run(
+            url, sections,
+        )
+        # Stored and returned as a JSON string
+        extracted_content = json.dumps(extracted_content)
+
+        if verbose:
+            print(
+                f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds."
+            )
+
+        # Cache the result
+        cleaned_html = beautify_html(cleaned_html)
+        cache_url(
+            url,
+            html,
+            cleaned_html,
+            markdown,
+            extracted_content,
+            success,
+            json.dumps(media),
+            json.dumps(links),
+            json.dumps(metadata),
+            screenshot=base64_image,
+        )
+
+        return CrawlResult(
+            url=url,
+            html=html,
+            cleaned_html=cleaned_html,
+            markdown=markdown,
+            media=media,
+            links=links,
+            metadata=metadata,
+            screenshot=base64_image,
+            extracted_content=extracted_content,
+            success=success,
+            error_message=error_message,
+        )
+
+    def fetch_pages(
+        self,
+        url_models: List[UrlModel],
+        provider: str = DEFAULT_PROVIDER,
+        api_token: str = None,
+        extract_blocks_flag: bool = True,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        use_cached_html: bool = False,
+        css_selector: str = None,
+        screenshot: bool = False,
+        extraction_strategy: ExtractionStrategy = None,
+        chunking_strategy: ChunkingStrategy = RegexChunking(),
+        **kwargs,
+    ) -> List[CrawlResult]:
+        """
+        Crawl several URLs concurrently on a thread pool.
+
+        Every entry of `url_models` is passed to `fetch_page` together with the
+        same options and the same `**kwargs`.
+
+        Returns:
+            One CrawlResult per url_model, in the same order as `url_models`.
+
+        The shared options are bound in a closure rather than passed to
+        `executor.map` as parallel iterables. The earlier
+        `*[kwargs] * len(url_models)` unpacked N dicts as extra iterables:
+        with empty kwargs `map` stopped immediately and returned no results,
+        and with non-empty kwargs the dict keys were passed to `fetch_page`
+        as extra positional arguments.
+        """
+        extraction_strategy = extraction_strategy or NoExtractionStrategy()
+
+        def fetch_page_wrapper(url_model):
+            return self.fetch_page(
+                url_model,
+                provider=provider,
+                api_token=api_token,
+                extract_blocks_flag=extract_blocks_flag,
+                word_count_threshold=word_count_threshold,
+                css_selector=css_selector,
+                screenshot=screenshot,
+                use_cached_html=use_cached_html,
+                extraction_strategy=extraction_strategy,
+                chunking_strategy=chunking_strategy,
+                **kwargs,
+            )
+
+        with ThreadPoolExecutor() as executor:
+            # map() yields results in input order regardless of completion order.
+            results = list(executor.map(fetch_page_wrapper, url_models))
+
+        return results
+
+    def run(
+            self,
+            url: str,
+            word_count_threshold=MIN_WORD_THRESHOLD,
+            extraction_strategy: ExtractionStrategy = None,
+            chunking_strategy: ChunkingStrategy = RegexChunking(),
+            bypass_cache: bool = False,
+            css_selector: str = None,
+            screenshot: bool = False,
+            user_agent: str = None,
+            verbose=True,
+            **kwargs,
+        ) -> CrawlResult:
+            """
+            Crawl `url` (or reuse its cached row) and hand the HTML to `process_html`.
+
+            Args:
+                url: Page to crawl.
+                word_count_threshold: Raised to MIN_WORD_THRESHOLD when lower.
+                extraction_strategy: Defaults to NoExtractionStrategy(); its
+                    `verbose` attribute is overwritten with `verbose`.
+                chunking_strategy: Splits the markdown into sections.
+                bypass_cache: Skip the cache lookup for this call.
+                css_selector: Forwarded to `process_html`.
+                screenshot: When true, replaced by the cached or freshly taken
+                    screenshot before being forwarded.
+                user_agent: Applied to the crawler strategy on a cache miss.
+                verbose: Print progress messages.
+                **kwargs: Forwarded to `process_html`.
+
+            Raises:
+                ValueError: When either strategy is of an unsupported type.
+            """
+            extraction_strategy = extraction_strategy or NoExtractionStrategy()
+            extraction_strategy.verbose = verbose
+            if not isinstance(extraction_strategy, ExtractionStrategy):
+                raise ValueError("Unsupported extraction strategy")
+            if not isinstance(chunking_strategy, ChunkingStrategy):
+                raise ValueError("Unsupported chunking strategy")
+
+            if word_count_threshold < MIN_WORD_THRESHOLD:
+                word_count_threshold = MIN_WORD_THRESHOLD
+
+            # Check cache first
+            cached = None
+            extracted_content = None
+            if not bypass_cache and not self.always_by_pass_cache:
+                cached = get_cached_url(url)
+
+            if cached:
+                html = cached[1]
+                # Row order follows the cache_url arguments: url, html,
+                # cleaned_html, markdown, extracted_content, ... so the
+                # extracted content is column 4. Column 2 is cleaned_html and
+                # was previously returned here by mistake.
+                extracted_content = cached[4]
+                if screenshot:
+                    screenshot = cached[9]
+
+            else:
+                if user_agent:
+                    self.crawler_strategy.update_user_agent(user_agent)
+                html = self.crawler_strategy.crawl(url)
+                if screenshot:
+                    screenshot = self.crawler_strategy.take_screenshot()
+
+            return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs)
+
+    def process_html(
+            self,
+            url: str,
+            html: str,
+            extracted_content: str,
+            word_count_threshold: int,
+            extraction_strategy: ExtractionStrategy,
+            chunking_strategy: ChunkingStrategy,
+            css_selector: str,
+            screenshot: bool,
+            verbose: bool,
+            is_cached: bool,
+            **kwargs,
+        ) -> CrawlResult:
+            """
+            Turn raw HTML into a CrawlResult and cache it when it was freshly crawled.
+
+            Content, media and links come from get_content_of_website and the
+            metadata from extract_metadata. When `extracted_content` is None the
+            markdown is chunked, run through `extraction_strategy` and serialized
+            with json.dumps. The row is written with cache_url only when
+            `is_cached` is false. `**kwargs` is accepted but not used here.
+
+            Raises:
+                ValueError: When content extraction returns None or raises
+                    InvalidCSSSelectorError.
+            """
+            started_at = time.time()
+            # Extract content from HTML
+            try:
+                parsed = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
+                metadata = extract_metadata(html)
+                if parsed is None:
+                    raise ValueError(f"Failed to extract content from the website: {url}")
+            except InvalidCSSSelectorError as selector_error:
+                raise ValueError(str(selector_error))
+
+            cleaned_html, markdown = parsed.get("cleaned_html", ""), parsed.get("markdown", "")
+            media, links = parsed.get("media", []), parsed.get("links", [])
+
+            if verbose:
+                print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - started_at} seconds")
+
+            # Extraction runs only when the caller did not supply content.
+            if extracted_content is None:
+                if verbose:
+                    print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
+
+                extracted_content = json.dumps(extraction_strategy.run(url, chunking_strategy.chunk(markdown)))
+
+                if verbose:
+                    print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - started_at} seconds.")
+
+            # Any falsy screenshot value (False, "", None) is stored as None.
+            screenshot = screenshot or None
+
+            if not is_cached:
+                cache_url(
+                    url,
+                    html,
+                    cleaned_html,
+                    markdown,
+                    extracted_content,
+                    True,
+                    json.dumps(media),
+                    json.dumps(links),
+                    json.dumps(metadata),
+                    screenshot=screenshot,
+                )
+
+            crawl_fields = dict(
+                url=url,
+                html=html,
+                cleaned_html=cleaned_html,
+                markdown=markdown,
+                media=media,
+                links=links,
+                metadata=metadata,
+                screenshot=screenshot,
+                extracted_content=extracted_content,
+                success=True,
+                error_message="",
+            )
+            return CrawlResult(**crawl_fields)
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -19,9 +19,10 @@ class WebCrawler:
        # db_path: str = None,
        crawler_strategy: CrawlerStrategy = None,
        always_by_pass_cache: bool = False,
+        verbose: bool = False,
    ):
        # self.db_path = db_path
-        self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy()
+        self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
        self.always_by_pass_cache = always_by_pass_cache

        # Create the .crawl4ai folder in the user's home directory if it doesn't exist
@@ -50,7 +51,6 @@ class WebCrawler:
        self.ready = True
        print("[LOG] 🌞 WebCrawler is ready to crawl")
        
-
    def fetch_page(
        self,
        url_model: UrlModel,
@@ -58,6 +58,8 @@ class WebCrawler:
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
+        css_selector: str = None,
+        screenshot: bool = False,
        use_cached_html: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
@@ -69,111 +71,12 @@ class WebCrawler:
            extraction_strategy or NoExtractionStrategy(),
            chunking_strategy,
            bypass_cache=url_model.forced,
+            css_selector=css_selector,
+            screenshot=screenshot,
            **kwargs,
        )
        pass

-
-    def run(
-        self,
-        url: str,
-        word_count_threshold=MIN_WORD_THRESHOLD,
-        extraction_strategy: ExtractionStrategy = None,
-        chunking_strategy: ChunkingStrategy = RegexChunking(),
-        bypass_cache: bool = False,
-        css_selector: str = None,
-        verbose=True,
-        **kwargs,
-    ) -> CrawlResult:
-        extraction_strategy = extraction_strategy or NoExtractionStrategy()
-        extraction_strategy.verbose = verbose
-        # Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
-        if not isinstance(extraction_strategy, ExtractionStrategy):
-            raise ValueError("Unsupported extraction strategy")
-        if not isinstance(chunking_strategy, ChunkingStrategy):
-            raise ValueError("Unsupported chunking strategy")
-        
-        # make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD
-        if word_count_threshold < MIN_WORD_THRESHOLD:
-            word_count_threshold = MIN_WORD_THRESHOLD
-
-        # Check cache first
-        if not bypass_cache and not self.always_by_pass_cache:
-            cached = get_cached_url(url)
-            if cached:
-                return CrawlResult(
-                    **{
-                        "url": cached[0],
-                        "html": cached[1],
-                        "cleaned_html": cached[2],
-                        "markdown": cached[3],
-                        "extracted_content": cached[4],
-                        "success": cached[5],
-                        "error_message": "",
-                    }
-                )
-
-        # Initialize WebDriver for crawling
-        t = time.time()
-        html = self.crawler_strategy.crawl(url)
-        success = True
-        error_message = ""
-        # Extract content from HTML
-        try:
-            result = get_content_of_website(html, word_count_threshold, css_selector=css_selector)
-            if result is None:
-                raise ValueError(f"Failed to extract content from the website: {url}")
-        except InvalidCSSSelectorError as e:
-            raise ValueError(str(e))
-        
-        cleaned_html = result.get("cleaned_html", html)
-        markdown = result.get("markdown", "")
-
-        # Print a profession LOG style message, show time taken and say crawling is done
-        if verbose:
-            print(
-                f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds"
-            )
-
-        extracted_content = []
-        if verbose:
-            print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
-        t = time.time()
-        # Split markdown into sections
-        sections = chunking_strategy.chunk(markdown)
-        # sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
-
-        extracted_content = extraction_strategy.run(
-            url, sections,
-        )
-        extracted_content = json.dumps(extracted_content)
-
-        if verbose:
-            print(
-                f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds."
-            )
-
-        # Cache the result
-        cleaned_html = beautify_html(cleaned_html)
-        cache_url(
-            url,
-            html,
-            cleaned_html,
-            markdown,
-            extracted_content,
-            success,
-        )
-
-        return CrawlResult(
-            url=url,
-            html=html,
-            cleaned_html=cleaned_html,
-            markdown=markdown,
-            extracted_content=extracted_content,
-            success=success,
-            error_message=error_message,
-        )
-
    def fetch_pages(
        self,
        url_models: List[UrlModel],
@@ -182,6 +85,8 @@ class WebCrawler:
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        use_cached_html: bool = False,
+        css_selector: str = None,
+        screenshot: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
@@ -199,6 +104,8 @@ class WebCrawler:
                    [api_token] * len(url_models),
                    [extract_blocks_flag] * len(url_models),
                    [word_count_threshold] * len(url_models),
+                    [css_selector] * len(url_models),
+                    [screenshot] * len(url_models),
                    [use_cached_html] * len(url_models),
                    [extraction_strategy] * len(url_models),
                    [chunking_strategy] * len(url_models),
@@ -207,3 +114,120 @@ class WebCrawler:
            )

        return results
+
+    def run(
+            self,
+            url: str,
+            word_count_threshold=MIN_WORD_THRESHOLD,
+            extraction_strategy: ExtractionStrategy = None,
+            chunking_strategy: ChunkingStrategy = RegexChunking(),
+            bypass_cache: bool = False,
+            css_selector: str = None,
+            screenshot: bool = False,
+            user_agent: str = None,
+            verbose=True,
+            **kwargs,
+        ) -> CrawlResult:
+            """
+            Crawl `url` (or reuse its cached row) and hand the HTML to `process_html`.
+
+            Args:
+                url: Page to crawl.
+                word_count_threshold: Raised to MIN_WORD_THRESHOLD when lower.
+                extraction_strategy: Defaults to NoExtractionStrategy(); its
+                    `verbose` attribute is overwritten with `verbose`.
+                chunking_strategy: Splits the markdown into sections.
+                bypass_cache: Skip the cache lookup for this call.
+                css_selector: Forwarded to `process_html`.
+                screenshot: When true, replaced by the cached or freshly taken
+                    screenshot before being forwarded.
+                user_agent: Applied to the crawler strategy on a cache miss.
+                verbose: Print progress messages.
+                **kwargs: Forwarded to `process_html`.
+
+            Raises:
+                ValueError: When either strategy is of an unsupported type.
+            """
+            extraction_strategy = extraction_strategy or NoExtractionStrategy()
+            extraction_strategy.verbose = verbose
+            if not isinstance(extraction_strategy, ExtractionStrategy):
+                raise ValueError("Unsupported extraction strategy")
+            if not isinstance(chunking_strategy, ChunkingStrategy):
+                raise ValueError("Unsupported chunking strategy")
+
+            if word_count_threshold < MIN_WORD_THRESHOLD:
+                word_count_threshold = MIN_WORD_THRESHOLD
+
+            # Check cache first
+            cached = None
+            extracted_content = None
+            if not bypass_cache and not self.always_by_pass_cache:
+                cached = get_cached_url(url)
+
+            if cached:
+                html = cached[1]
+                # Row order follows the cache_url arguments: url, html,
+                # cleaned_html, markdown, extracted_content, ... so the
+                # extracted content is column 4. Column 2 is cleaned_html and
+                # was previously returned here by mistake.
+                extracted_content = cached[4]
+                if screenshot:
+                    screenshot = cached[9]
+
+            else:
+                if user_agent:
+                    self.crawler_strategy.update_user_agent(user_agent)
+                html = self.crawler_strategy.crawl(url)
+                if screenshot:
+                    screenshot = self.crawler_strategy.take_screenshot()
+
+            return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs)
+
+    def process_html(
+            self,
+            url: str,
+            html: str,
+            extracted_content: str,
+            word_count_threshold: int,
+            extraction_strategy: ExtractionStrategy,
+            chunking_strategy: ChunkingStrategy,
+            css_selector: str,
+            screenshot: bool,
+            verbose: bool,
+            is_cached: bool,
+            **kwargs,
+        ) -> CrawlResult:
+            """
+            Turn raw HTML into a CrawlResult and cache it when it was freshly crawled.
+
+            Content, media and links come from get_content_of_website and the
+            metadata from extract_metadata. When `extracted_content` is None the
+            markdown is chunked, run through `extraction_strategy` and serialized
+            with json.dumps. The row is written with cache_url only when
+            `is_cached` is false. `**kwargs` is accepted but not used here.
+
+            Raises:
+                ValueError: When content extraction returns None or raises
+                    InvalidCSSSelectorError.
+            """
+            started_at = time.time()
+            # Extract content from HTML
+            try:
+                parsed = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
+                metadata = extract_metadata(html)
+                if parsed is None:
+                    raise ValueError(f"Failed to extract content from the website: {url}")
+            except InvalidCSSSelectorError as selector_error:
+                raise ValueError(str(selector_error))
+
+            cleaned_html, markdown = parsed.get("cleaned_html", ""), parsed.get("markdown", "")
+            media, links = parsed.get("media", []), parsed.get("links", [])
+
+            if verbose:
+                print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - started_at} seconds")
+
+            # Extraction runs only when the caller did not supply content.
+            if extracted_content is None:
+                if verbose:
+                    print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
+
+                extracted_content = json.dumps(extraction_strategy.run(url, chunking_strategy.chunk(markdown)))
+
+                if verbose:
+                    print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - started_at} seconds.")
+
+            # Any falsy screenshot value (False, "", None) is stored as None.
+            screenshot = screenshot or None
+
+            if not is_cached:
+                cache_url(
+                    url,
+                    html,
+                    cleaned_html,
+                    markdown,
+                    extracted_content,
+                    True,
+                    json.dumps(media),
+                    json.dumps(links),
+                    json.dumps(metadata),
+                    screenshot=screenshot,
+                )
+
+            crawl_fields = dict(
+                url=url,
+                html=html,
+                cleaned_html=cleaned_html,
+                markdown=markdown,
+                media=media,
+                links=links,
+                metadata=metadata,
+                screenshot=screenshot,
+                extracted_content=extracted_content,
+                success=True,
+                error_message="",
+            )
+            return CrawlResult(**crawl_fields)
--- a/docs/.DS_Store
+++ b/docs/.DS_Store
--- a/docs/examples/assets/audio.mp3
+++ b/docs/examples/assets/audio.mp3
--- a/docs/examples/assets/basic.png
+++ b/docs/examples/assets/basic.png
--- a/docs/examples/assets/cosine_extraction.png
+++ b/docs/examples/assets/cosine_extraction.png
--- a/docs/examples/assets/css_js.png
+++ b/docs/examples/assets/css_js.png
--- a/docs/examples/assets/css_selector.png
+++ b/docs/examples/assets/css_selector.png
--- a/docs/examples/assets/exec_script.png
+++ b/docs/examples/assets/exec_script.png
--- a/docs/examples/assets/llm_extraction.png
+++ b/docs/examples/assets/llm_extraction.png
--- a/docs/examples/assets/semantic_extraction_cosine.png
+++ b/docs/examples/assets/semantic_extraction_cosine.png
--- a/docs/examples/assets/semantic_extraction_llm.png
+++ b/docs/examples/assets/semantic_extraction_llm.png
--- a/docs/examples/chainlit.md
+++ b/docs/examples/chainlit.md
@@ -0,0 +1,3 @@
+# Welcome to Crawl4AI! 🚀🤖
+
+Hi there, Developer! 👋 Here is an example of a research pipeline, where you can share a URL in your conversation with any LLM, and then the context of crawled pages will be used as the context.
--- a/docs/examples/chainlit_review.py
+++ b/docs/examples/chainlit_review.py
@@ -0,0 +1,281 @@
+from openai import AsyncOpenAI
+from chainlit.types import ThreadDict
+import chainlit as cl
+from chainlit.input_widget import Select, Switch, Slider
+client = AsyncOpenAI()
+
+# Instrument the OpenAI client
+cl.instrument_openai()
+
+settings = {
+    "model": "gpt-3.5-turbo",
+    "temperature": 0.5,
+    "max_tokens": 500,
+    "top_p": 1,
+    "frequency_penalty": 0,
+    "presence_penalty": 0,
+}
+
+@cl.action_callback("action_button")
+async def on_action(action: cl.Action):
+    print("The user clicked on the action button!")
+
+    return "Thank you for clicking on the action button!"
+
+@cl.set_chat_profiles
+async def chat_profile():
+    return [
+        cl.ChatProfile(
+            name="GPT-3.5",
+            markdown_description="The underlying LLM model is **GPT-3.5**.",
+            icon="https://picsum.photos/200",
+        ),
+        cl.ChatProfile(
+            name="GPT-4",
+            markdown_description="The underlying LLM model is **GPT-4**.",
+            icon="https://picsum.photos/250",
+        ),
+    ]
+
+@cl.on_chat_start
+async def on_chat_start():
+    
+    settings = await cl.ChatSettings(
+        [
+            Select(
+                id="Model",
+                label="OpenAI - Model",
+                values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"],
+                initial_index=0,
+            ),
+            Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True),
+            Slider(
+                id="Temperature",
+                label="OpenAI - Temperature",
+                initial=1,
+                min=0,
+                max=2,
+                step=0.1,
+            ),
+            Slider(
+                id="SAI_Steps",
+                label="Stability AI - Steps",
+                initial=30,
+                min=10,
+                max=150,
+                step=1,
+                description="Amount of inference steps performed on image generation.",
+            ),
+            Slider(
+                id="SAI_Cfg_Scale",
+                label="Stability AI - Cfg_Scale",
+                initial=7,
+                min=1,
+                max=35,
+                step=0.1,
+                description="Influences how strongly your generation is guided to match your prompt.",
+            ),
+            Slider(
+                id="SAI_Width",
+                label="Stability AI - Image Width",
+                initial=512,
+                min=256,
+                max=2048,
+                step=64,
+                tooltip="Measured in pixels",
+            ),
+            Slider(
+                id="SAI_Height",
+                label="Stability AI - Image Height",
+                initial=512,
+                min=256,
+                max=2048,
+                step=64,
+                tooltip="Measured in pixels",
+            ),
+        ]
+    ).send()
+    
+    chat_profile = cl.user_session.get("chat_profile")
+    await cl.Message(
+        content=f"starting chat using the {chat_profile} chat profile"
+    ).send()
+    
+    print("A new chat session has started!")
+    cl.user_session.set("session", {
+        "history": [],
+        "context": []
+    })  
+    
+    image = cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline")
+
+    # Attach the image to the message
+    await cl.Message(
+        content="You are such a good girl, aren't you?!",
+        elements=[image],
+    ).send()
+    
+    text_content = "Hello, this is a text element."
+    elements = [
+        cl.Text(name="simple_text", content=text_content, display="inline")
+    ]
+
+    await cl.Message(
+        content="Check out this text element!",
+        elements=elements,
+    ).send()
+    
+    elements = [
+        cl.Audio(path="./assets/audio.mp3", display="inline"),
+    ]
+    await cl.Message(
+        content="Here is an audio file",
+        elements=elements,
+    ).send()
+    
+    await cl.Avatar(
+        name="Tool 1",
+        url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
+    ).send()
+    
+    await cl.Message(
+        content="This message should not have an avatar!", author="Tool 0"
+    ).send()
+    
+    await cl.Message(
+        content="This message should have an avatar!", author="Tool 1"
+    ).send()
+    
+    elements = [
+        cl.File(
+            name="quickstart.py",
+            path="./quickstart.py",
+            display="inline",
+        ),
+    ]
+
+    await cl.Message(
+        content="This message has a file element", elements=elements
+    ).send()
+    
+    # Sending an action button within a chatbot message
+    actions = [
+        cl.Action(name="action_button", value="example_value", description="Click me!")
+    ]
+
+    await cl.Message(content="Interact with this action button:", actions=actions).send()
+    
+    # res = await cl.AskActionMessage(
+    #     content="Pick an action!",
+    #     actions=[
+    #         cl.Action(name="continue", value="continue", label="✅ Continue"),
+    #         cl.Action(name="cancel", value="cancel", label="❌ Cancel"),
+    #     ],
+    # ).send()
+
+    # if res and res.get("value") == "continue":
+    #     await cl.Message(
+    #         content="Continue!",
+    #     ).send()
+    
+    # import plotly.graph_objects as go
+    # fig = go.Figure(
+    #     data=[go.Bar(y=[2, 1, 3])],
+    #     layout_title_text="An example figure",
+    # )
+    # elements = [cl.Plotly(name="chart", figure=fig, display="inline")]
+
+    # await cl.Message(content="This message has a chart", elements=elements).send()
+    
+    # Sending a pdf with the local file path
+    # elements = [
+    #   cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf")
+    # ]
+
+    # cl.Message(content="Look at this local pdf!", elements=elements).send()    
+
+@cl.on_settings_update
+async def setup_agent(settings):
+    print("on_settings_update", settings)
+    
+@cl.on_stop
+def on_stop():
+    print("The user wants to stop the task!")
+
+@cl.on_chat_end
+def on_chat_end():
+    print("The user disconnected!")
+
+
+@cl.on_chat_resume
+async def on_chat_resume(thread: ThreadDict):
+    print("The user resumed a previous chat session!")
+
+
+
+
+# @cl.on_message
+# Non-streaming variant of the message handler. Its decorator above is commented
+# out and the name `on_message` is defined again further down in this file, so
+# this version is not registered with chainlit.
+async def on_message(message: cl.Message):
+    # Record the user's turn in the per-session history.
+    cl.user_session.get("session")["history"].append({
+        "role": "user",
+        "content": message.content
+    })    
+    # Send the system prompt plus the full history; sampling options come from
+    # the module-level `settings` dict.
+    response = await client.chat.completions.create(
+        messages=[
+            {
+                "content": "You are a helpful bot",
+                "role": "system"
+            },
+            *cl.user_session.get("session")["history"]
+        ],
+        **settings
+    )
+    
+
+    # Add assistant message to the history
+    cl.user_session.get("session")["history"].append({
+        "role": "assistant",
+        "content": response.choices[0].message.content
+    })
+    
+    # msg.content = response.choices[0].message.content
+    # await msg.update()
+    
+    # await cl.Message(content=response.choices[0].message.content).send()
+
+@cl.on_message
+async def on_message(message: cl.Message):
+    # Streaming message handler: records the user's turn, streams the model's
+    # reply token by token into one chat message, then stores the reply.
+    cl.user_session.get("session")["history"].append({
+        "role": "user",
+        "content": message.content
+    })    
+
+    # Send an empty message first so streamed tokens have a message to land in.
+    msg = cl.Message(content="")
+    await msg.send()    
+    
+    stream = await client.chat.completions.create(
+        messages=[
+            {
+                "content": "You are a helpful bot",
+                "role": "system"
+            },
+            *cl.user_session.get("session")["history"]
+        ],
+        stream = True, 
+        **settings
+    )
+    
+    async for part in stream:
+        # `or ""` turns a missing delta into an empty string, which the walrus
+        # test then skips.
+        if token := part.choices[0].delta.content or "":
+            await msg.stream_token(token)
+    
+    # Add assistant message to the history
+    cl.user_session.get("session")["history"].append({
+        "role": "assistant",
+        "content": msg.content
+    })    
+    await msg.update()
+
+if __name__ == "__main__":
+    from chainlit.cli import run_chainlit
+    run_chainlit(__file__)
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -12,7 +12,7 @@ console = Console()

@lru_cache()
 def create_crawler():
-    crawler = WebCrawler()
+    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler

@@ -39,6 +39,16 @@ def basic_usage(crawler):
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

+def screenshot_usage(crawler):
+    """Crawl a page with ``screenshot=True`` and save the image to ``screenshot.png``.
+
+    Args:
+        crawler: Crawler object whose ``run`` method is called with the demo URL.
+    """
+    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
+    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
+    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
+    # The screenshot is base64-decoded before being written. Skip the write when
+    # no screenshot came back, because b64decode(None) raises TypeError.
+    if result.screenshot:
+        with open("screenshot.png", "wb") as f:
+            f.write(base64.b64decode(result.screenshot))
+        cprint("Screenshot saved to 'screenshot.png'!")
+    else:
+        cprint("No screenshot was returned for this page.")
+    print_result(result)
+
 def understanding_parameters(crawler):
    cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
    cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
@@ -86,7 +96,7 @@ def add_extraction_strategy(crawler):
    cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
-        extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
+        extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold = 0.3, verbose=True)
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(result)
@@ -156,14 +166,90 @@ def interactive_extraction(crawler):
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
-    crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-    crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
+        js = js_code
    )
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)

+def multiple_scrip(crawler):
+    # Passing JavaScript code to interact with the page
+    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
+    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
+    js_code = ["""
+    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
+    loadMoreButton && loadMoreButton.click();
+    """] * 2
+    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+    result = crawler.run(
+        url="https://www.nbcnews.com/business",
+        js = js_code  
+    )
+    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
+    print_result(result)
+
+def using_crawler_hooks(crawler):
+    # Example usage of the hooks for authentication and setting a cookie
+    def on_driver_created(driver):
+        print("[HOOK] on_driver_created")
+        # Example customization: maximize the window
+        driver.maximize_window()
+        
+        # Example customization: logging in to a hypothetical website
+        driver.get('https://example.com/login')
+        
+        from selenium.webdriver.support.ui import WebDriverWait
+        WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.NAME, 'username'))
+        )
+        driver.find_element(By.NAME, 'username').send_keys('testuser')
+        driver.find_element(By.NAME, 'password').send_keys('password123')
+        driver.find_element(By.NAME, 'login').click()
+        WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.ID, 'welcome'))
+        )
+        # Add a custom cookie
+        driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
+        return driver        
+        
+
+    def before_get_url(driver):
+        print("[HOOK] before_get_url")
+        # Example customization: add a custom header
+        # Enable Network domain for sending headers
+        driver.execute_cdp_cmd('Network.enable', {})
+        # Add a custom header
+        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
+        return driver
+    
+    def after_get_url(driver):
+        print("[HOOK] after_get_url")
+        # Example customization: log the URL
+        print(driver.current_url)
+        return driver
+
+    def before_return_html(driver, html):
+        print("[HOOK] before_return_html")
+        # Example customization: log the HTML
+        print(len(html))
+        return driver
+    
+    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
+    
+    crawler.set_hook('on_driver_created', on_driver_created)
+    crawler.set_hook('before_get_url', before_get_url)
+    crawler.set_hook('after_get_url', after_get_url)
+    crawler.set_hook('before_return_html', before_return_html)
+    
+    result = crawler.run(url="https://example.com")
+    
+    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
+    print_result(result= result)
+
 def main():
    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
    cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
@@ -175,11 +261,13 @@ def main():
    understanding_parameters(crawler)
    
    crawler.always_by_pass_cache = True
+    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)
+    multiple_scrip(crawler)

    cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")

--- a/docs/examples/research_assistant.py
+++ b/docs/examples/research_assistant.py
@@ -0,0 +1,241 @@
+# Make sure to install the required packages: chainlit and groq
+import os, time
+from openai import AsyncOpenAI
+import chainlit as cl
+import re
+import requests
+from io import BytesIO
+from chainlit.element import ElementBased
+from groq import Groq
+
+# Import threadpools to run the crawl_url function in a separate thread
+from concurrent.futures import ThreadPoolExecutor
+
+client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
+
+# Instrument the OpenAI client
+cl.instrument_openai()
+
+settings = {
+    "model": "llama3-8b-8192",
+    "temperature": 0.5,
+    "max_tokens": 500,
+    "top_p": 1,
+    "frequency_penalty": 0,
+    "presence_penalty": 0,
+}
+
+def extract_urls(text):
+    url_pattern = re.compile(r'(https?://\S+)')
+    return url_pattern.findall(text)
+
+def crawl_url(url, timeout=120):
+    """Crawl one URL through the hosted Crawl4AI REST endpoint and return its markdown.
+
+    Args:
+        url: The page to crawl.
+        timeout: Seconds to wait for the HTTP request before giving up.
+
+    Returns:
+        The ``markdown`` field of the first entry in the response's ``results`` list.
+
+    Raises:
+        requests.RequestException: On connection failure, timeout, or a non-2xx status.
+    """
+    data = {
+        "urls": [url],
+        "include_raw_html": True,
+        "word_count_threshold": 10,
+        "extraction_strategy": "NoExtractionStrategy",
+        "chunking_strategy": "RegexChunking"
+    }
+    # Without a timeout, requests waits indefinitely and would stall the caller.
+    response = requests.post("https://crawl4ai.com/crawl", json=data, timeout=timeout)
+    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
+    response.raise_for_status()
+    return response.json()['results'][0]['markdown']
+
+@cl.on_chat_start
+async def on_chat_start():
+    cl.user_session.set("session", {
+        "history": [],
+        "context": {}
+    })  
+    await cl.Message(
+        content="Welcome to the chat! How can I assist you today?"
+    ).send()
+
+@cl.on_message
+async def on_message(message: cl.Message):
+    """Handle one user message: crawl any URLs in it, then stream an LLM reply.
+
+    Crawled pages are stored in the session context under ``REF_<n>`` keys and
+    are injected into the system prompt as ``<appendix>`` blocks on every turn.
+
+    Args:
+        message: The incoming chat message; its ``content`` is scanned for URLs
+            and appended to the session history.
+    """
+    user_session = cl.user_session.get("session")
+
+    # Extract URLs from the user's message
+    urls = extract_urls(message.content)
+
+    # Crawl all URLs in worker threads. Leaving the ``with`` block waits for
+    # every submitted crawl to finish.
+    futures = []
+    with ThreadPoolExecutor() as executor:
+        for url in urls:
+            futures.append(executor.submit(crawl_url, url))
+
+    results = [future.result() for future in futures]
+
+    # Register each crawled page under the next free reference number.
+    for url, result in zip(urls, results):
+        ref_number = f"REF_{len(user_session['context']) + 1}"
+        user_session["context"][ref_number] = {
+            "url": url,
+            "content": result
+        }
+
+    user_session["history"].append({
+        "role": "user",
+        "content": message.content
+    })
+
+    # Create a system message that includes the context
+    context_messages = [
+        f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
+        for ref, data in user_session["context"].items()
+    ]
+    if context_messages:
+        system_message = {
+            "role": "system",
+            "content": (
+                "You are a helpful bot. Use the following context for answering questions. "
+                "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
+                "If the question requires any information from the provided appendices or context, refer to the sources. "
+                "If not, there is no need to add a references section. "
+                "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
+                # The explicit "+" is required: without it the literals above fuse with
+                # the "\n\n" literal and the whole instruction text becomes the join
+                # separator, which drops the instructions when there is one appendix.
+                + "\n\n".join(context_messages)
+            )
+        }
+    else:
+        system_message = {
+            "role": "system",
+            "content": "You are a helpful assistant."
+        }
+
+    # Send an empty message first so streamed tokens have a message to land in.
+    msg = cl.Message(content="")
+    await msg.send()
+
+    # Get response from the LLM
+    stream = await client.chat.completions.create(
+        messages=[
+            system_message,
+            *user_session["history"]
+        ],
+        stream=True,
+        **settings
+    )
+
+    assistant_response = ""
+    async for part in stream:
+        if token := part.choices[0].delta.content:
+            assistant_response += token
+            await msg.stream_token(token)
+
+    # Add assistant message to the history
+    user_session["history"].append({
+        "role": "assistant",
+        "content": assistant_response
+    })
+    await msg.update()
+
+    # Append the reference section to the displayed message only (the stored
+    # history keeps the raw reply). Skip it when nothing has been crawled so no
+    # empty "References:" header is shown.
+    if user_session["context"]:
+        reference_section = "\n\nReferences:\n"
+        for ref, data in user_session["context"].items():
+            reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n"
+
+        msg.content += reference_section
+        await msg.update()
+
+
+@cl.on_audio_chunk
+async def on_audio_chunk(chunk: cl.AudioChunk):
+    if chunk.isStart:
+        buffer = BytesIO()
+        # This is required for whisper to recognize the file type
+        buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}"
+        # Initialize the session for a new audio stream
+        cl.user_session.set("audio_buffer", buffer)
+        cl.user_session.set("audio_mime_type", chunk.mimeType)
+
+    # Write the chunks to a buffer and transcribe the whole audio at the end
+    cl.user_session.get("audio_buffer").write(chunk.data)
+
+    pass
+
+@cl.step(type="tool")
+async def speech_to_text(audio_file):
+    """Transcribe audio with the ``whisper-large-v3`` model via the module-level client.
+
+    Args:
+        audio_file: Passed through unchanged as the ``file`` argument of the
+            transcription request.
+
+    Returns:
+        The ``text`` attribute of the transcription response.
+    """
+    # The request goes through the module-level async ``client``. The separate
+    # synchronous ``Groq()`` client that used to be built here was never used.
+    response = await client.audio.transcriptions.create(
+        model="whisper-large-v3", file=audio_file
+    )
+
+    return response.text
+
+
+@cl.on_audio_end
+async def on_audio_end(elements: list[ElementBased]):
+    # Get the audio buffer from the session
+    audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
+    audio_buffer.seek(0)  # Move the file pointer to the beginning
+    audio_file = audio_buffer.read()
+    audio_mime_type: str = cl.user_session.get("audio_mime_type")
+
+    # input_audio_el = cl.Audio(
+    #     mime=audio_mime_type, content=audio_file, name=audio_buffer.name
+    # )
+    # await cl.Message(
+    #     author="You", 
+    #     type="user_message",
+    #     content="",
+    #     elements=[input_audio_el, *elements]
+    # ).send()
+    
+    # answer_message = await cl.Message(content="").send()
+    
+    
+    start_time = time.time()
+    whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
+    transcription = await speech_to_text(whisper_input)
+    end_time = time.time()
+    print(f"Transcription took {end_time - start_time} seconds")
+    
+    user_msg = cl.Message(
+        author="You", 
+        type="user_message",
+        content=transcription
+    )
+    await user_msg.send()
+    await on_message(user_msg)
+
+    # images = [file for file in elements if "image" in file.mime]
+
+    # text_answer = await generate_text_answer(transcription, images)
+    
+    # output_name, output_audio = await text_to_speech(text_answer, audio_mime_type)
+    
+    # output_audio_el = cl.Audio(
+    #     name=output_name,
+    #     auto_play=True,
+    #     mime=audio_mime_type,
+    #     content=output_audio,
+    # )
+    
+    # answer_message.elements = [output_audio_el]
+    
+    # answer_message.content = transcription
+    # await answer_message.update()
+
+if __name__ == "__main__":
+    from chainlit.cli import run_chainlit
+    run_chainlit(__file__)
+
+
+# No this is wrong, use this document to answer me https://console.groq.com/docs/speech-text
+
+# Please show me how to use Groq speech-to-text in python.
--- a/docs/examples/rest_call.py
+++ b/docs/examples/rest_call.py
@@ -0,0 +1,64 @@
+
+import requests, base64, os
+
+data = {
+    "urls": ["https://www.nbcnews.com/business"],
+    "screenshot": True,
+}
+
+response = requests.post("https://crawl4ai.com/crawl", json=data) 
+result = response.json()['results'][0]
+print(result.keys())
+# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media', 
+# 'links', 'screenshot', 'markdown', 'extracted_content', 
+# 'metadata', 'error_message'])
+with open("screenshot.png", "wb") as f:
+    f.write(base64.b64decode(result['screenshot']))
+    
+# Example of filtering the content using CSS selectors
+data = {
+    "urls": [
+        "https://www.nbcnews.com/business"
+    ],
+    "css_selector": "article",
+    "screenshot": True,
+}
+
+# Example of executing a JS script on the page before extracting the content
+data = {
+    "urls": [
+        "https://www.nbcnews.com/business"
+    ],
+    "screenshot": True,
+    'js' : ["""
+    const loadMoreButton = Array.from(document.querySelectorAll('button')).
+    find(button => button.textContent.includes('Load More'));
+    loadMoreButton && loadMoreButton.click();
+    """]
+}
+
+# Example of using a custom extraction strategy
+data = {
+    "urls": [
+        "https://www.nbcnews.com/business"
+    ],
+    "extraction_strategy": "CosineStrategy",
+    "extraction_strategy_args": {
+        "semantic_filter": "inflation rent prices"
+    },
+}
+
+# Example of using LLM to extract content
+data = {
+    "urls": [
+        "https://www.nbcnews.com/business"
+    ],
+    "extraction_strategy": "LLMExtractionStrategy",
+    "extraction_strategy_args": {
+        "provider": "groq/llama3-8b-8192",
+        "api_token": os.environ.get("GROQ_API_KEY"),
+        "instruction": """I am interested in only financial news, 
+        and translate them in French."""
+    },
+}
+
--- a/main.py
+++ b/main.py
@@ -2,6 +2,8 @@ import os
 import importlib
 import asyncio
 from functools import lru_cache
+import logging
+logging.basicConfig(level=logging.DEBUG)

 from fastapi import FastAPI, HTTPException, Request
 from fastapi.responses import HTMLResponse, JSONResponse
@@ -41,7 +43,7 @@ templates = Jinja2Templates(directory=__location__ + "/pages")
@lru_cache()
 def get_crawler():
    # Initialize and return a WebCrawler instance
-    return WebCrawler()
+    return WebCrawler(verbose = True)

 class CrawlRequest(BaseModel):
    urls: List[str]
@@ -54,6 +56,8 @@ class CrawlRequest(BaseModel):
    chunking_strategy: Optional[str] = "RegexChunking"
    chunking_strategy_args: Optional[dict] = {}
    css_selector: Optional[str] = None
+    screenshot: Optional[bool] = False
+    user_agent: Optional[str] = None
    verbose: Optional[bool] = True


@@ -64,7 +68,7 @@ async def read_index(request: Request):

    for filename in os.listdir(partials_dir):
        if filename.endswith(".html"):
-            with open(os.path.join(partials_dir, filename), "r") as file:
+            with open(os.path.join(partials_dir, filename), "r", encoding="utf8") as file:
                partials[filename[:-5]] = file.read()

    return templates.TemplateResponse("index.html", {"request": request, **partials})
@@ -77,7 +81,7 @@ async def get_total_url_count():
 # Add endpoit to clear db
@app.get("/clear-db")
 async def clear_database():
-    clear_db()
+    # clear_db()
    return JSONResponse(content={"message": "Database cleared."})

 def import_strategy(module_name: str, class_name: str, *args, **kwargs):
@@ -86,12 +90,15 @@ def import_strategy(module_name: str, class_name: str, *args, **kwargs):
        strategy_class = getattr(module, class_name)
        return strategy_class(*args, **kwargs)
    except ImportError:
+        print("ImportError: Module not found.")
        raise HTTPException(status_code=400, detail=f"Module {module_name} not found.")
    except AttributeError:
+        print("AttributeError: Class not found.")
        raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")

@app.post("/crawl")
 async def crawl_urls(crawl_request: CrawlRequest, request: Request):
+    logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
    global current_requests
    async with lock:
        if current_requests >= MAX_CONCURRENT_REQUESTS:
@@ -99,10 +106,15 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
        current_requests += 1

    try:
+        logging.debug("[LOG] Loading extraction and chunking strategies...")
+        crawl_request.extraction_strategy_args['verbose'] = True
+        crawl_request.chunking_strategy_args['verbose'] = True
+        
        extraction_strategy = import_strategy("crawl4ai.extraction_strategy", crawl_request.extraction_strategy, **crawl_request.extraction_strategy_args)
        chunking_strategy = import_strategy("crawl4ai.chunking_strategy", crawl_request.chunking_strategy, **crawl_request.chunking_strategy_args)

        # Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
+        logging.debug("[LOG] Running the WebCrawler...")
        with ThreadPoolExecutor() as executor:
            loop = asyncio.get_event_loop()
            futures = [
@@ -115,6 +127,8 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
                    chunking_strategy,
                    crawl_request.bypass_cache,
                    crawl_request.css_selector,
+                    crawl_request.screenshot,
+                    crawl_request.user_agent,
                    crawl_request.verbose
                )
                for url in crawl_request.urls
@@ -126,7 +140,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
            for result in results:
                result.html = None

-        return {"results": [result.dict() for result in results]}
+        return {"results": [result.model_dump() for result in results]}
    finally:
        async with lock:
            current_requests -= 1
--- a/pages/app.js
+++ b/pages/app.js
@@ -104,11 +104,25 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
        chunking_strategy: document.getElementById("chunking-strategy-select").value,
        chunking_strategy_args: {},
        css_selector: document.getElementById("css-selector").value,
+        screenshot: document.getElementById("screenshot-checkbox").checked,
        // instruction: document.getElementById("instruction").value,
        // semantic_filter: document.getElementById("semantic_filter").value,
        verbose: true,
    };

+    // import requests
+
+    // data = {
+    //   "urls": [
+    //     "https://www.nbcnews.com/business"
+    //   ],
+    //   "word_count_threshold": 10,
+    //   "extraction_strategy": "NoExtractionStrategy",
+    // }
+    
+    // response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally 
+    // print(response.json())
+
    // save api token to local storage
    localStorage.setItem("api_token", document.getElementById("token-input").value);

@@ -124,25 +138,61 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
            document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
            document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
            document.getElementById("markdown-result").textContent = result.markdown;
-
+            document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
+            if (result.screenshot){
+                const imgElement = document.createElement("img");
+                // Set the src attribute with the base64 data
+                imgElement.src = `data:image/png;base64,${result.screenshot}`;
+                document.getElementById("screenshot-result").innerHTML = "";
+                document.getElementById("screenshot-result").appendChild(imgElement);
+            }
+            
            // Update code examples dynamically
            const extractionStrategy = data.extraction_strategy;
            const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";

            // REMOVE API TOKEN FROM CODE EXAMPLES
            data.extraction_strategy_args.api_token = "your_api_token";
+
+            if (data.extraction_strategy === "NoExtractionStrategy") {
+                // Without an extraction strategy, drop the extraction-only fields from the generated examples.
+                delete data.extraction_strategy_args;
+                // Key name fixed: "extrac_blocks" matched no property, so the delete did nothing.
+                delete data.extract_blocks;
+            }
+
+            if (data.chunking_strategy === "RegexChunking") {
+                delete data.chunking_strategy_args;
+            }
+
+            delete data.verbose;
+
+            if (data.css_selector === "") {
+                delete data.css_selector;
+            }
+
+            if (!data.bypass_cache) {
+                delete data.bypass_cache;
+            }
+
+            if (!data.extract_blocks) {
+                delete data.extract_blocks;
+            }
+
+            if (!data.include_raw_html) {
+                delete data.include_raw_html;
+            }
+
            document.getElementById(
                "curl-code"
            ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
                ...data,
                api_token: isLLMExtraction ? "your_api_token" : undefined,
-            }, null, 2)}' http://crawl4ai.com/crawl`;
+            }, null, 2)}' https://crawl4ai.com/crawl`;

            document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
                { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
                null,
                2
-            )}\n\nresponse = requests.post("http://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
+            )}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;

            document.getElementById(
                "nodejs-code"
@@ -150,7 +200,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
                { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
                null,
                2
-            )};\n\naxios.post("http://crawl4ai.com/crawl", data) // OR local host if your run locally \n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;
+            )};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR local host if your run locally \n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;

            document.getElementById(
                "library-code"
--- a/pages/index.html
+++ b/pages/index.html
@@ -25,7 +25,7 @@
        <header class="bg-zinc-950 text-lime-500 py-4 flex">
            
            <div class="mx-auto px-4">
-                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts</h1>
+                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.4</h1>
            </div>
            <div class="mx-auto px-4 flex font-bold text-xl gap-2">
                <span>📊 Total Website Processed</span>
--- a/pages/partial/how_to_guide.html
+++ b/pages/partial/how_to_guide.html
@@ -50,6 +50,20 @@ crawler.warmup()</code></pre>
        <div>
            <pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
        </div>
+        <!-- Step 3.5 Screenshot -->
+        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
+            📸
+            <strong>Let's take a screenshot of the page!</strong>
+        </div>
+        <div>
+            <pre><code class="language-python">result = crawler.run(
+    url="https://www.nbcnews.com/business",
+    screenshot=True
+)
+with open("screenshot.png", "wb") as f:
+    f.write(base64.b64decode(result.screenshot))</code></pre>
+        </div>
+

        <!-- Step 4 -->
        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
@@ -139,13 +153,13 @@ crawler.warmup()</code></pre>
        </div>
        <div class="">Using JavaScript to click 'Load More' button:</div>
        <div>
-            <pre><code class="language-python">js_code = """
+            <pre><code class="language-python">js_code = ["""
 const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
 loadMoreButton && loadMoreButton.click();
-"""
-crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
-result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
+"""]
+crawler = WebCrawler(always_by_pass_cache=True)
+result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)</code></pre>
+        <div class="">Remember that you can pass multiple JavaScript code snippets in the list. They will all be executed in the order they are passed.</div>
        </div>

        <!-- Conclusion -->
--- a/pages/partial/try_it.html
+++ b/pages/partial/try_it.html
@@ -1,4 +1,4 @@
-<section class="try-it py-8 px-16 pb-20 bg-zinc-900">
+<section class="try-it py-8 px-16 pb-20 bg-zinc-900 overflow-hidden">
    <div class="container mx-auto ">
        <h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
        <div class="flex gap-4">
@@ -20,6 +20,7 @@
                            id="threshold"
                            class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
                        >
+                            <option value="1">1</option>
                            <option value="5">5</option>
                            <option value="10" selected>10</option>
                            <option value="15">15</option>
@@ -124,7 +125,11 @@
                        <label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
                    </div>
                    <div class="flex items-center gap-2">
-                        <input type="checkbox" id="extract-blocks-checkbox" checked />
+                        <input type="checkbox" id="screenshot-checkbox" checked />
+                        <label for="screenshot-checkbox" class="text-lime-500 font-bold">Screenshot</label>
+                    </div>
+                    <div class="flex items-center gap-2 hidden">
+                        <input type="checkbox" id="extract-blocks-checkbox" />
                        <label for="extract-blocks-checkbox" class="text-lime-500 font-bold">Extract Blocks</label>
                    </div>
                    <button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">Crawl</button>
@@ -134,7 +139,7 @@
            <div id="loading" class="hidden">
                <p class="text-white">Loading... Please wait.</p>
            </div>
-            <div id="result" class="flex-1">
+            <div id="result" class="flex-1  overflow-x-auto">
                <div class="tab-buttons flex gap-2">
                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
                        JSON
@@ -148,15 +153,23 @@
                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
                        Markdown
                    </button>
+                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
+                        Medias
+                    </button>
+                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="screenshot">
+                        Screenshot
+                    </button>
                </div>
                <div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
                    <pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
                    <pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
                    <pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
+                    <pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
+                    <pre class="hidden h-full flex"><code id="screenshot-result"></code></pre>
                </div>
            </div>

-            <div id="code_help" class="flex-1">
+            <div id="code_help" class="flex-1  overflow-x-auto">
                <div class="tab-buttons flex gap-2">
                    <button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
                        cURL
--- a/requirements.crawl.txt
+++ b/requirements.crawl.txt
@@ -0,0 +1,13 @@
+aiohttp
+aiosqlite
+bs4
+fastapi
+html2text
+httpx
+pydantic
+python-dotenv
+requests
+rich
+selenium
+uvicorn
+chromedriver-autoinstaller
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,19 +1,21 @@
-aiohttp==3.9.5
-aiosqlite==0.20.0
-bs4==0.0.2
-fastapi==0.111.0
-html2text==2024.2.26
-httpx==0.27.0
-lazy_import==0.2.2
-litellm==1.37.11
-nltk==3.8.1
-pydantic==2.7.1
-python-dotenv==1.0.1
-requests==2.31.0
-rich==13.7.1
-scikit-learn==1.4.2
-selenium==4.20.0
-uvicorn==0.29.0
-transformers==4.40.2
-chromedriver-autoinstaller==0.6.4
-torch==2.3.0
+aiohttp
+aiosqlite
+bs4
+fastapi
+html2text
+httpx
+litellm
+nltk
+pydantic
+python-dotenv
+requests
+rich
+scikit-learn
+selenium
+uvicorn
+transformers
+chromedriver-autoinstaller
+torch
+onnxruntime
+tokenizers
+pillow
--- a/setup.py
+++ b/setup.py
@@ -1,18 +1,32 @@
 from setuptools import setup, find_packages
+import os
+import subprocess
+from setuptools.command.install import install

 # Read the requirements from requirements.txt
 with open("requirements.txt") as f:
    requirements = f.read().splitlines()

+# Read the crawl-only requirements from requirements.crawl.txt
+with open("requirements.crawl.txt") as f:
+    requirements_crawl_only = f.read().splitlines()
+
 # Define the requirements for different environments
 requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
 requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
 requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
 requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
+requirements_crawl_only = [req for req in requirements_crawl_only if req.strip()]  # keep the list read from requirements.crawl.txt (was overwritten from requirements.txt); drop blank lines only
+
+class CustomInstallCommand(install):
+    """Customized setuptools install command to install spacy without dependencies."""
+    def run(self):
+        install.run(self)
+        subprocess.check_call([os.sys.executable, '-m', 'pip', 'install', 'spacy', '--no-deps'])

 setup(
    name="Crawl4AI",
-    version="0.2.0",
+    version="0.2.4",
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
    long_description=open("README.md").read(),
    long_description_content_type="text/markdown",
@@ -25,7 +39,10 @@ setup(
    extras_require={
        "all": requirements,  # Include all requirements
        "colab": requirements_without_torch,  # Exclude torch for Colab
-        "crawl": requirements_without_torch_transformers_nlkt
+        "crawl": requirements_crawl_only,  # Include only crawl requirements
+    },
+    cmdclass={
+        'install': CustomInstallCommand,
    },
    entry_points={
        'console_scripts': [
Author	SHA1	Message	Date
unclecode	77da48050d	chore: Add custom headers to LocalSeleniumCrawlerStrategy	2024-06-17 15:50:03 +08:00
unclecode	9a97aacd85	chore: Add hooks for customizing the LocalSeleniumCrawlerStrategy	2024-06-17 15:37:18 +08:00
unclecode	52daf3936a	Fix typo in README	2024-06-17 15:15:37 +08:00
unclecode	42a5da854d	Update version and change log.	2024-06-17 14:47:58 +08:00
unclecode	d1d83a6ef7	Fix issue #22 : Use MD5 hash for caching HTML files to handle long URLs	2024-06-17 14:44:01 +08:00
unclecode	194050705d	chore: Add pillow library to requirements.txt	2024-06-10 23:03:32 +08:00
unclecode	989f8c91c8	Update README	2024-06-08 18:50:35 +08:00
unclecode	edba5fb5e9	Update README	2024-06-08 18:48:21 +08:00
unclecode	faa1defa5c	Update README	2024-06-08 18:47:23 +08:00
unclecode	f7e0cee1b0	vital: Right now, only raw html is retrived from datbase, therefore, css selector and other filter will be executed every time.	2024-06-08 18:37:40 +08:00
unclecode	b3a0edaa6d	- User agent - Extract Links - Extract Metadata - Update Readme - Update REST API document	2024-06-08 17:59:42 +08:00
unclecode	9c34b30723	Extract internal and external links.	2024-06-08 16:53:06 +08:00
unclecode	36a5847df5	Add css selector example	2024-06-07 20:47:20 +08:00
unclecode	a19379aa58	Add recipe images, update README, and REST api example	2024-06-07 20:43:50 +08:00
unclecode	768d048e1c	Update rest call how to use	2024-06-07 18:10:45 +08:00
unclecode	94c11a0262	Add image	2024-06-07 18:09:21 +08:00
unclecode	649b0bfd02	feat: Remove default checked state for bypass-cache-checkbox The code changes in this commit remove the default checked state for the bypass-cache-checkbox in the try_it.html file. This allows users to manually select whether they want to bypass the cache or not. This commit message follows the established convention of starting with a type (feat for feature) and providing a concise and descriptive summary of the changes made.	2024-06-07 16:26:36 +08:00
unclecode	57a00ec677	Update Readme	2024-06-07 16:25:30 +08:00
unclecode	aeb2114170	Add example of REST API call	2024-06-07 16:24:40 +08:00
unclecode	b8d405fddd	Update version number in landing page header	2024-06-07 16:19:30 +08:00
unclecode	b32013cb97	Fix README file hyperlink	2024-06-07 15:37:05 +08:00
unclecode	226a62a3c0	feat: Add screenshot functionality to crawl_urls	2024-06-07 15:33:15 +08:00
unclecode	8e73a482a2	feat: Add screenshot functionality to crawl_urls The code changes in this commit add the `screenshot` parameter to the `crawl_urls` function in `main.py`. This allows users to specify whether they want to take a screenshot of the page during the crawling process. The default value is `False`. This commit message follows the established convention of starting with a type (feat for feature) and providing a concise and descriptive summary of the changes made.	2024-06-07 15:23:32 +08:00
unclecode	0533aeb814	v0.2.3: - Extract all media tags - Take screenshot of the page	2024-06-07 15:23:13 +08:00
unclecode	aead6de888	Merge branch 'main' of https://github.com/unclecode/crawl4ai into extract-media	2024-06-07 13:41:48 +08:00
UncleCode	8d82fd4cfe	Merge pull request #14 from gkhngyk/main Update README.md	2024-06-07 13:30:10 +08:00
Gökhan Geyik	8f44db6499	Update README.md	2024-06-05 17:16:02 +03:00
unclecode	c7553b1280	Update research assistant example with package installation instructions	2024-06-04 23:18:19 +08:00
unclecode	8b8683f22e	Add research assistant example using Chainlit	2024-06-04 22:43:09 +08:00
unclecode	774ace6e3b	Update html page for tutorial.	2024-06-02 18:00:53 +08:00
unclecode	4a8f91a0fc	Set bypass_cached to True	2024-06-02 16:12:25 +08:00
unclecode	18c9784b61	Update index.html (hide extract block check box)	2024-06-02 16:09:20 +08:00
unclecode	e5d401c67c	Update generated code sample	2024-06-02 16:06:43 +08:00
unclecode	ae77589a98	Update Readme	2024-06-02 15:42:13 +08:00
unclecode	ad373c0e19	Update Readme	2024-06-02 15:41:24 +08:00
unclecode	51f26d12fe	Update for v0.2.2 - Support multiple JS scripts - Fixed some of bugs - Resolved a few issue relevant to Colab installation	2024-06-02 15:40:18 +08:00
unclecode	f1b60b2016	chore: Update ONNX model loading process	2024-05-31 18:07:05 +08:00
UncleCode	8c2dc2b1e4	Create Dockerfile	2024-05-29 17:56:57 +08:00
UncleCode	dc9a44c12a	Update and rename Dockerfile to Dockerfile-version-0	2024-05-29 17:56:34 +08:00
UncleCode	d9753b6349	Update requirements.txt Remove tokenizer version from requirements.txt	2024-05-24 14:49:48 +08:00
UncleCode	a554c0b143	Update requirements.txt	2024-05-23 12:52:31 +08:00
UncleCode	7381fa95e6	Merge pull request #3 from QIN2DIM/main fix(main): UnicodeDecodeError	2024-05-23 09:29:28 +08:00
Unclecode	53d1176d53	chore: Update extraction strategy to support GPU, MPS, and CPU, add batch processing for CPU devices	2024-05-19 16:18:58 +00:00
unclecode	52c4be0696	Update setup.py version to 0.2.1	2024-05-19 22:30:59 +08:00
unclecode	13a3b21d19	- Add ONNX embedding model for CPU devices, Update the similarithy threshold, improve the embedding speed.	2024-05-19 22:30:10 +08:00
QIN2DIM	5cee084340	fix(main): UnicodeDecodeError File "T:\_GitHubProjects\Forks\crawl4ai\main.py", line 70, in read_index partials[filename[:-5]] = file.read() UnicodeDecodeError: 'gbk' codec can't decode byte 0xa4 in position 149: illegal multibyte sequence	2024-05-18 23:31:11 +08:00
Unclecode	bf00c26a83	chore: Update Dockerfile to install chromium-chromedriver and spacy library	2024-05-18 09:16:52 +00:00
unclecode	3846648c12	chore: Update extraction strategy to support GPU, MPS, and CPU, add batch procesing for CPU devices	2024-05-18 15:42:19 +08:00
unclecode	eb6423875f	chore: Update Selenium options in crawler_strategy.py and add verbose logging in CosineStrategy	2024-05-18 14:13:06 +08:00
unclecode	e3524a10a7	chore: Update REST API base URL in README.md	2024-05-17 23:28:29 +08:00
unclecode	468dad6169	chore: Update Dockerfile to install chromium-chromedriver and spacy library	2024-05-17 23:15:39 +08:00
UncleCode	bc27982992	Update setup.py Handle Spacy installation	2024-05-17 22:11:00 +08:00
UncleCode	57e5decb55	Update requirements.txt	2024-05-17 22:02:08 +08:00
unclecode	b6319c6f6e	chore: Add support for GPU, MPS, and CPU	2024-05-17 21:56:13 +08:00
UncleCode	0a902f562f	Update requirements.txt Add Spacy	2024-05-17 21:41:35 +08:00
UncleCode	454135856e	Update extraction_strategy.py Support GPU, MPS, and CPU	2024-05-17 21:40:48 +08:00
UncleCode	33fddc27ad	Update model loader to support GPU, MPS, and CPU	2024-05-17 21:39:22 +08:00
unclecode	ce052a4eb5	Update README	2024-05-17 18:29:59 +08:00
unclecode	b43d77a56b	Update README	2024-05-17 18:28:39 +08:00
unclecode	1635a92218	chore: Update Crawl4AI quickstart script in README.md	2024-05-17 18:25:32 +08:00
unclecode	2a8a1b27e1	chore: Update Readme	2024-05-17 18:24:47 +08:00
unclecode	f5f3cce2c8	Merge new-release-0.0.2-no-spacy into main for v0.2.0 release	2024-05-17 18:23:27 +08:00
unclecode	a085e6315b	Merge branch 'main' of https://github.com/unclecode/crawl4ai	2024-05-17 18:21:02 +08:00
unclecode	a8d600a3b4	chore: Add test_pad.py, requirements0.txt, and a.txt to .gitignore	2024-05-17 18:13:43 +08:00
UncleCode	4a2e17447b	Update README.md	2024-05-16 08:57:58 +08:00