feat: Add screenshot functionality to crawl_urls

This commit adds a `screenshot` parameter to the `crawl_urls` function in `main.py`, letting users choose whether a screenshot of the page is taken during crawling. The default value is `False`. The message follows the established convention of starting with a type (`feat` for feature) followed by a concise, descriptive summary of the changes.
2024-06-07 15:33:15 +08:00 · 2024-06-07 15:23:32 +08:00 · 2024-06-07 15:23:13 +08:00 · 2024-06-07 13:41:48 +08:00 · 2024-06-07 13:30:10 +08:00 · 2024-06-05 17:16:02 +03:00
24 changed files with 1065 additions and 112 deletions
--- a/.files/screenshot.png
+++ b/.files/screenshot.png
--- a/.gitignore
+++ b/.gitignore
@@ -173,4 +173,9 @@ Crawl4AI.egg-info/
 requirements0.txt
 a.txt

-*.sh
+*.sh
+.idea
+docs/examples/.chainlit/
+docs/examples/.chainlit/*
+.chainlit/config.toml
+.chainlit/translations/en-US.json
--- a/86
+++ b/86
@@ -1,43 +1,77 @@
-# Use an official Python runtime as a parent image
-FROM python:3.10-slim
+
+# First stage: Build and install dependencies
+FROM python:3.10-slim-bookworm as builder

 # Set the working directory in the container
 WORKDIR /usr/src/app

-# Copy the current directory contents into the container at /usr/src/app
-COPY . .
-
-# Install dependencies for Chrome and ChromeDriver
-RUN apt-get update && apt-get install -y --no-install-recommends \
+# Install build dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
    wget \
-    xvfb \
-    unzip \
    curl \
+    unzip 
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \
+    python -m spacy download en_core_web_sm
+
+# Download and install ChromeDriver
+RUN CHROMEDRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE) && \
+    wget -N https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip -P /tmp && \
+    unzip /tmp/chromedriver_linux64.zip -d /tmp && \
+    mv /tmp/chromedriver /usr/local/bin/chromedriver && \
+    chmod +x /usr/local/bin/chromedriver && \
+    rm /tmp/chromedriver_linux64.zip
+
+# Second stage: Create final runtime image
+FROM python:3.10-slim-bookworm
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Install runtime dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    wget \
+    git \
+    xvfb \
    gnupg2 \
    ca-certificates \
    apt-transport-https \
-    software-properties-common \
-    && mkdir -p /etc/apt/keyrings \
-    && curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
-    && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
-    && apt-get update \
-    && apt-get install -y google-chrome-stable \
-    && rm -rf /var/lib/apt/lists/* \
-    && apt-get install -y chromium-chromedriver
+    software-properties-common && \
+    wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - && \
+    echo "deb http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends google-chrome-stable && \
+    rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/google-chrome.list

-# Install Python dependencies
-RUN pip install --no-cache-dir -r requirements.txt
-RUN pip install spacy torch torchvision torchaudio
+# Copy Chromedriver from the builder stage
+COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver

-# Set display port and dbus env to avoid hanging
-ENV DISPLAY=:99
-ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
+# Copy installed Python packages from builder stage
+COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+
+# Copy the rest of the application code
+COPY . .
+
+# Browser/driver locations, X display, null D-Bus address (avoids hanging), unbuffered Python output
+ENV CHROME_BIN=/usr/bin/google-chrome \
+    CHROMEDRIVER=/usr/local/bin/chromedriver \
+    DISPLAY=:99 \
+    DBUS_SESSION_BUS_ADDRESS=/dev/null \
+    PYTHONUNBUFFERED=1
+
+# Keep /usr/local/bin (executables copied from the builder stage) first on PATH
+ENV PATH=/usr/local/bin:$PATH

 # Make port 80 available to the world outside this container
 EXPOSE 80

-# Define environment variable
-ENV PYTHONUNBUFFERED 1
-
 # Run uvicorn
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
+
+
--- a/45
+++ b/45
@@ -0,0 +1,45 @@
+# Use an official Python runtime as a parent image
+FROM python:3.10-slim
+# In case you had some weird issues, try this Image
+# FROM python:3.10-slim-bookworm as builder
+
+# Set the working directory in the container
+WORKDIR /usr/src/app
+
+# Copy the current directory contents into the container at /usr/src/app
+COPY . .
+
+# Install Chrome (from Google's apt repository) and a ChromeDriver package.
+# NOTE(review): Debian names the driver package `chromium-driver`; the previous
+# `chromium-chromedriver` looks like the Ubuntu name - confirm on this base image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    wget \
+    xvfb \
+    unzip \
+    curl \
+    gnupg2 \
+    ca-certificates \
+    apt-transport-https \
+    software-properties-common \
+    && mkdir -p /etc/apt/keyrings \
+    && curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
+    && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
+    && apt-get update \
+    && apt-get install -y google-chrome-stable \
+    && apt-get install -y chromium-driver \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir spacy torch torchvision torchaudio
+
+# Display port, null D-Bus address (avoids hanging) and unbuffered Python output
+ENV DISPLAY=:99 \
+    DBUS_SESSION_BUS_ADDRESS=/dev/null \
+    PYTHONUNBUFFERED=1
+
+# Make port 80 available to the world outside this container
+EXPOSE 80
+
+# Run uvicorn
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.0 🕷️🤖
+# Crawl4AI v0.2.3 🕷️🤖

 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
 [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
@@ -10,8 +10,18 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information

 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)

-## Recent Changes v0.2.0
+## Recent Changes 

+### v0.2.3
+- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
+- 🖼️ Take [screenshots](#taking-screenshots-) of the page.
+
+### v0.2.2
+- Support multiple JS scripts
+- Fixed some bugs
+- Resolved a few issues related to Colab installation
+
+### v0.2.0
 - 🚀 10x faster!!
 - 📜 Execute custom JavaScript before crawling!
 - 🤝 Colab friendly!
@@ -30,13 +40,28 @@ from crawl4ai import WebCrawler
 # Create the WebCrawler instance 
 crawler = WebCrawler() 

-
-
 # Run the crawler with keyword filtering and CSS selector
 result = crawler.run(url="https://www.nbcnews.com/business")
 print(result) # {url, html, markdown, extracted_content, metadata}
 ```

+If you don't want to install Selenium, you can use the REST API or local server. 
+
+```python
+import requests
+
+data = {
+  "urls": [
+    "https://www.nbcnews.com/business"
+  ],
+  "word_count_threshold": 10,
+  "extraction_strategy": "NoExtractionStrategy",
+}
+
+response = requests.post("https://crawl4ai.com/crawl", json=data) # or localhost if you run it locally
+print(response.json())
+```
+
 Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!

 1. Instantiate a WebCrawler object.
@@ -208,7 +233,7 @@ To use the REST API, send a POST request to `http://localhost:8000/crawl` with t
 }
 ```

-For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters) section.
+For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters-) section.


 ## Python Library Usage 🚀
@@ -241,6 +266,14 @@ Crawl result without raw HTML content:
 result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
 ```

+### Taking Screenshots
+
+```python
+result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
+with open("screenshot.png", "wb") as f:
+    f.write(base64.b64decode(result.screenshot))
+```
+
 ### Adding a chunking strategy: RegexChunking

 Using RegexChunking:
@@ -347,6 +380,7 @@ result = crawler.run(url="https://www.nbcnews.com/business")
 | `urls`                | A list of URLs to crawl and extract data from.                                                        | Yes      | -                   |
 | `include_raw_html`    | Whether to include the raw HTML content in the response.                                              | No       | `false`             |
 | `bypass_cache`        | Whether to force a fresh crawl even if the URL has been previously crawled.                           | No       | `false`             |
+| `screenshot`          | Whether to take screenshots of the page.                                                              | No       | `false`             |
 | `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5).    | No       | `5`                 |
 | `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy").                    | No       | `NoExtractionStrategy`    |
 | `chunking_strategy`   | The strategy to use for chunking the text before processing (e.g., "RegexChunking").                  | No       | `RegexChunking`     |
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -7,6 +7,15 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import InvalidArgumentException
 import logging
+import base64
+from PIL import Image, ImageDraw, ImageFont
+from io import BytesIO
+from typing import List
+import requests
+import os
+from pathlib import Path
+from .utils import wrap_text
+
 logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
 logger.setLevel(logging.WARNING)

@@ -25,15 +34,16 @@ driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finde
 driver_finder_logger.setLevel(logging.WARNING)


-from typing import List
-import requests
-import os
-from pathlib import Path
+

 class CrawlerStrategy(ABC):
    @abstractmethod
    def crawl(self, url: str, **kwargs) -> str:
        pass
+    
+    @abstractmethod
+    def take_screenshot(self) -> str:
+        """Return a screenshot of the current page as a base64-encoded string."""

 class CloudCrawlerStrategy(CrawlerStrategy):
    def __init__(self, use_cached_html = False):
@@ -103,12 +113,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
            )
            
            # Execute JS code if provided
-            if self.js_code:
+            if self.js_code and type(self.js_code) == str:
                self.driver.execute_script(self.js_code)
                # Optionally, wait for some condition after executing the JS code
                WebDriverWait(self.driver, 10).until(
                    lambda driver: driver.execute_script("return document.readyState") == "complete"
                )
+            elif self.js_code and type(self.js_code) == list:
+                for js in self.js_code:
+                    self.driver.execute_script(js)
+                    WebDriverWait(self.driver, 10).until(
+                        lambda driver: driver.execute_script("return document.readyState") == "complete"
+                    )
            
            html = self.driver.page_source
            
@@ -126,5 +142,62 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        except Exception as e:
            raise Exception(f"Failed to crawl {url}: {str(e)}")

+    def take_screenshot(self) -> str:
+        """Capture the full page and return it as a base64-encoded JPEG string.
+
+        On any failure a black 800x600 placeholder image carrying the error
+        message is returned instead, so callers always receive image data.
+        """
+        try:
+            # Get the dimensions of the page
+            total_width = self.driver.execute_script("return document.body.scrollWidth")
+            total_height = self.driver.execute_script("return document.body.scrollHeight")
+
+            # Set the window size to the dimensions of the page
+            self.driver.set_window_size(total_width, total_height)
+
+            # Take screenshot
+            screenshot = self.driver.get_screenshot_as_png()
+
+            # JPEG has no alpha channel: a PNG that decodes with alpha or a
+            # palette cannot be written as JPEG, so normalise to RGB first
+            image = Image.open(BytesIO(screenshot)).convert("RGB")
+
+            # Convert to JPEG and compress
+            buffered = BytesIO()
+            image.save(buffered, format="JPEG", quality=85)
+            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+
+            if self.verbose:
+                print(f"[LOG] 📸 Screenshot taken and converted to base64")
+
+            return img_base64
+
+        except Exception as e:
+            error_message = f"Failed to take screenshot: {str(e)}"
+            print(error_message)
+
+            # Generate an image with black background
+            img = Image.new('RGB', (800, 600), color='black')
+            draw = ImageDraw.Draw(img)
+
+            # Load a font
+            try:
+                font = ImageFont.truetype("arial.ttf", 40)
+            except IOError:
+                try:
+                    font = ImageFont.load_default(size=40)
+                except TypeError:  # older Pillow: load_default() takes no size
+                    font = ImageFont.load_default()
+
+            # Wrap the message to 780px and draw it in white at a 10px offset
+            wrapped_text = wrap_text(draw, error_message, font, 780)
+            draw.text((10, 10), wrapped_text, fill=(255, 255, 255), font=font)
+
+            # Convert to base64
+            buffered = BytesIO()
+            img.save(buffered, format="JPEG")
+            return base64.b64encode(buffered.getvalue()).decode('utf-8')
+
    def quit(self):
        self.driver.quit()
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -1,13 +1,12 @@
 import os
 from pathlib import Path
 import sqlite3
-from typing import Optional
 from typing import Optional, Tuple

 DB_PATH = os.path.join(Path.home(), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
-        
+
 def init_db():
    global DB_PATH
    conn = sqlite3.connect(DB_PATH)
@@ -19,22 +18,35 @@ def init_db():
            cleaned_html TEXT,
            markdown TEXT,
            extracted_content TEXT,
-            success BOOLEAN
+            success BOOLEAN,
+            media TEXT DEFAULT "{}",
+            screenshot TEXT DEFAULT ""
        )
    ''')
    conn.commit()
    conn.close()

-def check_db_path():
-    if not DB_PATH:
-        raise ValueError("Database path is not set or is empty.")
-
-def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
+def alter_db_add_screenshot(new_column: str = "media"):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
-        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,))
+        cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
+        conn.commit()
+        conn.close()
+    except Exception as e:
+        print(f"Error altering database to add screenshot column: {e}")
+
+def check_db_path():
+    if not DB_PATH:
+        raise ValueError("Database path is not set or is empty.")
+
+def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]:
+    check_db_path()
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        cursor = conn.cursor()
+        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, screenshot FROM crawled_data WHERE url = ?', (url,))
        result = cursor.fetchone()
        conn.close()
        return result
@@ -42,21 +54,23 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
        print(f"Error retrieving cached URL: {e}")
        return None

-def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
+def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", screenshot: str = ""):  # upsert one crawl result keyed on url
     check_db_path()
     try:
         conn = sqlite3.connect(DB_PATH)
         cursor = conn.cursor()
         cursor.execute('''
-            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
-            VALUES (?, ?, ?, ?, ?, ?)
+            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, screenshot)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
             ON CONFLICT(url) DO UPDATE SET
                 html = excluded.html,
                 cleaned_html = excluded.cleaned_html,
                 markdown = excluded.markdown,
                 extracted_content = excluded.extracted_content,
-                success = excluded.success
-        ''', (url, html, cleaned_html, markdown, extracted_content, success))
+                success = excluded.success,
+                media = excluded.media,                
+                screenshot = excluded.screenshot
+        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, screenshot))  # one bound value per listed column
         conn.commit()
         conn.close()
     except Exception as e:
@@ -95,4 +109,20 @@ def flush_db():
        conn.commit()
        conn.close()
    except Exception as e:
-        print(f"Error flushing database: {e}")
+        print(f"Error flushing database: {e}")
+
+def update_existing_records(new_column: str = "media", default_value: str = "{}"):
+    """Set `new_column` to `default_value` on rows where it is NULL; the column name is interpolated, so it must be trusted."""
+    check_db_path()
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        conn.execute(f'UPDATE crawled_data SET {new_column} = ? WHERE {new_column} IS NULL', (default_value,))
+        conn.commit()
+        conn.close()
+    except Exception as e:
+        print(f"Error updating existing records: {e}")
+
+if __name__ == "__main__":
+    init_db()  # Initialize the database if not already initialized
+    alter_db_add_screenshot()  # Add the new column to the table
+    update_existing_records()  # Backfill the new column on existing records with its default value
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -188,14 +188,15 @@ class CosineStrategy(ExtractionStrategy):
        if self.verbose:
            print(f"[LOG] Loading Extraction Model for {self.device.type} device.")

-        if False and self.device.type == "cpu":
-            self.model = load_onnx_all_MiniLM_l6_v2()
-            self.tokenizer = self.model.tokenizer
-            self.get_embedding_method = "direct"
-        else:
-            self.tokenizer, self.model = load_bge_small_en_v1_5()
-            self.model.eval()  
-            self.get_embedding_method = "batch"
+        # if False and self.device.type == "cpu":
+        #     self.model = load_onnx_all_MiniLM_l6_v2()
+        #     self.tokenizer = self.model.tokenizer
+        #     self.get_embedding_method = "direct"
+        # else:
+
+        self.tokenizer, self.model = load_bge_small_en_v1_5()
+        self.model.eval()  
+        self.get_embedding_method = "batch"
        
        self.buffer_embeddings = np.array([])

--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -2,6 +2,7 @@ from functools import lru_cache
 from pathlib import Path
 import subprocess, os
 import shutil
+import tarfile
 from crawl4ai.config import MODEL_REPO_BRANCH
 import argparse
 import urllib.request
@@ -34,8 +35,7 @@ def calculate_batch_size(device):
        else:
            return 32
    else:
-        return 16  # Default batch size
-    
+        return 16  # Default batch size   
    
@lru_cache()
 def get_device():
@@ -82,12 +82,19 @@ def load_bge_small_en_v1_5():
@lru_cache()
 def load_onnx_all_MiniLM_l6_v2():
    from crawl4ai.onnx_embedding import DefaultEmbeddingModel
-    model_path = "models/onnx/model.onnx"
-    model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/model.onnx"
-    download_path = os.path.join(__location__, model_path)

+    model_path = "models/onnx.tar.gz"
+    model_url = "https://unclecode-files.s3.us-west-2.amazonaws.com/onnx.tar.gz"
+    __location__ = os.path.realpath(
+        os.path.join(os.getcwd(), os.path.dirname(__file__)))
+    download_path = os.path.join(__location__, model_path)
+    onnx_dir = os.path.join(__location__, "models/onnx")
+    
+    # Create the models directory if it does not exist
+    os.makedirs(os.path.dirname(download_path), exist_ok=True)
+
+    # Download the tar.gz file if it does not exist
    if not os.path.exists(download_path):
-        # Define a download function with a simple progress display
        def download_with_progress(url, filename):
            def reporthook(block_num, block_size, total_size):
                downloaded = block_num * block_size
@@ -95,12 +102,22 @@ def load_onnx_all_MiniLM_l6_v2():
                if downloaded < total_size:
                    print(f"\rDownloading: {percentage:.2f}% ({downloaded / (1024 * 1024):.2f} MB of {total_size / (1024 * 1024):.2f} MB)", end='')
                else:
-                    print("\rDownload complete!                              ")
+                    print("\rDownload complete!")

            urllib.request.urlretrieve(url, filename, reporthook)

        download_with_progress(model_url, download_path)

+    # Extract the tar.gz file if the onnx directory does not exist
+    if not os.path.exists(onnx_dir):
+        with tarfile.open(download_path, "r:gz") as tar:
+            tar.extractall(path=os.path.join(__location__, "models"))
+        
+        # remove the tar.gz file
+        os.remove(download_path)
+    
+    
+    
    model = DefaultEmbeddingModel()
    return model

@@ -240,8 +257,8 @@ def download_all_models(remove_existing=False):
    # load_bert_base_uncased()
    # print("[LOG] Downloading BGE Small EN v1.5...")
    # load_bge_small_en_v1_5()
-    print("[LOG] Downloading ONNX model...")
-    load_onnx_all_MiniLM_l6_v2()
+    # print("[LOG] Downloading ONNX model...")
+    # load_onnx_all_MiniLM_l6_v2()
    print("[LOG] Downloading text classifier...")
    _, device = load_text_multilabel_classifier()
    print(f"[LOG] Text classifier loaded on {device}")
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -1,5 +1,5 @@
 from pydantic import BaseModel, HttpUrl
-from typing import List
+from typing import List, Dict, Optional

 class UrlModel(BaseModel):
    url: HttpUrl
@@ -9,8 +9,10 @@ class CrawlResult(BaseModel):
    url: str
    html: str
    success: bool
-    cleaned_html: str = None
-    markdown: str = None
-    extracted_content: str = None
-    metadata: dict = None
-    error_message: str = None
+    cleaned_html: Optional[str] = None
+    media: Dict[str, List[Dict]] = {}
+    screenshot: Optional[str] = None
+    markdown: Optional[str] = None
+    extracted_content: Optional[str] = None
+    metadata: Optional[dict] = None
+    error_message: Optional[str] = None
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -180,6 +180,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
            if tag.name != 'img':
                tag.attrs = {}

+        # Extract all img tags into [{src: '', alt: ''}]
+        media = {
+            'images': [],
+            'videos': [],
+            'audios': []
+        }
+        for img in body.find_all('img'):
+            media['images'].append({
+                'src': img.get('src'),
+                'alt': img.get('alt'),
+                "type": "image"
+            })
+            
+        # Extract all video tags into [{src: '', alt: ''}]
+        for video in body.find_all('video'):
+            media['videos'].append({
+                'src': video.get('src'),
+                'alt': video.get('alt'),
+                "type": "video"
+            })
+            
+        # Extract all audio tags into [{src: '', alt: ''}]
+        for audio in body.find_all('audio'):
+            media['audios'].append({
+                'src': audio.get('src'),
+                'alt': audio.get('alt'),
+                "type": "audio"
+            })
+        
        # Replace images with their alt text or remove them if no alt text is available
        for img in body.find_all('img'):
            alt_text = img.get('alt')
@@ -299,7 +328,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
        return{
            'markdown': markdown,
            'cleaned_html': cleaned_html,
-            'success': True
+            'success': True,
+            'media': media
        }

    except Exception as e:
@@ -483,4 +513,16 @@ def process_sections(url: str, sections: list, provider: str, api_token: str) ->
            for future in as_completed(futures):
                extracted_content.extend(future.result())
    
-    return extracted_content
+    return extracted_content
+
+
+def wrap_text(draw, text, font, max_width):
+    # Greedily pack words into lines of at most max_width pixels, joined by newlines
+    lines = []
+    words = text.split()
+    while words:
+        line = words.pop(0) + ' '  # always consume one word, so an over-wide word cannot stall the loop
+        while words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width:
+            line += (words.pop(0) + ' ')
+        lines.append(line)
+    return '\n'.join(lines)
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -59,6 +59,8 @@ class WebCrawler:
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
+        css_selector: str = None,
+        screenshot: bool = False,
        use_cached_html: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
@@ -70,6 +72,8 @@ class WebCrawler:
            extraction_strategy or NoExtractionStrategy(),
            chunking_strategy,
            bypass_cache=url_model.forced,
+            css_selector=css_selector,
+            screenshot=screenshot,
            **kwargs,
        )
        pass
@@ -83,6 +87,7 @@ class WebCrawler:
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        bypass_cache: bool = False,
        css_selector: str = None,
+        screenshot: bool = False,
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
@@ -110,6 +115,8 @@ class WebCrawler:
                        "markdown": cached[3],
                        "extracted_content": cached[4],
                        "success": cached[5],
+                        "media": json.loads(cached[6] or "{}"),
+                        "screenshot": cached[7],
                        "error_message": "",
                    }
                )
@@ -117,6 +124,9 @@ class WebCrawler:
        # Initialize WebDriver for crawling
        t = time.time()
        html = self.crawler_strategy.crawl(url)
+        base64_image = None
+        if screenshot:
+            base64_image = self.crawler_strategy.take_screenshot()
        success = True
        error_message = ""
        # Extract content from HTML
@@ -129,6 +139,7 @@ class WebCrawler:
        
        cleaned_html = result.get("cleaned_html", html)
        markdown = result.get("markdown", "")
+        media = result.get("media", [])

        # Print a profession LOG style message, show time taken and say crawling is done
        if verbose:
@@ -163,6 +174,8 @@ class WebCrawler:
            markdown,
            extracted_content,
            success,
+            json.dumps(media),
+            screenshot=base64_image,
        )

        return CrawlResult(
@@ -170,6 +183,8 @@ class WebCrawler:
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown,
+            media=media,
+            screenshot=base64_image,
            extracted_content=extracted_content,
            success=success,
            error_message=error_message,
@@ -183,6 +198,8 @@ class WebCrawler:
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        use_cached_html: bool = False,
+        css_selector: str = None,
+        screenshot: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
@@ -200,6 +217,8 @@ class WebCrawler:
                    [api_token] * len(url_models),
                    [extract_blocks_flag] * len(url_models),
                    [word_count_threshold] * len(url_models),
+                    [css_selector] * len(url_models),
+                    [screenshot] * len(url_models),
                    [use_cached_html] * len(url_models),
                    [extraction_strategy] * len(url_models),
                    [chunking_strategy] * len(url_models),
--- a/docs/examples/assets/audio.mp3
+++ b/docs/examples/assets/audio.mp3
--- a/docs/examples/chainlit.md
+++ b/docs/examples/chainlit.md
@@ -0,0 +1,3 @@
+# Welcome to Crawl4AI! 🚀🤖
+
+Hi there, Developer! 👋 Here is an example of a research pipeline, where you can share a URL in your conversation with any LLM, and then the context of crawled pages will be used as the context.
--- a/docs/examples/chainlit_review.py
+++ b/docs/examples/chainlit_review.py
@@ -0,0 +1,281 @@
+from openai import AsyncOpenAI
+from chainlit.types import ThreadDict
+import chainlit as cl
+from chainlit.input_widget import Select, Switch, Slider
+# Module-level async OpenAI client, created with default arguments and used by the message handlers below.
+client = AsyncOpenAI()
+
+# Instrument the OpenAI client
+cl.instrument_openai()
+
+# Keyword arguments unpacked into every `client.chat.completions.create(...)` call in this file.
+settings = {
+    "model": "gpt-3.5-turbo",
+    "temperature": 0.5,
+    "max_tokens": 500,
+    "top_p": 1,
+    "frequency_penalty": 0,
+    "presence_penalty": 0,
+}
+
+@cl.action_callback("action_button")
+async def on_action(action: cl.Action):
+    """Callback for the "action_button" action: log the click and return a thank-you string."""
+    reply = "Thank you for clicking on the action button!"
+    print("The user clicked on the action button!")
+    return reply
+
+@cl.set_chat_profiles
+async def chat_profile():
+    """Return the two chat profiles (GPT-3.5 and GPT-4) registered by this example."""
+    gpt35_profile = cl.ChatProfile(
+        name="GPT-3.5",
+        markdown_description="The underlying LLM model is **GPT-3.5**.",
+        icon="https://picsum.photos/200",
+    )
+    gpt4_profile = cl.ChatProfile(
+        name="GPT-4",
+        markdown_description="The underlying LLM model is **GPT-4**.",
+        icon="https://picsum.photos/250",
+    )
+    return [gpt35_profile, gpt4_profile]
+
+@cl.on_chat_start
+async def on_chat_start():
+    """Chat-start callback that walks through a series of Chainlit UI features.
+
+    In order, it: sends a chat-settings panel, announces the selected chat profile,
+    stores empty per-session state under the "session" key, and then sends demo
+    messages carrying an image, a text element, an audio file, an avatar, a file
+    element and an action button.
+    """
+    
+    # NOTE: this local `settings` holds the result of sending the settings panel and
+    # shadows the module-level `settings` dict inside this function only.
+    settings = await cl.ChatSettings(
+        [
+            Select(
+                id="Model",
+                label="OpenAI - Model",
+                values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"],
+                initial_index=0,
+            ),
+            Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True),
+            Slider(
+                id="Temperature",
+                label="OpenAI - Temperature",
+                initial=1,
+                min=0,
+                max=2,
+                step=0.1,
+            ),
+            Slider(
+                id="SAI_Steps",
+                label="Stability AI - Steps",
+                initial=30,
+                min=10,
+                max=150,
+                step=1,
+                description="Amount of inference steps performed on image generation.",
+            ),
+            Slider(
+                id="SAI_Cfg_Scale",
+                label="Stability AI - Cfg_Scale",
+                initial=7,
+                min=1,
+                max=35,
+                step=0.1,
+                description="Influences how strongly your generation is guided to match your prompt.",
+            ),
+            Slider(
+                id="SAI_Width",
+                label="Stability AI - Image Width",
+                initial=512,
+                min=256,
+                max=2048,
+                step=64,
+                tooltip="Measured in pixels",
+            ),
+            Slider(
+                id="SAI_Height",
+                label="Stability AI - Image Height",
+                initial=512,
+                min=256,
+                max=2048,
+                step=64,
+                tooltip="Measured in pixels",
+            ),
+        ]
+    ).send()
+    
+    # Tell the user which chat profile is stored in the session.
+    chat_profile = cl.user_session.get("chat_profile")
+    await cl.Message(
+        content=f"starting chat using the {chat_profile} chat profile"
+    ).send()
+    
+    print("A new chat session has started!")
+    # Per-session state; the message handlers in this file append to "history".
+    cl.user_session.set("session", {
+        "history": [],
+        "context": []
+    })  
+    
+    image = cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline")
+
+    # Attach the image to the message
+    await cl.Message(
+        content="You are such a good girl, aren't you?!",
+        elements=[image],
+    ).send()
+    
+    # Inline text element demo
+    text_content = "Hello, this is a text element."
+    elements = [
+        cl.Text(name="simple_text", content=text_content, display="inline")
+    ]
+
+    await cl.Message(
+        content="Check out this text element!",
+        elements=elements,
+    ).send()
+    
+    # Inline audio element demo; the path is relative to the current working directory
+    elements = [
+        cl.Audio(path="./assets/audio.mp3", display="inline"),
+    ]
+    await cl.Message(
+        content="Here is an audio file",
+        elements=elements,
+    ).send()
+    
+    # Avatar demo: an avatar is sent for "Tool 1" only, then one message per author name
+    await cl.Avatar(
+        name="Tool 1",
+        url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
+    ).send()
+    
+    await cl.Message(
+        content="This message should not have an avatar!", author="Tool 0"
+    ).send()
+    
+    await cl.Message(
+        content="This message should have an avatar!", author="Tool 1"
+    ).send()
+    
+    # File element demo; the path is relative to the current working directory
+    elements = [
+        cl.File(
+            name="quickstart.py",
+            path="./quickstart.py",
+            display="inline",
+        ),
+    ]
+
+    await cl.Message(
+        content="This message has a file element", elements=elements
+    ).send()
+    
+    # Sending an action button within a chatbot message
+    actions = [
+        cl.Action(name="action_button", value="example_value", description="Click me!")
+    ]
+
+    await cl.Message(content="Interact with this action button:", actions=actions).send()
+    
+    # res = await cl.AskActionMessage(
+    #     content="Pick an action!",
+    #     actions=[
+    #         cl.Action(name="continue", value="continue", label="✅ Continue"),
+    #         cl.Action(name="cancel", value="cancel", label="❌ Cancel"),
+    #     ],
+    # ).send()
+
+    # if res and res.get("value") == "continue":
+    #     await cl.Message(
+    #         content="Continue!",
+    #     ).send()
+    
+    # import plotly.graph_objects as go
+    # fig = go.Figure(
+    #     data=[go.Bar(y=[2, 1, 3])],
+    #     layout_title_text="An example figure",
+    # )
+    # elements = [cl.Plotly(name="chart", figure=fig, display="inline")]
+
+    # await cl.Message(content="This message has a chart", elements=elements).send()
+    
+    # Sending a pdf with the local file path
+    # elements = [
+    #   cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf")
+    # ]
+
+    # cl.Message(content="Look at this local pdf!", elements=elements).send()    
+
+@cl.on_settings_update
+async def setup_agent(settings):
+    """Settings-update callback: print the settings value it receives."""
+    tag = "on_settings_update"
+    print(tag, settings)
+    
+@cl.on_stop
+def on_stop():
+    """Stop callback: print a notice that the user wants to stop the task."""
+    notice = "The user wants to stop the task!"
+    print(notice)
+
+@cl.on_chat_end
+def on_chat_end():
+    """Chat-end callback: print a notice that the user disconnected."""
+    notice = "The user disconnected!"
+    print(notice)
+
+
+@cl.on_chat_resume
+async def on_chat_resume(thread: ThreadDict):
+    """Chat-resume callback: print a notice; the `thread` argument is not used."""
+    notice = "The user resumed a previous chat session!"
+    print(notice)
+
+
+
+
+# @cl.on_message
+# NOTE(review): with the decorator above commented out this function is not registered as a
+# handler here, and the name `on_message` is rebound by the later definition in this file.
+async def on_message(message: cl.Message):
+    """Non-streaming variant: request one full completion and record both turns in the session history.
+
+    Nothing is sent back to the UI; the calls that would do so are commented out at the end.
+    """
+    # Record the user's turn first so it is included in the prompt below.
+    cl.user_session.get("session")["history"].append({
+        "role": "user",
+        "content": message.content
+    })    
+    # Prompt = fixed system message followed by the whole session history.
+    response = await client.chat.completions.create(
+        messages=[
+            {
+                "content": "You are a helpful bot",
+                "role": "system"
+            },
+            *cl.user_session.get("session")["history"]
+        ],
+        **settings
+    )
+    
+
+    # Add assistant message to the history
+    cl.user_session.get("session")["history"].append({
+        "role": "assistant",
+        "content": response.choices[0].message.content
+    })
+    
+    # msg.content = response.choices[0].message.content
+    # await msg.update()
+    
+    # await cl.Message(content=response.choices[0].message.content).send()
+
+@cl.on_message
+async def on_message(message: cl.Message):
+    """Stream a chat completion for the incoming message and record both turns in the session history."""
+    # Record the user's turn first so it is part of the prompt sent below.
+    user_turn = {"role": "user", "content": message.content}
+    cl.user_session.get("session")["history"].append(user_turn)
+
+    # Send an empty message up front; streamed tokens are written into it.
+    msg = cl.Message(content="")
+    await msg.send()
+
+    system_turn = {"content": "You are a helpful bot", "role": "system"}
+    prompt = [system_turn, *cl.user_session.get("session")["history"]]
+    stream = await client.chat.completions.create(messages=prompt, stream=True, **settings)
+
+    async for part in stream:
+        # A missing or empty delta is normalised to "" and skipped.
+        token = part.choices[0].delta.content or ""
+        if token:
+            await msg.stream_token(token)
+
+    # Add the assistant message to the history
+    assistant_turn = {"role": "assistant", "content": msg.content}
+    cl.user_session.get("session")["history"].append(assistant_turn)
+    await msg.update()
+
+# When executed as a script, hand this file to chainlit's `run_chainlit`.
+if __name__ == "__main__":
+    from chainlit.cli import run_chainlit
+    run_chainlit(__file__)
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -39,6 +39,16 @@ def basic_usage(crawler):
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

+def screenshot_usage(crawler):
+    """Crawl the NBC business page with `screenshot=True` and write the decoded image to 'screenshot.png'."""
+    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
+    target_url = "https://www.nbcnews.com/business"
+    result = crawler.run(url=target_url, screenshot=True)
+    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
+    # `result.screenshot` is base64 text; decode it to raw bytes before writing.
+    with open("screenshot.png", "wb") as png_file:
+        png_bytes = base64.b64decode(result.screenshot)
+        png_file.write(png_bytes)
+    cprint("Screenshot saved to 'screenshot.png'!")
+    print_result(result)
+
 def understanding_parameters(crawler):
    cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
    cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
@@ -164,6 +174,22 @@ def interactive_extraction(crawler):
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)

+def multiple_scrip(crawler):
+    """Demo: hand the crawler strategy a list of JavaScript snippets (the same 'Load More' click twice)."""
+    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
+    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
+    click_load_more = """
+    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
+    loadMoreButton && loadMoreButton.click();
+    """
+    js_code = [click_load_more, click_load_more]
+    # The incoming `crawler` is replaced by a fresh one built around the JS-enabled strategy.
+    strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    crawler = WebCrawler(crawler_strategy=strategy, always_by_pass_cache=True)
+    result = crawler.run(
+        url="https://www.nbcnews.com/business",
+    )
+    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
+    print_result(result)
+
 def main():
    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
    cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
@@ -175,11 +201,13 @@ def main():
    understanding_parameters(crawler)
    
    crawler.always_by_pass_cache = True
+    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)
+    multiple_scrip(crawler)

    cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")

--- a/docs/examples/research_assistant.py
+++ b/docs/examples/research_assistant.py
@@ -0,0 +1,241 @@
+# Make sure to install the required packages: chainlit and groq
+import os, time
+from openai import AsyncOpenAI
+import chainlit as cl
+import re
+import requests
+from io import BytesIO
+from chainlit.element import ElementBased
+from groq import Groq
+
+# Import threadpools to run the crawl_url function in a separate thread
+from concurrent.futures import ThreadPoolExecutor
+
+# Async OpenAI-style client pointed at the Groq base URL; the API key is read from GROQ_API_KEY.
+client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
+
+# Instrument the OpenAI client
+cl.instrument_openai()
+
+# Keyword arguments unpacked into the chat-completion request made in `on_message`.
+settings = {
+    "model": "llama3-8b-8192",
+    "temperature": 0.5,
+    "max_tokens": 500,
+    "top_p": 1,
+    "frequency_penalty": 0,
+    "presence_penalty": 0,
+}
+
+def extract_urls(text):
+    """Return every http(s) URL found in `text`, in order of appearance.
+
+    A URL is matched as `http://` or `https://` followed by a run of non-whitespace characters.
+    """
+    return re.findall(r'(https?://\S+)', text)
+
+def crawl_url(url, timeout=120):
+    """Crawl a single URL through the hosted Crawl4AI endpoint and return its markdown.
+
+    Args:
+        url: Page to crawl.
+        timeout: Seconds to wait for the HTTP response before giving up.
+
+    Returns:
+        The `markdown` field of the first entry in the response's `results` list.
+
+    Raises:
+        requests.RequestException: On connection errors, timeouts, or a non-2xx status.
+    """
+    payload = {
+        "urls": [url],
+        "include_raw_html": True,
+        "word_count_threshold": 10,
+        "extraction_strategy": "NoExtractionStrategy",
+        "chunking_strategy": "RegexChunking"
+    }
+    # Without an explicit timeout `requests` can wait indefinitely on a stalled server.
+    response = requests.post("https://crawl4ai.com/crawl", json=payload, timeout=timeout)
+    # Fail loudly on HTTP errors instead of trying to parse an error body as a crawl result.
+    response.raise_for_status()
+    return response.json()['results'][0]['markdown']
+
+@cl.on_chat_start
+async def on_chat_start():
+    """Store empty per-session state under "session" and send a welcome message."""
+    fresh_state = {"history": [], "context": {}}
+    cl.user_session.set("session", fresh_state)
+    greeting = cl.Message(content="Welcome to the chat! How can I assist you today?")
+    await greeting.send()
+
+@cl.on_message
+async def on_message(message: cl.Message):
+    """Answer a user message, crawling any URLs it contains and using them as context.
+
+    Steps:
+      1. Extract URLs from the message and crawl them concurrently.
+      2. Store each crawled page in the session context under the next `REF_<n>` key.
+      3. Build a system message from every stored page (or a plain one when there is none).
+      4. Stream the model's reply into a Chainlit message and append it to the history.
+      5. If any context exists, append a "References" footer listing each REF number and URL.
+
+    Args:
+        message: The incoming Chainlit message; only `message.content` is read.
+    """
+    import asyncio  # local import: only this handler needs it
+
+    user_session = cl.user_session.get("session")
+
+    # Extract URLs from the user's message
+    urls = extract_urls(message.content)
+
+    # crawl_url blocks on network I/O, so run each crawl in a worker thread and await them
+    # together; this keeps the event loop free while pages are fetched. `gather` returns
+    # results in the same order as `urls` (and an empty list when there are no URLs).
+    results = await asyncio.gather(*(asyncio.to_thread(crawl_url, url) for url in urls))
+
+    for url, result in zip(urls, results):
+        ref_number = f"REF_{len(user_session['context']) + 1}"
+        user_session["context"][ref_number] = {
+            "url": url,
+            "content": result
+        }
+
+    user_session["history"].append({
+        "role": "user",
+        "content": message.content
+    })
+
+    # Create a system message that includes the context
+    context_messages = [
+        f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
+        for ref, data in user_session["context"].items()
+    ]
+    if context_messages:
+        instructions = (
+            "You are a helpful bot. Use the following context for answering questions. "
+            "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
+            "If the question requires any information from the provided appendices or context, refer to the sources. "
+            "If not, there is no need to add a references section. "
+            "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
+        )
+        # Concatenate explicitly. Writing "\n\n".join(...) directly after the adjacent string
+        # literals would merge them into one literal and use the whole instruction text as the
+        # join separator, dropping the instructions when there is a single appendix.
+        system_message = {
+            "role": "system",
+            "content": instructions + "\n\n".join(context_messages)
+        }
+    else:
+        system_message = {
+            "role": "system",
+            "content": "You are a helpful assistant."
+        }
+
+    # Send an empty message up front; streamed tokens are written into it.
+    msg = cl.Message(content="")
+    await msg.send()
+
+    # Get response from the LLM
+    stream = await client.chat.completions.create(
+        messages=[
+            system_message,
+            *user_session["history"]
+        ],
+        stream=True,
+        **settings
+    )
+
+    assistant_response = ""
+    async for part in stream:
+        if token := part.choices[0].delta.content:
+            assistant_response += token
+            await msg.stream_token(token)
+
+    # Add assistant message to the history (without the references footer)
+    user_session["history"].append({
+        "role": "assistant",
+        "content": assistant_response
+    })
+    await msg.update()
+
+    # Append the reference section only when there is something to reference; otherwise the
+    # reply would end with a dangling "References:" header.
+    if user_session["context"]:
+        reference_lines = "".join(
+            f"[{ref.split('_')[1]}]: {data['url']}\n"
+            for ref, data in user_session["context"].items()
+        )
+        msg.content += "\n\nReferences:\n" + reference_lines
+        await msg.update()
+
+
+@cl.on_audio_chunk
+async def on_audio_chunk(chunk: cl.AudioChunk):
+    """Append an incoming audio chunk to the session buffer; a start chunk creates the buffer first."""
+    if chunk.isStart:
+        new_buffer = BytesIO()
+        # This is required for whisper to recognize the file type:
+        # the extension is taken from the part of the MIME type after the "/".
+        extension = chunk.mimeType.split('/')[1]
+        new_buffer.name = f"input_audio.{extension}"
+        # Initialize the session for a new audio stream
+        cl.user_session.set("audio_buffer", new_buffer)
+        cl.user_session.set("audio_mime_type", chunk.mimeType)
+
+    # Every chunk, including the first, is written to the buffer; the whole audio is transcribed at the end.
+    cl.user_session.get("audio_buffer").write(chunk.data)
+
+@cl.step(type="tool")
+async def speech_to_text(audio_file):
+    """Transcribe audio with the `whisper-large-v3` model via the module-level async client.
+
+    Args:
+        audio_file: Value passed as `file=` to the transcription call (the caller in this
+            file passes a `(name, bytes, mime_type)` tuple).
+
+    Returns:
+        The `text` attribute of the transcription response.
+    """
+    # The request goes through the shared async `client`; no separate synchronous
+    # Groq client needs to be constructed per call.
+    response = await client.audio.transcriptions.create(
+        model="whisper-large-v3", file=audio_file
+    )
+
+    return response.text
+
+
+@cl.on_audio_end
+async def on_audio_end(elements: list[ElementBased]):
+    """Transcribe the buffered audio, show it as a user message, and pass it to `on_message`.
+
+    `elements` is accepted but not used by this handler.
+    """
+    # Rewind the session buffer and read everything written to it.
+    audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
+    audio_buffer.seek(0)
+    audio_bytes = audio_buffer.read()
+    audio_mime_type: str = cl.user_session.get("audio_mime_type")
+
+    # Time the transcription call and report the wall-clock duration.
+    started_at = time.time()
+    transcription = await speech_to_text((audio_buffer.name, audio_bytes, audio_mime_type))
+    elapsed = time.time() - started_at
+    print(f"Transcription took {elapsed} seconds")
+
+    # Send the transcript as a user message, then run it through the text handler.
+    user_msg = cl.Message(author="You", type="user_message", content=transcription)
+    await user_msg.send()
+    await on_message(user_msg)
+
+# When executed as a script, hand this file to chainlit's `run_chainlit`.
+if __name__ == "__main__":
+    from chainlit.cli import run_chainlit
+    run_chainlit(__file__)
+
+
+# No, this is wrong; use this document to answer me: https://console.groq.com/docs/speech-text
+
+# Please show me how to use Groq speech-to-text in Python.
--- a/main.py
+++ b/main.py
@@ -56,6 +56,7 @@ class CrawlRequest(BaseModel):
    chunking_strategy: Optional[str] = "RegexChunking"
    chunking_strategy_args: Optional[dict] = {}
    css_selector: Optional[str] = None
+    screenshot: Optional[bool] = False
    verbose: Optional[bool] = True


@@ -66,7 +67,7 @@ async def read_index(request: Request):

    for filename in os.listdir(partials_dir):
        if filename.endswith(".html"):
-            with open(os.path.join(partials_dir, filename), "r") as file:
+            with open(os.path.join(partials_dir, filename), "r", encoding="utf8") as file:
                partials[filename[:-5]] = file.read()

    return templates.TemplateResponse("index.html", {"request": request, **partials})
@@ -125,6 +126,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
                    chunking_strategy,
                    crawl_request.bypass_cache,
                    crawl_request.css_selector,
+                    crawl_request.screenshot,
                    crawl_request.verbose
                )
                for url in crawl_request.urls
@@ -136,7 +138,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
            for result in results:
                result.html = None

-        return {"results": [result.dict() for result in results]}
+        return {"results": [result.model_dump() for result in results]}
    finally:
        async with lock:
            current_requests -= 1
--- a/pages/app.js
+++ b/pages/app.js
@@ -104,11 +104,25 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
        chunking_strategy: document.getElementById("chunking-strategy-select").value,
        chunking_strategy_args: {},
        css_selector: document.getElementById("css-selector").value,
+        screenshot: document.getElementById("screenshot-checkbox").checked,
        // instruction: document.getElementById("instruction").value,
        // semantic_filter: document.getElementById("semantic_filter").value,
        verbose: true,
    };

+    // import requests
+
+    // data = {
+    //   "urls": [
+    //     "https://www.nbcnews.com/business"
+    //   ],
+    //   "word_count_threshold": 10,
+    //   "extraction_strategy": "NoExtractionStrategy",
+    // }
+    
+    // response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally 
+    // print(response.json())
+
    // save api token to local storage
    localStorage.setItem("api_token", document.getElementById("token-input").value);

@@ -124,25 +138,61 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
            document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
            document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
            document.getElementById("markdown-result").textContent = result.markdown;
-
+            document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
+            if (result.screenshot){
+                const imgElement = document.createElement("img");
+                // Set the src attribute with the base64 data
+                imgElement.src = `data:image/png;base64,${result.screenshot}`;
+                document.getElementById("screenshot-result").innerHTML = "";
+                document.getElementById("screenshot-result").appendChild(imgElement);
+            }
+            
            // Update code examples dynamically
            const extractionStrategy = data.extraction_strategy;
            const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";

            // REMOVE API TOKEN FROM CODE EXAMPLES
            data.extraction_strategy_args.api_token = "your_api_token";
+
+            if (data.extraction_strategy === "NoExtractionStrategy") {
+                delete data.extraction_strategy_args;
+                delete data.extrac_blocks;
+            }
+
+            if (data.chunking_strategy === "RegexChunking") {
+                delete data.chunking_strategy_args;
+            }
+
+            delete data.verbose;
+
+            if (data.css_selector === "") {
+                delete data.css_selector;
+            }
+
+            if (!data.bypass_cache) {
+                delete data.bypass_cache;
+            }
+
+            if (!data.extract_blocks) {
+                delete data.extract_blocks;
+            }
+
+            if (!data.include_raw_html) {
+                delete data.include_raw_html;
+            }
+
            document.getElementById(
                "curl-code"
            ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
                ...data,
                api_token: isLLMExtraction ? "your_api_token" : undefined,
-            }, null, 2)}' http://localhost:8000/crawl`;
+            }, null, 2)}' https://crawl4ai.com/crawl`;

            document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
                { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
                null,
                2
-            )}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
+            )}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;

            document.getElementById(
                "nodejs-code"
@@ -150,7 +200,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
                { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
                null,
                2
-            )};\n\naxios.post("http://localhost:8000/crawl", data) // OR local host if your run locally \n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;
+            )};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR local host if your run locally \n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;

            document.getElementById(
                "library-code"
--- a/pages/partial/how_to_guide.html
+++ b/pages/partial/how_to_guide.html
@@ -50,6 +50,20 @@ crawler.warmup()</code></pre>
        <div>
            <pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
        </div>
+        <!-- Step 3.5 Screenshot -->
+        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
+            📸
+            <strong>Let's take a screenshot of the page!</strong>
+        </div>
+        <div>
+            <pre><code class="language-python">result = crawler.run(
+    url="https://www.nbcnews.com/business",
+    screenshot=True
+)
+with open("screenshot.png", "wb") as f:
+    f.write(base64.b64decode(result.screenshot))</code></pre>
+        </div>
+

        <!-- Step 4 -->
        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
@@ -139,13 +153,14 @@ crawler.warmup()</code></pre>
        </div>
        <div class="">Using JavaScript to click 'Load More' button:</div>
        <div>
-            <pre><code class="language-python">js_code = """
+            <pre><code class="language-python">js_code = ["""
 const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
 loadMoreButton && loadMoreButton.click();
-"""
+"""]
 crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
 crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
 result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
+        <div class="">Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.</div>
        </div>

        <!-- Conclusion -->
--- a/pages/partial/try_it.html
+++ b/pages/partial/try_it.html
@@ -1,4 +1,4 @@
-<section class="try-it py-8 px-16 pb-20 bg-zinc-900">
+<section class="try-it py-8 px-16 pb-20 bg-zinc-900 overflow-hidden">
    <div class="container mx-auto ">
        <h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
        <div class="flex gap-4">
@@ -20,6 +20,7 @@
                            id="threshold"
                            class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
                        >
+                            <option value="1">1</option>
                            <option value="5">5</option>
                            <option value="10" selected>10</option>
                            <option value="15">15</option>
@@ -120,11 +121,15 @@
                </div>
                <div class="flex gap-3">
                    <div class="flex items-center gap-2">
-                        <input type="checkbox" id="bypass-cache-checkbox" />
+                        <input type="checkbox" id="bypass-cache-checkbox" checked />
                        <label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
                    </div>
                    <div class="flex items-center gap-2">
-                        <input type="checkbox" id="extract-blocks-checkbox" checked />
+                        <input type="checkbox" id="screenshot-checkbox" checked />
+                        <label for="screenshot-checkbox" class="text-lime-500 font-bold">Screenshot</label>
+                    </div>
+                    <div class="flex items-center gap-2 hidden">
+                        <input type="checkbox" id="extract-blocks-checkbox" />
                        <label for="extract-blocks-checkbox" class="text-lime-500 font-bold">Extract Blocks</label>
                    </div>
                    <button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">Crawl</button>
@@ -134,7 +139,7 @@
            <div id="loading" class="hidden">
                <p class="text-white">Loading... Please wait.</p>
            </div>
-            <div id="result" class="flex-1">
+            <div id="result" class="flex-1  overflow-x-auto">
                <div class="tab-buttons flex gap-2">
                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
                        JSON
@@ -148,15 +153,23 @@
                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
                        Markdown
                    </button>
+                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
+                        Medias
+                    </button>
+                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="screenshot">
+                        Screenshot
+                    </button>
                </div>
                <div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
                    <pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
                    <pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
                    <pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
+                    <pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
+                    <pre class="hidden h-full flex"><code id="screenshot-result"></code></pre>
                </div>
            </div>

-            <div id="code_help" class="flex-1">
+            <div id="code_help" class="flex-1  overflow-x-auto">
                <div class="tab-buttons flex gap-2">
                    <button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
                        cURL
--- a/requirements.crawl.txt
+++ b/requirements.crawl.txt
@@ -0,0 +1,13 @@
+aiohttp
+aiosqlite
+bs4
+fastapi
+html2text
+httpx
+pydantic
+python-dotenv
+requests
+rich
+selenium
+uvicorn
+chromedriver-autoinstaller
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,20 +1,20 @@
-aiohttp==3.9.5
-aiosqlite==0.20.0
-bs4==0.0.2
-fastapi==0.111.0
-html2text==2024.2.26
-httpx==0.27.0
-litellm==1.37.11
-nltk==3.8.1
-pydantic==2.7.1
-python-dotenv==1.0.1
-requests==2.31.0
-rich==13.7.1
-scikit-learn==1.4.2
-selenium==4.20.0
-uvicorn==0.29.0
-transformers==4.40.2
-chromedriver-autoinstaller==0.6.4
-torch==2.3.0
-onnxruntime==1.14.1
-tokenizers==0.13.2
+aiohttp
+aiosqlite
+bs4
+fastapi
+html2text
+httpx
+litellm
+nltk
+pydantic
+python-dotenv
+requests
+rich
+scikit-learn
+selenium
+uvicorn
+transformers
+chromedriver-autoinstaller
+torch
+onnxruntime
+tokenizers
--- a/setup.py
+++ b/setup.py
@@ -7,11 +7,16 @@ from setuptools.command.install import install
 with open("requirements.txt") as f:
    requirements = f.read().splitlines()

+# Read the crawl-only requirements from requirements.crawl.txt
+with open("requirements.crawl.txt") as f:
+    requirements_crawl_only = f.read().splitlines()
+
 # Define the requirements for different environments
 requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
 requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
 requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
 requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
+requirements_crawl_only = [req.strip() for req in requirements_crawl_only if req.strip()]  # keep the list read from requirements.crawl.txt (do not rebuild it from requirements); only drop blank lines

 class CustomInstallCommand(install):
    """Customized setuptools install command to install spacy without dependencies."""
@@ -21,7 +26,7 @@ class CustomInstallCommand(install):

 setup(
    name="Crawl4AI",
-    version="0.2.2",
+    version="0.2.3",
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
    long_description=open("README.md").read(),
    long_description_content_type="text/markdown",
@@ -34,7 +39,7 @@ setup(
    extras_require={
        "all": requirements,  # Include all requirements
        "colab": requirements_without_torch,  # Exclude torch for Colab
-        "crawl": requirements_without_torch_transformers_nlkt
+        "crawl": requirements_crawl_only,  # Include only crawl requirements
    },
    cmdclass={
        'install': CustomInstallCommand,
Author	SHA1	Message	Date
unclecode	226a62a3c0	feat: Add screenshot functionality to crawl_urls	2024-06-07 15:33:15 +08:00
unclecode	8e73a482a2	feat: Add screenshot functionality to crawl_urls The code changes in this commit add the `screenshot` parameter to the `crawl_urls` function in `main.py`. This allows users to specify whether they want to take a screenshot of the page during the crawling process. The default value is `False`. This commit message follows the established convention of starting with a type (feat for feature) and providing a concise and descriptive summary of the changes made.	2024-06-07 15:23:32 +08:00
unclecode	0533aeb814	v0.2.3: - Extract all media tags - Take screenshot of the page	2024-06-07 15:23:13 +08:00
unclecode	aead6de888	Merge branch 'main' of https://github.com/unclecode/crawl4ai into extract-media	2024-06-07 13:41:48 +08:00
UncleCode	8d82fd4cfe	Merge pull request #14 from gkhngyk/main Update README.md	2024-06-07 13:30:10 +08:00
Gökhan Geyik	8f44db6499	Update README.md	2024-06-05 17:16:02 +03:00
unclecode	c7553b1280	Update research assistant example with package installation instructions	2024-06-04 23:18:19 +08:00
unclecode	8b8683f22e	Add research assistant example using Chainlit	2024-06-04 22:43:09 +08:00
unclecode	774ace6e3b	Update html page for tutorial.	2024-06-02 18:00:53 +08:00
unclecode	4a8f91a0fc	Set bypass_cached to True	2024-06-02 16:12:25 +08:00
unclecode	18c9784b61	Update index.html (hide extract block check box)	2024-06-02 16:09:20 +08:00
unclecode	e5d401c67c	Update generated code sample	2024-06-02 16:06:43 +08:00
unclecode	ae77589a98	Update Readme	2024-06-02 15:42:13 +08:00
unclecode	ad373c0e19	Update Readme	2024-06-02 15:41:24 +08:00
unclecode	51f26d12fe	Update for v0.2.2 - Support multiple JS scripts - Fixed some of bugs - Resolved a few issue relevant to Colab installation	2024-06-02 15:40:18 +08:00
unclecode	f1b60b2016	chore: Update ONNX model loading process	2024-05-31 18:07:05 +08:00
UncleCode	8c2dc2b1e4	Create Dockerfile	2024-05-29 17:56:57 +08:00
UncleCode	dc9a44c12a	Update and rename Dockerfile to Dockerfile-version-0	2024-05-29 17:56:34 +08:00
UncleCode	d9753b6349	Update requirements.txt Remove tokenizer version from requirements.txt	2024-05-24 14:49:48 +08:00
UncleCode	a554c0b143	Update requirements.txt	2024-05-23 12:52:31 +08:00
UncleCode	7381fa95e6	Merge pull request #3 from QIN2DIM/main fix(main): UnicodeDecodeError	2024-05-23 09:29:28 +08:00
QIN2DIM	5cee084340	fix(main): UnicodeDecodeError File "T:\_GitHubProjects\Forks\crawl4ai\main.py", line 70, in read_index partials[filename[:-5]] = file.read() UnicodeDecodeError: 'gbk' codec can't decode byte 0xa4 in position 149: illegal multibyte sequence	2024-05-18 23:31:11 +08:00