Merge branch 'main' of https://github.com/unclecode/crawl4ai

2024-06-07 08:18:19 +00:00
parent 4010558885 b32013cb97
commit a4a6b2075f
18 changed files with 789 additions and 38 deletions
--- a/.files/screenshot.png
+++ b/.files/screenshot.png
--- a/.gitignore
+++ b/.gitignore
@@ -175,3 +175,7 @@ a.txt
 *.sh
 .idea
 docs/examples/.chainlit/
 docs/examples/.chainlit/*
 .chainlit/config.toml
 .chainlit/translations/en-US.json
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.2 🕷️🤖
+# Crawl4AI v0.2.3 🕷️🤖
 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
 [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
@@ -12,6 +12,10 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
 ## Recent Changes 
 ### v0.2.3
 - 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
 - 🖼️ Take [screenshots](#taking-screenshots) of the page.
 ### v0.2.2
 - Support multiple JS scripts
 - Fixed some bugs
@@ -229,7 +233,7 @@ To use the REST API, send a POST request to `http://localhost:8000/crawl` with t
 }
 ```
-For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters) section.
+For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters-) section.
 ## Python Library Usage 🚀
@@ -262,6 +266,14 @@ Crawl result without raw HTML content:
 result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
 ```
 ### Taking Screenshots
 ```python
 result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
 with open("screenshot.png", "wb") as f:
    f.write(base64.b64decode(result.screenshot))
 ```
 ### Adding a chunking strategy: RegexChunking
 Using RegexChunking:
@@ -368,6 +380,7 @@ result = crawler.run(url="https://www.nbcnews.com/business")
 | `urls`                | A list of URLs to crawl and extract data from.                                                        | Yes      | -                   |
 | `include_raw_html`    | Whether to include the raw HTML content in the response.                                              | No       | `false`             |
 | `bypass_cache`        | Whether to force a fresh crawl even if the URL has been previously crawled.                           | No       | `false`             |
 | `screenshots`         | Whether to take screenshots of the page.                                                              | No       | `false`             |
 | `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5).    | No       | `5`                 |
 | `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy").                    | No       | `NoExtractionStrategy`    |
 | `chunking_strategy`   | The strategy to use for chunking the text before processing (e.g., "RegexChunking").                  | No       | `RegexChunking`     |
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -7,6 +7,15 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import InvalidArgumentException
 import logging
 import base64
 from PIL import Image, ImageDraw, ImageFont
 from io import BytesIO
 from typing import List
 import requests
 import os
 from pathlib import Path
 from .utils import wrap_text
 logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
 logger.setLevel(logging.WARNING)
@@ -25,16 +34,17 @@ driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finde
 driver_finder_logger.setLevel(logging.WARNING)
-from typing import List
+
 import requests
 import os
 from pathlib import Path
 class CrawlerStrategy(ABC):
    """Abstract interface implemented by the crawling backends in this module.

    Subclasses provide a way to fetch a page (``crawl``) and a way to
    capture an image of it (``take_screenshot``).
    """
    @abstractmethod
    def crawl(self, url: str, **kwargs) -> str:
        """Fetch ``url`` and return the result as a string.

        Extra keyword arguments are backend specific.
        """
        pass
    @abstractmethod
    def take_screenshot(self, save_path: str = None) -> str:
        """Capture a screenshot of the current page.

        ``save_path`` is optional so that the contract matches how the
        method is used in this change set: the Selenium implementation takes
        no path and returns the image as a base64 string, and the crawler
        calls ``take_screenshot()`` without arguments.
        """
        pass
 class CloudCrawlerStrategy(CrawlerStrategy):
    def __init__(self, use_cached_html = False):
        super().__init__()
@@ -132,5 +142,62 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        except Exception as e:
            raise Exception(f"Failed to crawl {url}: {str(e)}")
    def take_screenshot(self) -> str:
        """Capture the page currently loaded in ``self.driver`` as a base64 JPEG.

        The browser window is resized to the document's scroll width/height
        before the capture so the whole page fits in a single screenshot.

        Returns:
            The JPEG image encoded as a base64 string. If anything fails, the
            error is printed and a base64 JPEG of an 800x600 black placeholder
            containing the wrapped error message is returned instead; no
            exception is raised for capture failures.
        """
        try:
            # Get the dimensions of the page
            total_width = self.driver.execute_script("return document.body.scrollWidth")
            total_height = self.driver.execute_script("return document.body.scrollHeight")
            # Set the window size to the dimensions of the page
            self.driver.set_window_size(total_width, total_height)
            # Take screenshot
            screenshot = self.driver.get_screenshot_as_png()
            # Open the screenshot with PIL
            image = Image.open(BytesIO(screenshot))
            # JPEG has no alpha channel: a PNG opened as RGBA (or palette)
            # cannot be written as JPEG, so normalise to RGB first.
            if image.mode != "RGB":
                image = image.convert("RGB")
            # Convert to JPEG and compress
            buffered = BytesIO()
            image.save(buffered, format="JPEG", quality=85)
            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
            if self.verbose:
                print("[LOG] 📸 Screenshot taken and converted to base64")
            return img_base64
        except Exception as e:
            error_message = f"Failed to take screenshot: {str(e)}"
            print(error_message)
            # Generate an image with black background
            img = Image.new('RGB', (800, 600), color='black')
            draw = ImageDraw.Draw(img)
            # Load a font, degrading gracefully so the fallback path itself
            # does not raise.
            try:
                font = ImageFont.truetype("arial.ttf", 40)
            except IOError:
                try:
                    font = ImageFont.load_default(size=40)
                except TypeError:
                    # Older Pillow releases do not accept a size argument.
                    font = ImageFont.load_default()
            # Define text color and wrap the text
            text_color = (255, 255, 255)
            max_width = 780
            wrapped_text = wrap_text(draw, error_message, font, max_width)
            # Calculate text position
            text_position = (10, 10)
            # Draw the text on the image
            draw.text(text_position, wrapped_text, fill=text_color, font=font)
            # Convert to base64
            buffered = BytesIO()
            img.save(buffered, format="JPEG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
            return img_base64
    def quit(self):
        """Quit the driver held in ``self.driver``."""
        driver = self.driver
        driver.quit()
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -1,7 +1,6 @@
 import os
 from pathlib import Path
 import sqlite3
 from typing import Optional
 from typing import Optional, Tuple
 DB_PATH = os.path.join(Path.home(), ".crawl4ai")
@@ -19,22 +18,35 @@ def init_db():
            cleaned_html TEXT,
            markdown TEXT,
            extracted_content TEXT,
-            success BOOLEAN
+            success BOOLEAN,
            media TEXT DEFAULT "{}",
            screenshot TEXT DEFAULT ""
        )
    ''')
    conn.commit()
    conn.close()
-def check_db_path():
+def alter_db_add_screenshot(new_column: str = "media"):
    if not DB_PATH:
        raise ValueError("Database path is not set or is empty.")
 def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
-        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,))
+        cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
        conn.commit()
        conn.close()
    except Exception as e:
        print(f"Error altering database to add screenshot column: {e}")
 def check_db_path():
    """Raise ``ValueError`` if the module-level ``DB_PATH`` is unset or empty."""
    if DB_PATH:
        return
    raise ValueError("Database path is not set or is empty.")
 def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]:
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, screenshot FROM crawled_data WHERE url = ?', (url,))
        result = cursor.fetchone()
        conn.close()
        return result
@@ -42,21 +54,23 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
        print(f"Error retrieving cached URL: {e}")
        return None
-def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
+def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", screenshot: str = ""):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('''
-            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
+            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, screenshot)
-            VALUES (?, ?, ?, ?, ?, ?)
+            VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
                markdown = excluded.markdown,
                extracted_content = excluded.extracted_content,
-                success = excluded.success
+                success = excluded.success,
-        ''', (url, html, cleaned_html, markdown, extracted_content, success))
+                media = excluded.media,                
                screenshot = excluded.screenshot
        ''', (url, html, cleaned_html, markdown, extracted_content, success, media, screenshot))
        conn.commit()
        conn.close()
    except Exception as e:
@@ -96,3 +110,19 @@ def flush_db():
        conn.close()
    except Exception as e:
        print(f"Error flushing database: {e}")
 def update_existing_records(new_column: str = "media", default_value: str = "{}"):
    """Backfill ``new_column`` with ``default_value`` on rows where it is NULL.

    Args:
        new_column: Name of the ``crawled_data`` column to backfill.
        default_value: Value written into rows whose ``new_column`` is NULL.

    Errors (including an invalid column name) are printed rather than raised,
    matching the other helpers in this module; only ``check_db_path`` may raise.
    """
    check_db_path()
    conn = None
    try:
        # Column names cannot be bound as SQL parameters, so only accept a
        # plain identifier before interpolating it into the statement.
        if not new_column.isidentifier():
            raise ValueError(f"Invalid column name: {new_column!r}")
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        # Filter on the column being backfilled (not a hard-coded one) and bind
        # the value as a parameter instead of quoting it into the SQL text.
        cursor.execute(
            f'UPDATE crawled_data SET {new_column} = ? WHERE {new_column} IS NULL',
            (default_value,),
        )
        conn.commit()
    except Exception as e:
        print(f"Error updating existing records: {e}")
    finally:
        # Close the connection on the error path as well.
        if conn is not None:
            conn.close()
 # One-off migration entry point, run when this module is executed as a script.
 if __name__ == "__main__":
    init_db()  # Initialize the database if not already initialized
    alter_db_add_screenshot()  # Add the new column to the table (defaults to "media")
    update_existing_records()  # Backfill existing records (defaults: column "media", value "{}")
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -1,5 +1,5 @@
 from pydantic import BaseModel, HttpUrl
-from typing import List
+from typing import List, Dict, Optional
 class UrlModel(BaseModel):
    url: HttpUrl
@@ -9,8 +9,10 @@ class CrawlResult(BaseModel):
    url: str
    html: str
    success: bool
-    cleaned_html: str = None
+    cleaned_html: Optional[str] = None
-    markdown: str = None
+    media: Dict[str, List[Dict]] = {}
-    extracted_content: str = None
+    screenshot: Optional[str] = None
-    metadata: dict = None
+    markdown: Optional[str] = None
-    error_message: str = None
+    extracted_content: Optional[str] = None
    metadata: Optional[dict] = None
    error_message: Optional[str] = None
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -180,6 +180,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
            if tag.name != 'img':
                tag.attrs = {}
        # Extract all img tags into [{src: '', alt: '', type: 'image'}]
        media = {
            'images': [],
            'videos': [],
            'audios': []
        }
        for img in body.find_all('img'):
            media['images'].append({
                'src': img.get('src'),
                'alt': img.get('alt'),
                "type": "image"
            })
        # Extract all video tags into [{src: '', alt: ''}]
        for video in body.find_all('video'):
            media['videos'].append({
                'src': video.get('src'),
                'alt': video.get('alt'),
                "type": "video"
            })
        # Extract all audio tags into [{src: '', alt: ''}]
        for audio in body.find_all('audio'):
            media['audios'].append({
                'src': audio.get('src'),
                'alt': audio.get('alt'),
                "type": "audio"
            })
        # Replace images with their alt text or remove them if no alt text is available
        for img in body.find_all('img'):
            alt_text = img.get('alt')
@@ -299,7 +328,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
        return{
            'markdown': markdown,
            'cleaned_html': cleaned_html,
-            'success': True
+            'success': True,
            'media': media
        }
    except Exception as e:
@@ -484,3 +514,15 @@ def process_sections(url: str, sections: list, provider: str, api_token: str) ->
                extracted_content.extend(future.result())
    return extracted_content
 def wrap_text(draw, text, font, max_width):
    """Greedily wrap ``text`` so each rendered line fits within ``max_width``.

    Args:
        draw: Object exposing ``textbbox((x, y), text, font=...)``; index 2 of
            the returned box is used as the rendered width.
        text: Text to wrap; it is split on whitespace.
        font: Font passed through to ``draw.textbbox``.
        max_width: Maximum allowed width of a line.

    Returns:
        The wrapped lines joined with newlines. Every line keeps a trailing
        space after its last word. A single word wider than ``max_width`` is
        placed on a line of its own instead of stalling the loop.
    """
    lines = []
    current = ''
    for word in text.split():
        # Only break when the line already holds something: an oversized first
        # word must still be consumed, otherwise wrapping never terminates.
        if current and draw.textbbox((0, 0), current + word, font=font)[2] > max_width:
            lines.append(current)
            current = ''
        current += word + ' '
    if current:
        lines.append(current)
    return '\n'.join(lines)
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -59,6 +59,8 @@ class WebCrawler:
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        css_selector: str = None,
        screenshot: bool = False,
        use_cached_html: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
@@ -70,6 +72,8 @@ class WebCrawler:
            extraction_strategy or NoExtractionStrategy(),
            chunking_strategy,
            bypass_cache=url_model.forced,
            css_selector=css_selector,
            screenshot=screenshot,
            **kwargs,
        )
        pass
@@ -83,6 +87,7 @@ class WebCrawler:
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
@@ -110,6 +115,8 @@ class WebCrawler:
                        "markdown": cached[3],
                        "extracted_content": cached[4],
                        "success": cached[5],
                        "media": json.loads(cached[6] or "{}"),
                        "screenshot": cached[7],
                        "error_message": "",
                    }
                )
@@ -117,6 +124,9 @@ class WebCrawler:
        # Initialize WebDriver for crawling
        t = time.time()
        html = self.crawler_strategy.crawl(url)
        base64_image = None
        if screenshot:
            base64_image = self.crawler_strategy.take_screenshot()
        success = True
        error_message = ""
        # Extract content from HTML
@@ -129,6 +139,7 @@ class WebCrawler:
        cleaned_html = result.get("cleaned_html", html)
        markdown = result.get("markdown", "")
        media = result.get("media", [])
        # Print a professional LOG-style message: show the time taken and say crawling is done
        if verbose:
@@ -163,6 +174,8 @@ class WebCrawler:
            markdown,
            extracted_content,
            success,
            json.dumps(media),
            screenshot=base64_image,
        )
        return CrawlResult(
@@ -170,6 +183,8 @@ class WebCrawler:
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown,
            media=media,
            screenshot=base64_image,
            extracted_content=extracted_content,
            success=success,
            error_message=error_message,
@@ -183,6 +198,8 @@ class WebCrawler:
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        use_cached_html: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
@@ -200,6 +217,8 @@ class WebCrawler:
                    [api_token] * len(url_models),
                    [extract_blocks_flag] * len(url_models),
                    [word_count_threshold] * len(url_models),
                    [css_selector] * len(url_models),
                    [screenshot] * len(url_models),
                    [use_cached_html] * len(url_models),
                    [extraction_strategy] * len(url_models),
                    [chunking_strategy] * len(url_models),
--- a/docs/examples/assets/audio.mp3
+++ b/docs/examples/assets/audio.mp3
--- a/docs/examples/chainlit.md
+++ b/docs/examples/chainlit.md
@@ -0,0 +1,3 @@
 # Welcome to Crawl4AI! 🚀🤖
 Hi there, Developer! 👋 Here is an example of a research pipeline: share a URL in your conversation with any LLM, and the content of the crawled page will be used as context for the answers.
--- a/docs/examples/chainlit_review.py
+++ b/docs/examples/chainlit_review.py
@@ -0,0 +1,281 @@
 from openai import AsyncOpenAI
 from chainlit.types import ThreadDict
 import chainlit as cl
 from chainlit.input_widget import Select, Switch, Slider
 # Shared async OpenAI client used by the message handlers below.
 # NOTE(review): constructed without arguments, so credentials presumably
 # come from the environment — confirm.
 client = AsyncOpenAI()
 # Instrument the OpenAI client
 cl.instrument_openai()
 # Keyword arguments expanded into every client.chat.completions.create(...)
 # call made by the message handlers in this file.
 settings = {
    "model": "gpt-3.5-turbo",
    "temperature": 0.5,
    "max_tokens": 500,
    "top_p": 1,
    "frequency_penalty": 0,
    "presence_penalty": 0,
 }
@cl.action_callback("action_button")
 async def on_action(action: cl.Action):
    print("The user clicked on the action button!")
    return "Thank you for clicking on the action button!"
@cl.set_chat_profiles
 async def chat_profile():
    return [
        cl.ChatProfile(
            name="GPT-3.5",
            markdown_description="The underlying LLM model is **GPT-3.5**.",
            icon="https://picsum.photos/200",
        ),
        cl.ChatProfile(
            name="GPT-4",
            markdown_description="The underlying LLM model is **GPT-4**.",
            icon="https://picsum.photos/250",
        ),
    ]
@cl.on_chat_start
 async def on_chat_start():
    settings = await cl.ChatSettings(
        [
            Select(
                id="Model",
                label="OpenAI - Model",
                values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"],
                initial_index=0,
            ),
            Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True),
            Slider(
                id="Temperature",
                label="OpenAI - Temperature",
                initial=1,
                min=0,
                max=2,
                step=0.1,
            ),
            Slider(
                id="SAI_Steps",
                label="Stability AI - Steps",
                initial=30,
                min=10,
                max=150,
                step=1,
                description="Amount of inference steps performed on image generation.",
            ),
            Slider(
                id="SAI_Cfg_Scale",
                label="Stability AI - Cfg_Scale",
                initial=7,
                min=1,
                max=35,
                step=0.1,
                description="Influences how strongly your generation is guided to match your prompt.",
            ),
            Slider(
                id="SAI_Width",
                label="Stability AI - Image Width",
                initial=512,
                min=256,
                max=2048,
                step=64,
                tooltip="Measured in pixels",
            ),
            Slider(
                id="SAI_Height",
                label="Stability AI - Image Height",
                initial=512,
                min=256,
                max=2048,
                step=64,
                tooltip="Measured in pixels",
            ),
        ]
    ).send()
    chat_profile = cl.user_session.get("chat_profile")
    await cl.Message(
        content=f"starting chat using the {chat_profile} chat profile"
    ).send()
    print("A new chat session has started!")
    cl.user_session.set("session", {
        "history": [],
        "context": []
    })  
    image = cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline")
    # Attach the image to the message
    await cl.Message(
        content="You are such a good girl, aren't you?!",
        elements=[image],
    ).send()
    text_content = "Hello, this is a text element."
    elements = [
        cl.Text(name="simple_text", content=text_content, display="inline")
    ]
    await cl.Message(
        content="Check out this text element!",
        elements=elements,
    ).send()
    elements = [
        cl.Audio(path="./assets/audio.mp3", display="inline"),
    ]
    await cl.Message(
        content="Here is an audio file",
        elements=elements,
    ).send()
    await cl.Avatar(
        name="Tool 1",
        url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
    ).send()
    await cl.Message(
        content="This message should not have an avatar!", author="Tool 0"
    ).send()
    await cl.Message(
        content="This message should have an avatar!", author="Tool 1"
    ).send()
    elements = [
        cl.File(
            name="quickstart.py",
            path="./quickstart.py",
            display="inline",
        ),
    ]
    await cl.Message(
        content="This message has a file element", elements=elements
    ).send()
    # Sending an action button within a chatbot message
    actions = [
        cl.Action(name="action_button", value="example_value", description="Click me!")
    ]
    await cl.Message(content="Interact with this action button:", actions=actions).send()
    # res = await cl.AskActionMessage(
    #     content="Pick an action!",
    #     actions=[
    #         cl.Action(name="continue", value="continue", label="✅ Continue"),
    #         cl.Action(name="cancel", value="cancel", label="❌ Cancel"),
    #     ],
    # ).send()
    # if res and res.get("value") == "continue":
    #     await cl.Message(
    #         content="Continue!",
    #     ).send()
    # import plotly.graph_objects as go
    # fig = go.Figure(
    #     data=[go.Bar(y=[2, 1, 3])],
    #     layout_title_text="An example figure",
    # )
    # elements = [cl.Plotly(name="chart", figure=fig, display="inline")]
    # await cl.Message(content="This message has a chart", elements=elements).send()
    # Sending a pdf with the local file path
    # elements = [
    #   cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf")
    # ]
    # cl.Message(content="Look at this local pdf!", elements=elements).send()    
@cl.on_settings_update
 async def setup_agent(settings):
    """Log the settings payload received by the settings-update hook.

    The ``settings`` parameter shadows the module-level ``settings`` dict
    inside this function; the payload is only printed, not applied.
    """
    print("on_settings_update", settings)
@cl.on_stop
 def on_stop():
    """Log that the stop hook fired; no other action is taken."""
    print("The user wants to stop the task!")
@cl.on_chat_end
 def on_chat_end():
    """Log that the chat-end hook fired; no other action is taken."""
    print("The user disconnected!")
@cl.on_chat_resume
 async def on_chat_resume(thread: ThreadDict):
    """Log that a previous chat was resumed; ``thread`` is not used."""
    print("The user resumed a previous chat session!")
 # Non-streaming variant of the message handler, kept for reference only:
 # its decorator is commented out and the name `on_message` is redefined by
 # the streaming handler further down in this file.
 # @cl.on_message
 async def on_message(message: cl.Message):
    """Append the user message to the session history and request a completion.

    The assistant reply is stored in the history, but it is never sent to the
    UI because the lines that would send it are commented out below.
    """
    cl.user_session.get("session")["history"].append({
        "role": "user",
        "content": message.content
    })    
    response = await client.chat.completions.create(
        messages=[
            {
                "content": "You are a helpful bot",
                "role": "system"
            },
            *cl.user_session.get("session")["history"]
        ],
        **settings
    )
    # Add assistant message to the history
    cl.user_session.get("session")["history"].append({
        "role": "assistant",
        "content": response.choices[0].message.content
    })
    # msg.content = response.choices[0].message.content
    # await msg.update()
    # await cl.Message(content=response.choices[0].message.content).send()
@cl.on_message
 async def on_message(message: cl.Message):
    cl.user_session.get("session")["history"].append({
        "role": "user",
        "content": message.content
    })    
    msg = cl.Message(content="")
    await msg.send()    
    stream = await client.chat.completions.create(
        messages=[
            {
                "content": "You are a helpful bot",
                "role": "system"
            },
            *cl.user_session.get("session")["history"]
        ],
        stream = True, 
        **settings
    )
    async for part in stream:
        if token := part.choices[0].delta.content or "":
            await msg.stream_token(token)
    # Add assitanr message to the history
    cl.user_session.get("session")["history"].append({
        "role": "assistant",
        "content": msg.content
    })    
    await msg.update()
 # When executed directly, hand this file to Chainlit's CLI runner.
 if __name__ == "__main__":
    # Imported here so the CLI module is only loaded for direct execution.
    from chainlit.cli import run_chainlit
    run_chainlit(__file__)
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -39,6 +39,16 @@ def basic_usage(crawler):
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)
 def screenshot_usage(crawler):
    """Demo: crawl a page with ``screenshot=True`` and save the image to disk.

    The base64 payload in ``result.screenshot`` is decoded and written to
    'screenshot.png' in the current working directory, then the result is
    printed.
    """
    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    target_url = "https://www.nbcnews.com/business"
    result = crawler.run(url=target_url, screenshot=True)
    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
    # Save the screenshot to a file
    with open("screenshot.png", "wb") as image_file:
        image_bytes = base64.b64decode(result.screenshot)
        image_file.write(image_bytes)
    cprint("Screenshot saved to 'screenshot.png'!")
    print_result(result)
 def understanding_parameters(crawler):
    cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
    cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
@@ -191,6 +201,7 @@ def main():
    understanding_parameters(crawler)
    crawler.always_by_pass_cache = True
    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
--- a/docs/examples/research_assistant.py
+++ b/docs/examples/research_assistant.py
@@ -0,0 +1,241 @@
 # Make sure to install the required packages: chainlit and groq
 import os, time
 from openai import AsyncOpenAI
 import chainlit as cl
 import re
 import requests
 from io import BytesIO
 from chainlit.element import ElementBased
 from groq import Groq
 # Import threadpools to run the crawl_url function in a separate thread
 from concurrent.futures import ThreadPoolExecutor
 # OpenAI-style async client pointed at Groq's endpoint; the API key is read
 # from the GROQ_API_KEY environment variable.
 client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
 # Instrument the OpenAI client
 cl.instrument_openai()
 # Keyword arguments expanded into the chat completion request made by the
 # message handler in this file.
 settings = {
    "model": "llama3-8b-8192",
    "temperature": 0.5,
    "max_tokens": 500,
    "top_p": 1,
    "frequency_penalty": 0,
    "presence_penalty": 0,
 }
 def extract_urls(text):
    """Return the http(s) URLs found in ``text``, in order of appearance.

    A URL is a run of non-whitespace characters starting with ``http://`` or
    ``https://``. Trailing sentence punctuation (``. , ; : ! ?`` and quotes)
    is stripped so that "see https://example.com." yields the bare URL
    instead of one ending in a period.
    """
    url_pattern = re.compile(r'(https?://\S+)')
    return [match.rstrip('.,;:!?\'"') for match in url_pattern.findall(text)]
 def crawl_url(url, timeout=60):
    """Crawl ``url`` through the hosted crawl4ai endpoint and return its markdown.

    Args:
        url: Page to crawl.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled request cannot block the calling worker thread forever.

    Returns:
        The ``markdown`` field of the first entry in the response's ``results``.

    Raises:
        requests.RequestException: On a timeout, connection error or
            non-success HTTP status.
    """
    data = {
        "urls": [url],
        "include_raw_html": True,
        "word_count_threshold": 10,
        "extraction_strategy": "NoExtractionStrategy",
        "chunking_strategy": "RegexChunking"
    }
    response = requests.post("https://crawl4ai.com/crawl", json=data, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError below.
    response.raise_for_status()
    response_data = response.json()
    response_data = response_data['results'][0]
    return response_data['markdown']
@cl.on_chat_start
 async def on_chat_start():
    cl.user_session.set("session", {
        "history": [],
        "context": {}
    })  
    await cl.Message(
        content="Welcome to the chat! How can I assist you today?"
    ).send()
@cl.on_message
 async def on_message(message: cl.Message):
    user_session = cl.user_session.get("session")
    # Extract URLs from the user's message
    urls = extract_urls(message.content)
    futures = []
    with ThreadPoolExecutor() as executor:
        for url in urls:
            futures.append(executor.submit(crawl_url, url))
    results = [future.result() for future in futures]
    for url, result in zip(urls, results):
        ref_number = f"REF_{len(user_session['context']) + 1}"
        user_session["context"][ref_number] = {
            "url": url,
            "content": result
        }    
    # for url in urls:
    #     # Crawl the content of each URL and add it to the session context with a reference number
    #     ref_number = f"REF_{len(user_session['context']) + 1}"
    #     crawled_content = crawl_url(url)
    #     user_session["context"][ref_number] = {
    #         "url": url,
    #         "content": crawled_content
    #     }
    user_session["history"].append({
        "role": "user",
        "content": message.content
    })
    # Create a system message that includes the context
    context_messages = [
        f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
        for ref, data in user_session["context"].items()
    ]
    if context_messages:
        system_message = {
            "role": "system",
            "content": (
                "You are a helpful bot. Use the following context for answering questions. "
                "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
                "If the question requires any information from the provided appendices or context, refer to the sources. "
                "If not, there is no need to add a references section. "
                "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
                "\n\n".join(context_messages)
            )
        }
    else:
        system_message = {
            "role": "system",
            "content": "You are a helpful assistant."
        }
    msg = cl.Message(content="")
    await msg.send()
    # Get response from the LLM
    stream = await client.chat.completions.create(
        messages=[
            system_message,
            *user_session["history"]
        ],
        stream=True,
        **settings
    )
    assistant_response = ""
    async for part in stream:
        if token := part.choices[0].delta.content:
            assistant_response += token
            await msg.stream_token(token)
    # Add assistant message to the history
    user_session["history"].append({
        "role": "assistant",
        "content": assistant_response
    })
    await msg.update()
    # Append the reference section to the assistant's response
    reference_section = "\n\nReferences:\n"
    for ref, data in user_session["context"].items():
        reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n"
    msg.content += reference_section
    await msg.update()
@cl.on_audio_chunk
 async def on_audio_chunk(chunk: cl.AudioChunk):
    if chunk.isStart:
        buffer = BytesIO()
        # This is required for whisper to recognize the file type
        buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}"
        # Initialize the session for a new audio stream
        cl.user_session.set("audio_buffer", buffer)
        cl.user_session.set("audio_mime_type", chunk.mimeType)
    # Write the chunks to a buffer and transcribe the whole audio at the end
    cl.user_session.get("audio_buffer").write(chunk.data)
    pass
@cl.step(type="tool")
async def speech_to_text(audio_file):
    """Transcribe an audio payload to text with the ``whisper-large-v3`` model.

    The request goes through the module-level ``client`` (the same client
    used for chat completions in this file), so no per-call client is
    constructed here.

    Args:
        audio_file: The value passed as ``file`` to the transcription API.
            ``on_audio_end`` passes a ``(filename, bytes, mime_type)`` tuple.

    Returns:
        The ``text`` attribute of the transcription response.
    """
    response = await client.audio.transcriptions.create(
        model="whisper-large-v3", file=audio_file
    )
    return response.text
@cl.on_audio_end
async def on_audio_end(elements: list[ElementBased]):
    """Transcribe the buffered recording and feed the text to the chat handler.

    Reads the audio buffer and MIME type stored in the user session by
    ``on_audio_chunk``, transcribes the audio via ``speech_to_text``, shows
    the transcription as a user message, and then hands that message to
    ``on_message`` so it is processed like typed input.

    Args:
        elements: Elements attached to the audio input. Currently unused.
    """
    audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
    if audio_buffer is None:
        # No start chunk was received for this session, so there is
        # nothing to transcribe; avoid an AttributeError on None.
        return
    audio_mime_type: str = cl.user_session.get("audio_mime_type")

    # getvalue() returns the full contents regardless of the write position
    audio_bytes = audio_buffer.getvalue()
    whisper_input = (audio_buffer.name, audio_bytes, audio_mime_type)

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can
    start_time = time.perf_counter()
    transcription = await speech_to_text(whisper_input)
    elapsed = time.perf_counter() - start_time
    print(f"Transcription took {elapsed} seconds")

    user_msg = cl.Message(
        author="You",
        type="user_message",
        content=transcription
    )
    await user_msg.send()
    # Reuse the regular text pipeline for the transcribed message
    await on_message(user_msg)
 # When executed directly as a script, hand this file to Chainlit's CLI runner.
 if __name__ == "__main__":
    from chainlit.cli import run_chainlit
    run_chainlit(__file__)
 # No, this is wrong; use this document to answer me: https://console.groq.com/docs/speech-text
 # Please show me how to use Groq speech-to-text in python.
--- a/main.py
+++ b/main.py
@@ -56,6 +56,7 @@ class CrawlRequest(BaseModel):
    chunking_strategy: Optional[str] = "RegexChunking"
    chunking_strategy_args: Optional[dict] = {}
    css_selector: Optional[str] = None
    screenshot: Optional[bool] = False
    verbose: Optional[bool] = True
@@ -125,6 +126,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
                    chunking_strategy,
                    crawl_request.bypass_cache,
                    crawl_request.css_selector,
                    crawl_request.screenshot,
                    crawl_request.verbose
                )
                for url in crawl_request.urls
@@ -136,7 +138,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
            for result in results:
                result.html = None
-        return {"results": [result.dict() for result in results]}
+        return {"results": [result.model_dump() for result in results]}
    finally:
        async with lock:
            current_requests -= 1
--- a/pages/app.js
+++ b/pages/app.js
@@ -104,6 +104,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
        chunking_strategy: document.getElementById("chunking-strategy-select").value,
        chunking_strategy_args: {},
        css_selector: document.getElementById("css-selector").value,
        screenshot: document.getElementById("screenshot-checkbox").checked,
        // instruction: document.getElementById("instruction").value,
        // semantic_filter: document.getElementById("semantic_filter").value,
        verbose: true,
@@ -137,6 +138,14 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
            document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
            document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
            document.getElementById("markdown-result").textContent = result.markdown;
            document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
            if (result.screenshot){
                const imgElement = document.createElement("img");
                // Set the src attribute with the base64 data
                imgElement.src = `data:image/png;base64,${result.screenshot}`;
                document.getElementById("screenshot-result").innerHTML = "";
                document.getElementById("screenshot-result").appendChild(imgElement);
            }
            // Update code examples dynamically
            const extractionStrategy = data.extraction_strategy;
--- a/pages/partial/how_to_guide.html
+++ b/pages/partial/how_to_guide.html
@@ -50,6 +50,20 @@ crawler.warmup()</code></pre>
        <div>
            <pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
        </div>
        <!-- Step 3.5 Screenshot -->
        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
            📸
            <strong>Let's take a screenshot of the page!</strong>
        </div>
        <div>
            <pre><code class="language-python">result = crawler.run(
    url="https://www.nbcnews.com/business",
    screenshot=True
 )
 with open("screenshot.png", "wb") as f:
    f.write(base64.b64decode(result.screenshot))</code></pre>
        </div>
        <!-- Step 4 -->
        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
@@ -139,13 +153,14 @@ crawler.warmup()</code></pre>
        </div>
        <div class="">Using JavaScript to click 'Load More' button:</div>
        <div>
-            <pre><code class="language-python">js_code = """
+            <pre><code class="language-python">js_code = ["""
 const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
 loadMoreButton && loadMoreButton.click();
-"""
+"""]
 crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
 crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
 result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
        <div class="">Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.</div>
        </div>
        <!-- Conclusion -->
--- a/pages/partial/try_it.html
+++ b/pages/partial/try_it.html
@@ -1,4 +1,4 @@
-<section class="try-it py-8 px-16 pb-20 bg-zinc-900">
+<section class="try-it py-8 px-16 pb-20 bg-zinc-900 overflow-hidden">
    <div class="container mx-auto ">
        <h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
        <div class="flex gap-4">
@@ -124,6 +124,10 @@
                        <input type="checkbox" id="bypass-cache-checkbox" checked />
                        <label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
                    </div>
                    <div class="flex items-center gap-2">
                        <input type="checkbox" id="screenshot-checkbox" checked />
                        <label for="screenshot-checkbox" class="text-lime-500 font-bold">Screenshot</label>
                    </div>
                    <div class="flex items-center gap-2 hidden">
                        <input type="checkbox" id="extract-blocks-checkbox" />
                        <label for="extract-blocks-checkbox" class="text-lime-500 font-bold">Extract Blocks</label>
@@ -135,7 +139,7 @@
            <div id="loading" class="hidden">
                <p class="text-white">Loading... Please wait.</p>
            </div>
-            <div id="result" class="flex-1">
+            <div id="result" class="flex-1  overflow-x-auto">
                <div class="tab-buttons flex gap-2">
                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
                        JSON
@@ -149,15 +153,23 @@
                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
                        Markdown
                    </button>
                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
                        Medias
                    </button>
                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="screenshot">
                        Screenshot
                    </button>
                </div>
                <div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
                    <pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
                    <pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
                    <pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
                    <pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
                    <pre class="hidden h-full flex"><code id="screenshot-result"></code></pre>
                </div>
            </div>
-            <div id="code_help" class="flex-1">
+            <div id="code_help" class="flex-1  overflow-x-auto">
                <div class="tab-buttons flex gap-2">
                    <button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
                        cURL
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@ class CustomInstallCommand(install):
 setup(
    name="Crawl4AI",
-    version="0.2.2",
+    version="0.2.3",
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
    long_description=open("README.md").read(),
    long_description_content_type="text/markdown",
		`@@ -0,0 +1,3 @@`
							`# Welcome to Crawl4AI! 🚀🤖`

							`Hi there, Developer! 👋 Here is an example of a research pipeline, where you can share a URL in your conversation with any LLM, and the content of the crawled pages will then be used as context.`