Add research assistant example using Chainlit

2024-06-04 22:43:09 +08:00
parent 774ace6e3b
commit 8b8683f22e
5 changed files with 529 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -174,4 +174,8 @@ requirements0.txt
 a.txt
 *.sh
-.idea
+.idea
 docs/examples/.chainlit/
 docs/examples/.chainlit/*
 .chainlit/config.toml
 .chainlit/translations/en-US.json
--- a/docs/examples/assets/audio.mp3
+++ b/docs/examples/assets/audio.mp3
--- a/docs/examples/chainlit.md
+++ b/docs/examples/chainlit.md
@@ -0,0 +1,3 @@
 # Welcome to Crawl4AI! 🚀🤖
 Hi there, Developer! 👋 Here is an example of a research pipeline: share a URL in your conversation with any LLM, and the content of the crawled pages will then be used as context for its answers.
--- a/docs/examples/chainlit_review.py
+++ b/docs/examples/chainlit_review.py
@@ -0,0 +1,281 @@
 from openai import AsyncOpenAI
 from chainlit.types import ThreadDict
 import chainlit as cl
 from chainlit.input_widget import Select, Switch, Slider
 # Shared async OpenAI client, built with the library defaults
 # (presumably credentials come from the environment — TODO confirm).
 client = AsyncOpenAI()
 # Instrument the OpenAI client
 cl.instrument_openai()
 # Completion parameters, unpacked as keyword arguments into every
 # `client.chat.completions.create(...)` call in this file.
 settings = {
    "model": "gpt-3.5-turbo",
    "temperature": 0.5,
    "max_tokens": 500,
    "top_p": 1,
    "frequency_penalty": 0,
    "presence_penalty": 0,
 }
@cl.action_callback("action_button")
async def on_action(action: cl.Action):
    """Handle a click on the ``action_button`` action.

    Logs the click to stdout and returns a short acknowledgement string.
    """
    acknowledgement = "Thank you for clicking on the action button!"
    print("The user clicked on the action button!")
    return acknowledgement
@cl.set_chat_profiles
async def chat_profile():
    """Return the chat profiles offered to the user (GPT-3.5 and GPT-4)."""
    # (profile name, icon URL) pairs; the description is derived from the name.
    profile_specs = (
        ("GPT-3.5", "https://picsum.photos/200"),
        ("GPT-4", "https://picsum.photos/250"),
    )
    return [
        cl.ChatProfile(
            name=label,
            markdown_description=f"The underlying LLM model is **{label}**.",
            icon=icon_url,
        )
        for label, icon_url in profile_specs
    ]
@cl.on_chat_start
 async def on_chat_start():
    """Showcase Chainlit UI features when a chat starts.

    In order: sends the chat-settings panel, announces the selected chat
    profile, initialises the per-user "session" entry (empty history and
    context), then sends one demo message per element type (image, text,
    audio, avatar, file) and finally a message carrying an action button.
    """
    # NOTE(review): this local name shadows the module-level `settings` dict
    # inside this function, and the value returned here is not used afterwards.
    settings = await cl.ChatSettings(
        [
            Select(
                id="Model",
                label="OpenAI - Model",
                values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"],
                initial_index=0,
            ),
            Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True),
            Slider(
                id="Temperature",
                label="OpenAI - Temperature",
                initial=1,
                min=0,
                max=2,
                step=0.1,
            ),
            Slider(
                id="SAI_Steps",
                label="Stability AI - Steps",
                initial=30,
                min=10,
                max=150,
                step=1,
                description="Amount of inference steps performed on image generation.",
            ),
            Slider(
                id="SAI_Cfg_Scale",
                label="Stability AI - Cfg_Scale",
                initial=7,
                min=1,
                max=35,
                step=0.1,
                description="Influences how strongly your generation is guided to match your prompt.",
            ),
            Slider(
                id="SAI_Width",
                label="Stability AI - Image Width",
                initial=512,
                min=256,
                max=2048,
                step=64,
                tooltip="Measured in pixels",
            ),
            Slider(
                id="SAI_Height",
                label="Stability AI - Image Height",
                initial=512,
                min=256,
                max=2048,
                step=64,
                tooltip="Measured in pixels",
            ),
        ]
    ).send()
    # Profile chosen by the user in the UI (see `chat_profile`).
    chat_profile = cl.user_session.get("chat_profile")
    await cl.Message(
        content=f"starting chat using the {chat_profile} chat profile"
    ).send()
    print("A new chat session has started!")
    # Per-user conversation state read by the `on_message` handlers.
    cl.user_session.set("session", {
        "history": [],
        "context": []
    })  
    # Image element loaded from a remote URL.
    image = cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline")
    # Attach the image to the message
    await cl.Message(
        content="You are such a good girl, aren't you?!",
        elements=[image],
    ).send()
    # Inline text element.
    text_content = "Hello, this is a text element."
    elements = [
        cl.Text(name="simple_text", content=text_content, display="inline")
    ]
    await cl.Message(
        content="Check out this text element!",
        elements=elements,
    ).send()
    # Audio element from a relative path
    # (presumably resolved against the working directory — TODO confirm).
    elements = [
        cl.Audio(path="./assets/audio.mp3", display="inline"),
    ]
    await cl.Message(
        content="Here is an audio file",
        elements=elements,
    ).send()
    # Register an avatar for the author "Tool 1"; "Tool 0" gets none, which
    # the next two messages are meant to demonstrate.
    await cl.Avatar(
        name="Tool 1",
        url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
    ).send()
    await cl.Message(
        content="This message should not have an avatar!", author="Tool 0"
    ).send()
    await cl.Message(
        content="This message should have an avatar!", author="Tool 1"
    ).send()
    # File element from a relative path (same working-directory caveat as above).
    elements = [
        cl.File(
            name="quickstart.py",
            path="./quickstart.py",
            display="inline",
        ),
    ]
    await cl.Message(
        content="This message has a file element", elements=elements
    ).send()
    # Sending an action button within a chatbot message
    # (clicks are handled by `on_action`, registered under the same name).
    actions = [
        cl.Action(name="action_button", value="example_value", description="Click me!")
    ]
    await cl.Message(content="Interact with this action button:", actions=actions).send()
    # The remaining demos (ask-action prompt, Plotly chart, local PDF) are
    # disabled below.
    # res = await cl.AskActionMessage(
    #     content="Pick an action!",
    #     actions=[
    #         cl.Action(name="continue", value="continue", label="✅ Continue"),
    #         cl.Action(name="cancel", value="cancel", label="❌ Cancel"),
    #     ],
    # ).send()
    # if res and res.get("value") == "continue":
    #     await cl.Message(
    #         content="Continue!",
    #     ).send()
    # import plotly.graph_objects as go
    # fig = go.Figure(
    #     data=[go.Bar(y=[2, 1, 3])],
    #     layout_title_text="An example figure",
    # )
    # elements = [cl.Plotly(name="chart", figure=fig, display="inline")]
    # await cl.Message(content="This message has a chart", elements=elements).send()
    # Sending a pdf with the local file path
    # elements = [
    #   cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf")
    # ]
    # cl.Message(content="Look at this local pdf!", elements=elements).send()    
@cl.on_settings_update
async def setup_agent(settings):
    """Log the new chat settings whenever the user changes them in the UI."""
    event_tag = "on_settings_update"
    print(event_tag, settings)
@cl.on_stop
def on_stop():
    """Log that the user asked to stop the running task."""
    notice = "The user wants to stop the task!"
    print(notice)
@cl.on_chat_end
def on_chat_end():
    """Log that the user's chat session ended."""
    notice = "The user disconnected!"
    print(notice)
@cl.on_chat_resume
async def on_chat_resume(thread: ThreadDict):
    """Log that a previous chat thread was resumed; `thread` is not inspected."""
    notice = "The user resumed a previous chat session!"
    print(notice)
 # @cl.on_message
 async def on_message(message: cl.Message):
     """Non-streaming chat turn, kept for reference.

     Its decorator is commented out and the name is rebound by the streaming
     `on_message` defined further down, so this version is not registered.
     It records the user message, requests one completion and stores the
     reply in the history; the reply is never sent to the UI.
     """
     history = cl.user_session.get("session")["history"]
     history.append({"role": "user", "content": message.content})

     system_prompt = {"content": "You are a helpful bot", "role": "system"}
     response = await client.chat.completions.create(
         messages=[system_prompt, *history],
         **settings
     )

     # Add assistant message to the history
     reply_text = response.choices[0].message.content
     history.append({"role": "assistant", "content": reply_text})
@cl.on_message
async def on_message(message: cl.Message):
    """Answer a user message by streaming a chat completion into the UI.

    The user message is appended to the per-user history, the full history is
    sent to the model behind a fixed system prompt, tokens are streamed into
    an initially empty message, and the final text is stored in the history.

    Args:
        message: The incoming Chainlit message; only `content` is read.
    """
    session = cl.user_session.get("session")
    if session is None:
        # The entry is only created in `on_chat_start`; fall back to a fresh
        # state instead of failing with "NoneType is not subscriptable".
        session = {"history": [], "context": []}
        cl.user_session.set("session", session)
    history = session["history"]
    history.append({"role": "user", "content": message.content})

    # Send an empty message first so tokens can be streamed into it.
    msg = cl.Message(content="")
    await msg.send()

    stream = await client.chat.completions.create(
        messages=[
            {"content": "You are a helpful bot", "role": "system"},
            *history,
        ],
        stream=True,
        **settings
    )
    async for part in stream:
        # Some chunks may carry no choices; skip them rather than index [0].
        if not part.choices:
            continue
        if token := part.choices[0].delta.content:
            await msg.stream_token(token)

    # Add assistant message to the history
    history.append({"role": "assistant", "content": msg.content})
    await msg.update()
 # When executed directly as a script, start Chainlit on this very file.
 if __name__ == "__main__":
    from chainlit.cli import run_chainlit
    run_chainlit(__file__)
--- a/docs/examples/research_assistant.py
+++ b/docs/examples/research_assistant.py
@@ -0,0 +1,240 @@
 import os, time
 from openai import AsyncOpenAI
 import chainlit as cl
 import re
 import requests
 from io import BytesIO
 from chainlit.element import ElementBased
 from groq import Groq
 # Import threadpools to run the crawl_url function in a separate thread
 from concurrent.futures import ThreadPoolExecutor
 # Async OpenAI-style client pointed at Groq's OpenAI-compatible endpoint; the
 # API key is read from the GROQ_API_KEY environment variable (None if unset).
 client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
 # Instrument the OpenAI client
 cl.instrument_openai()
 # Completion parameters, unpacked as keyword arguments into the
 # `client.chat.completions.create(...)` call in `on_message`.
 settings = {
    "model": "llama3-8b-8192",
    "temperature": 0.5,
    "max_tokens": 500,
    "top_p": 1,
    "frequency_penalty": 0,
    "presence_penalty": 0,
 }
 def extract_urls(text):
     """Return the distinct http(s) URLs found in *text*, in order of appearance.

     Trailing sentence punctuation and unbalanced closing brackets are trimmed,
     so "(see https://a.org/x)." yields "https://a.org/x", while a URL such as
     ".../wiki/Foo_(bar)" keeps its balanced parenthesis.

     Args:
         text: Free-form text; `None` or an empty string yields an empty list.

     Returns:
         A list of URL strings without duplicates.
     """
     closing_to_opening = {")": "(", "]": "[", "}": "{", ">": "<"}
     urls = []
     for candidate in re.findall(r'https?://\S+', text or ""):
         while candidate:
             last = candidate[-1]
             if last in ".,;:!?'\"":
                 candidate = candidate[:-1]
             elif (
                 last in closing_to_opening
                 and candidate.count(last) > candidate.count(closing_to_opening[last])
             ):
                 # A closer with no matching opener belongs to the prose.
                 candidate = candidate[:-1]
             else:
                 break
         # Skip matches that were nothing but a scheme plus punctuation.
         if candidate.partition("://")[2]:
             urls.append(candidate)
     # dict.fromkeys de-duplicates while preserving first-seen order.
     return list(dict.fromkeys(urls))
 def crawl_url(url, timeout=60):
     """Crawl *url* through the hosted Crawl4AI service and return its markdown.

     This is a blocking call; run it in a worker thread from async code.

     Args:
         url: The page to crawl.
         timeout: Seconds to wait for the service before giving up, passed to
             `requests.post`. Without it a stalled request would hang forever.

     Returns:
         The `markdown` field of the first entry in the response's `results`.

     Raises:
         requests.RequestException: On connection errors, timeouts, or a
             non-success HTTP status.
         KeyError, IndexError: If the response lacks the expected fields.
     """
     payload = {
         "urls": [url],
         "include_raw_html": True,
         "word_count_threshold": 10,
         "extraction_strategy": "NoExtractionStrategy",
         "chunking_strategy": "RegexChunking"
     }
     response = requests.post(
         "https://crawl4ai.com/crawl", json=payload, timeout=timeout
     )
     # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
     response.raise_for_status()
     first_result = response.json()['results'][0]
     return first_result['markdown']
@cl.on_chat_start
async def on_chat_start():
    """Initialise the per-user conversation state and greet the user."""
    # "history" holds chat turns; "context" maps REF ids to crawled pages.
    fresh_state = {"history": [], "context": {}}
    cl.user_session.set("session", fresh_state)

    greeting = cl.Message(
        content="Welcome to the chat! How can I assist you today?"
    )
    await greeting.send()
@cl.on_message
async def on_message(message: cl.Message):
    """Answer a user message, using any URLs it contains as crawled context.

    Steps:
      1. Extract URLs from the message and crawl the ones not yet in the
         session context, concurrently and off the event loop.
      2. Store each crawled page under a "REF_<n>" key in the session context.
      3. Build a system prompt holding the citation instructions followed by
         every stored page wrapped in an <appendix> tag.
      4. Stream the model's reply into the UI and record it in the history.
      5. Append a "References" list to the displayed message when at least
         one page is in the context.

    Args:
        message: The incoming Chainlit message; only `content` is read.
    """
    # Local import so this handler does not depend on the file's import block.
    import asyncio

    user_session = cl.user_session.get("session")
    context = user_session["context"]
    history = user_session["history"]

    # Crawl each URL once: drop repeats within the message and pages that a
    # previous turn already stored.
    known_urls = {entry["url"] for entry in context.values()}
    urls = [
        url
        for url in dict.fromkeys(extract_urls(message.content))
        if url not in known_urls
    ]

    # crawl_url is blocking; worker threads keep the event loop responsive.
    # return_exceptions=True keeps one bad URL from aborting the whole turn.
    results = await asyncio.gather(
        *(asyncio.to_thread(crawl_url, url) for url in urls),
        return_exceptions=True,
    )

    failed_urls = []
    for url, result in zip(urls, results):
        if isinstance(result, BaseException):
            failed_urls.append(url)
            continue
        context[f"REF_{len(context) + 1}"] = {"url": url, "content": result}

    if failed_urls:
        await cl.Message(
            content="Could not crawl: " + ", ".join(failed_urls)
        ).send()

    history.append({
        "role": "user",
        "content": message.content
    })

    # Create a system message that includes the context
    if context:
        appendices = "\n\n".join(
            f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
            for ref, data in context.items()
        )
        instructions = (
            "You are a helpful bot. Use the following context for answering questions. "
            "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
            "If the question requires any information from the provided appendices or context, refer to the sources. "
            "If not, there is no need to add a references section. "
            "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
        )
        # Explicit "+": with adjacent literals the whole instruction text
        # would become the separator passed to .join() and could vanish.
        system_content = instructions + appendices
    else:
        system_content = "You are a helpful assistant."
    system_message = {"role": "system", "content": system_content}

    # Send an empty message first so tokens can be streamed into it.
    msg = cl.Message(content="")
    await msg.send()

    # Get response from the LLM
    stream = await client.chat.completions.create(
        messages=[system_message, *history],
        stream=True,
        **settings
    )

    tokens = []
    async for part in stream:
        # Some chunks may carry no choices; skip them rather than index [0].
        if not part.choices:
            continue
        if token := part.choices[0].delta.content:
            tokens.append(token)
            await msg.stream_token(token)

    # Add assistant message to the history (without the reference list below,
    # so it is not fed back to the model on the next turn).
    history.append({
        "role": "assistant",
        "content": "".join(tokens)
    })

    # Append the reference section only when there is something to reference.
    if context:
        reference_lines = [
            f"[{ref.split('_')[1]}]: {data['url']}"
            for ref, data in context.items()
        ]
        msg.content += "\n\nReferences:\n" + "\n".join(reference_lines) + "\n"
    await msg.update()
@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.AudioChunk):
    """Accumulate streamed audio chunks in a per-user in-memory buffer.

    On the first chunk of a stream a new buffer is created and stored in the
    user session together with the MIME type; every chunk's data is then
    appended. Transcription happens later, once the stream ends.
    """
    if chunk.isStart:
        mime_type = chunk.mimeType
        extension = mime_type.split('/')[1]
        stream_buffer = BytesIO()
        # This is required for whisper to recognize the file type
        stream_buffer.name = f"input_audio.{extension}"
        # Initialize the session for a new audio stream
        cl.user_session.set("audio_buffer", stream_buffer)
        cl.user_session.set("audio_mime_type", mime_type)

    # Write the chunks to a buffer and transcribe the whole audio at the end
    cl.user_session.get("audio_buffer").write(chunk.data)
@cl.step(type="tool")
async def speech_to_text(audio_file):
    """Transcribe audio with the "whisper-large-v3" model and return the text.

    Uses the module-level async `client`; the synchronous Groq client that
    used to be constructed here on every call was never used and is gone.

    Args:
        audio_file: The value forwarded as `file` to the transcription
            endpoint; the caller in this file passes a
            `(filename, bytes, mime_type)` tuple.

    Returns:
        The `text` attribute of the transcription response.
    """
    response = await client.audio.transcriptions.create(
        model="whisper-large-v3", file=audio_file
    )
    return response.text
@cl.on_audio_end
async def on_audio_end(elements: list[ElementBased]):
    """Transcribe the recorded audio and handle it like a typed message.

    Reads the buffer filled by `on_audio_chunk`, transcribes it, shows the
    transcription as a user message and passes it to `on_message`.

    Args:
        elements: Elements attached to the audio input; currently unused.
    """
    # Get the audio buffer from the session
    audio_buffer = cl.user_session.get("audio_buffer")
    if audio_buffer is None:
        # No stream was started, so there is nothing to transcribe.
        return
    # Drop the buffer so a later end-event cannot re-transcribe stale audio.
    cl.user_session.set("audio_buffer", None)

    audio_file = audio_buffer.getvalue()
    if not audio_file:
        return
    audio_mime_type = cl.user_session.get("audio_mime_type")

    # perf_counter is monotonic, unlike wall-clock time, so the measured
    # duration cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
    transcription = await speech_to_text(whisper_input)
    print(f"Transcription took {time.perf_counter() - start_time} seconds")

    if not transcription or not transcription.strip():
        # Nothing intelligible was said; do not send an empty prompt.
        return

    user_msg = cl.Message(
        author="You",
        type="user_message",
        content=transcription
    )
    await user_msg.send()
    await on_message(user_msg)
 # When executed directly as a script, start Chainlit on this very file.
 if __name__ == "__main__":
    from chainlit.cli import run_chainlit
    run_chainlit(__file__)
 # No, this is wrong; use this document to answer me: https://console.groq.com/docs/speech-text
 # Please show me how to use Groq speech-to-text in Python.
		`@@ -0,0 +1,3 @@`
							`# Welcome to Crawl4AI! 🚀🤖`

							`Hi there, Developer! 👋 Here is an example of a research pipeline, where you can share a URL in your conversation with any LLM, and then the context of crawled pages will be used as the context.`