diff --git a/.files/screenshot.png b/.files/screenshot.png
new file mode 100644
index 00000000..c8005487
Binary files /dev/null and b/.files/screenshot.png differ
diff --git a/.gitignore b/.gitignore
index 846ac59a..407c5cdb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -174,4 +174,8 @@ requirements0.txt
a.txt
*.sh
-.idea
\ No newline at end of file
+.idea
+docs/examples/.chainlit/
+docs/examples/.chainlit/*
+.chainlit/config.toml
+.chainlit/translations/en-US.json
diff --git a/README.md b/README.md
index 4ea1fc9c..8c7efc60 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.2 š·ļøš¤
+# Crawl4AI v0.2.3 🕷️🤖
[](https://github.com/unclecode/crawl4ai/stargazers)
[](https://github.com/unclecode/crawl4ai/network/members)
@@ -12,6 +12,10 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
## Recent Changes
+### v0.2.3
+- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
+- 🖼️ Take [screenshots](#taking-screenshots) of the page.
+
### v0.2.2
- Support multiple JS scripts
- Fixed some of bugs
@@ -229,7 +233,7 @@ To use the REST API, send a POST request to `http://localhost:8000/crawl` with t
}
```
-For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters) section.
+For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters-) section.
## Python Library Usage š
@@ -262,6 +266,14 @@ Crawl result without raw HTML content:
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
```
+### Taking Screenshots
+
+```python
+result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
+with open("screenshot.png", "wb") as f:
+ f.write(base64.b64decode(result.screenshot))
+```
+
### Adding a chunking strategy: RegexChunking
Using RegexChunking:
@@ -368,6 +380,7 @@ result = crawler.run(url="https://www.nbcnews.com/business")
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
| `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
+| `screenshot` | Whether to take a screenshot of the page. | No | `false` |
| `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index 60d5c54f..b85055a5 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -7,6 +7,15 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import InvalidArgumentException
import logging
+import base64
+from PIL import Image, ImageDraw, ImageFont
+from io import BytesIO
+from typing import List
+import requests
+import os
+from pathlib import Path
+from .utils import wrap_text
+
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
logger.setLevel(logging.WARNING)
@@ -25,15 +34,16 @@ driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finde
driver_finder_logger.setLevel(logging.WARNING)
-from typing import List
-import requests
-import os
-from pathlib import Path
+
class CrawlerStrategy(ABC):
@abstractmethod
def crawl(self, url: str, **kwargs) -> str:
pass
+
+ @abstractmethod
+ def take_screenshot(self, save_path: str):
+ pass
class CloudCrawlerStrategy(CrawlerStrategy):
def __init__(self, use_cached_html = False):
@@ -132,5 +142,62 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
except Exception as e:
raise Exception(f"Failed to crawl {url}: {str(e)}")
+ def take_screenshot(self) -> str:
+ try:
+ # Get the dimensions of the page
+ total_width = self.driver.execute_script("return document.body.scrollWidth")
+ total_height = self.driver.execute_script("return document.body.scrollHeight")
+
+ # Set the window size to the dimensions of the page
+ self.driver.set_window_size(total_width, total_height)
+
+ # Take screenshot
+ screenshot = self.driver.get_screenshot_as_png()
+
+ # Open the screenshot with PIL
+ image = Image.open(BytesIO(screenshot))
+
+ # Convert to JPEG and compress
+ buffered = BytesIO()
+ image.save(buffered, format="JPEG", quality=85)
+ img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+
+ if self.verbose:
+ print(f"[LOG] šø Screenshot taken and converted to base64")
+
+ return img_base64
+
+ except Exception as e:
+ error_message = f"Failed to take screenshot: {str(e)}"
+ print(error_message)
+
+ # Generate an image with black background
+ img = Image.new('RGB', (800, 600), color='black')
+ draw = ImageDraw.Draw(img)
+
+ # Load a font
+ try:
+ font = ImageFont.truetype("arial.ttf", 40)
+ except IOError:
+ font = ImageFont.load_default(size=40)
+
+ # Define text color and wrap the text
+ text_color = (255, 255, 255)
+ max_width = 780
+ wrapped_text = wrap_text(draw, error_message, font, max_width)
+
+ # Calculate text position
+ text_position = (10, 10)
+
+ # Draw the text on the image
+ draw.text(text_position, wrapped_text, fill=text_color, font=font)
+
+ # Convert to base64
+ buffered = BytesIO()
+ img.save(buffered, format="JPEG")
+ img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+
+ return img_base64
+
def quit(self):
self.driver.quit()
\ No newline at end of file
diff --git a/crawl4ai/database.py b/crawl4ai/database.py
index 391d3f4f..380973b8 100644
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -1,13 +1,12 @@
import os
from pathlib import Path
import sqlite3
-from typing import Optional
from typing import Optional, Tuple
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
-
+
def init_db():
global DB_PATH
conn = sqlite3.connect(DB_PATH)
@@ -19,22 +18,35 @@ def init_db():
cleaned_html TEXT,
markdown TEXT,
extracted_content TEXT,
- success BOOLEAN
+ success BOOLEAN,
+ media TEXT DEFAULT "{}",
+ screenshot TEXT DEFAULT ""
)
''')
conn.commit()
conn.close()
-def check_db_path():
- if not DB_PATH:
- raise ValueError("Database path is not set or is empty.")
-
-def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
+def alter_db_add_screenshot(new_column: str = "media"):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
- cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,))
+ cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
+ conn.commit()
+ conn.close()
+ except Exception as e:
+ print(f"Error altering database to add screenshot column: {e}")
+
+def check_db_path():
+ if not DB_PATH:
+ raise ValueError("Database path is not set or is empty.")
+
+def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]:
+ check_db_path()
+ try:
+ conn = sqlite3.connect(DB_PATH)
+ cursor = conn.cursor()
+ cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, screenshot FROM crawled_data WHERE url = ?', (url,))
result = cursor.fetchone()
conn.close()
return result
@@ -42,21 +54,23 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
print(f"Error retrieving cached URL: {e}")
return None
-def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
+def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", screenshot: str = ""):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('''
- INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
- VALUES (?, ?, ?, ?, ?, ?)
+ INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, screenshot)
+ VALUES (?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
html = excluded.html,
cleaned_html = excluded.cleaned_html,
markdown = excluded.markdown,
extracted_content = excluded.extracted_content,
- success = excluded.success
- ''', (url, html, cleaned_html, markdown, extracted_content, success))
+ success = excluded.success,
+ media = excluded.media,
+ screenshot = excluded.screenshot
+ ''', (url, html, cleaned_html, markdown, extracted_content, success, media, screenshot))
conn.commit()
conn.close()
except Exception as e:
@@ -95,4 +109,20 @@ def flush_db():
conn.commit()
conn.close()
except Exception as e:
- print(f"Error flushing database: {e}")
\ No newline at end of file
+ print(f"Error flushing database: {e}")
+
+def update_existing_records(new_column: str = "media", default_value: str = "{}"):
+ check_db_path()
+ try:
+ conn = sqlite3.connect(DB_PATH)
+ cursor = conn.cursor()
+ cursor.execute(f'UPDATE crawled_data SET {new_column} = "{default_value}" WHERE screenshot IS NULL')
+ conn.commit()
+ conn.close()
+ except Exception as e:
+ print(f"Error updating existing records: {e}")
+
+if __name__ == "__main__":
+ init_db() # Initialize the database if not already initialized
+ alter_db_add_screenshot() # Add the new column to the table
+ update_existing_records() # Update existing records to set the new column to its default value
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index c2c2d61e..4a21579c 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -1,5 +1,5 @@
from pydantic import BaseModel, HttpUrl
-from typing import List
+from typing import List, Dict, Optional
class UrlModel(BaseModel):
url: HttpUrl
@@ -9,8 +9,10 @@ class CrawlResult(BaseModel):
url: str
html: str
success: bool
- cleaned_html: str = None
- markdown: str = None
- extracted_content: str = None
- metadata: dict = None
- error_message: str = None
\ No newline at end of file
+ cleaned_html: Optional[str] = None
+ media: Dict[str, List[Dict]] = {}
+ screenshot: Optional[str] = None
+ markdown: Optional[str] = None
+ extracted_content: Optional[str] = None
+ metadata: Optional[dict] = None
+ error_message: Optional[str] = None
\ No newline at end of file
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index cbeca812..cd6f7c93 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -180,6 +180,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
if tag.name != 'img':
tag.attrs = {}
+ # Extract all img tags into [{src: '', alt: ''}]
+ media = {
+ 'images': [],
+ 'videos': [],
+ 'audios': []
+ }
+ for img in body.find_all('img'):
+ media['images'].append({
+ 'src': img.get('src'),
+ 'alt': img.get('alt'),
+ "type": "image"
+ })
+
+ # Extract all video tags into [{src: '', alt: ''}]
+ for video in body.find_all('video'):
+ media['videos'].append({
+ 'src': video.get('src'),
+ 'alt': video.get('alt'),
+ "type": "video"
+ })
+
+ # Extract all audio tags into [{src: '', alt: ''}]
+ for audio in body.find_all('audio'):
+ media['audios'].append({
+ 'src': audio.get('src'),
+ 'alt': audio.get('alt'),
+ "type": "audio"
+ })
+
# Replace images with their alt text or remove them if no alt text is available
for img in body.find_all('img'):
alt_text = img.get('alt')
@@ -299,7 +328,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
return{
'markdown': markdown,
'cleaned_html': cleaned_html,
- 'success': True
+ 'success': True,
+ 'media': media
}
except Exception as e:
@@ -483,4 +513,16 @@ def process_sections(url: str, sections: list, provider: str, api_token: str) ->
for future in as_completed(futures):
extracted_content.extend(future.result())
- return extracted_content
\ No newline at end of file
+ return extracted_content
+
+
+def wrap_text(draw, text, font, max_width):
+ # Wrap the text to fit within the specified width
+ lines = []
+ words = text.split()
+ while words:
+ line = ''
+ while words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width:
+ line += (words.pop(0) + ' ')
+ lines.append(line)
+ return '\n'.join(lines)
\ No newline at end of file
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index 4535930c..f27bf8cf 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -59,6 +59,8 @@ class WebCrawler:
api_token: str = None,
extract_blocks_flag: bool = True,
word_count_threshold=MIN_WORD_THRESHOLD,
+ css_selector: str = None,
+ screenshot: bool = False,
use_cached_html: bool = False,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
@@ -70,6 +72,8 @@ class WebCrawler:
extraction_strategy or NoExtractionStrategy(),
chunking_strategy,
bypass_cache=url_model.forced,
+ css_selector=css_selector,
+ screenshot=screenshot,
**kwargs,
)
pass
@@ -83,6 +87,7 @@ class WebCrawler:
chunking_strategy: ChunkingStrategy = RegexChunking(),
bypass_cache: bool = False,
css_selector: str = None,
+ screenshot: bool = False,
verbose=True,
**kwargs,
) -> CrawlResult:
@@ -110,6 +115,8 @@ class WebCrawler:
"markdown": cached[3],
"extracted_content": cached[4],
"success": cached[5],
+ "media": json.loads(cached[6] or "{}"),
+ "screenshot": cached[7],
"error_message": "",
}
)
@@ -117,6 +124,9 @@ class WebCrawler:
# Initialize WebDriver for crawling
t = time.time()
html = self.crawler_strategy.crawl(url)
+ base64_image = None
+ if screenshot:
+ base64_image = self.crawler_strategy.take_screenshot()
success = True
error_message = ""
# Extract content from HTML
@@ -129,6 +139,7 @@ class WebCrawler:
cleaned_html = result.get("cleaned_html", html)
markdown = result.get("markdown", "")
+ media = result.get("media", [])
# Print a profession LOG style message, show time taken and say crawling is done
if verbose:
@@ -163,6 +174,8 @@ class WebCrawler:
markdown,
extracted_content,
success,
+ json.dumps(media),
+ screenshot=base64_image,
)
return CrawlResult(
@@ -170,6 +183,8 @@ class WebCrawler:
html=html,
cleaned_html=cleaned_html,
markdown=markdown,
+ media=media,
+ screenshot=base64_image,
extracted_content=extracted_content,
success=success,
error_message=error_message,
@@ -183,6 +198,8 @@ class WebCrawler:
extract_blocks_flag: bool = True,
word_count_threshold=MIN_WORD_THRESHOLD,
use_cached_html: bool = False,
+ css_selector: str = None,
+ screenshot: bool = False,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
**kwargs,
@@ -200,6 +217,8 @@ class WebCrawler:
[api_token] * len(url_models),
[extract_blocks_flag] * len(url_models),
[word_count_threshold] * len(url_models),
+ [css_selector] * len(url_models),
+ [screenshot] * len(url_models),
[use_cached_html] * len(url_models),
[extraction_strategy] * len(url_models),
[chunking_strategy] * len(url_models),
diff --git a/docs/examples/assets/audio.mp3 b/docs/examples/assets/audio.mp3
new file mode 100644
index 00000000..299149c6
Binary files /dev/null and b/docs/examples/assets/audio.mp3 differ
diff --git a/docs/examples/chainlit.md b/docs/examples/chainlit.md
new file mode 100644
index 00000000..3b34b02f
--- /dev/null
+++ b/docs/examples/chainlit.md
@@ -0,0 +1,3 @@
+# Welcome to Crawl4AI! 🚀🤖
+
+Hi there, Developer! 👋 Here is an example of a research pipeline: you can share a URL in your conversation with any LLM, and the content of the crawled pages will then be used as context.
\ No newline at end of file
diff --git a/docs/examples/chainlit_review.py b/docs/examples/chainlit_review.py
new file mode 100644
index 00000000..2c03d17d
--- /dev/null
+++ b/docs/examples/chainlit_review.py
@@ -0,0 +1,281 @@
+from openai import AsyncOpenAI
+from chainlit.types import ThreadDict
+import chainlit as cl
+from chainlit.input_widget import Select, Switch, Slider
+client = AsyncOpenAI()
+
+# Instrument the OpenAI client
+cl.instrument_openai()
+
+settings = {
+ "model": "gpt-3.5-turbo",
+ "temperature": 0.5,
+ "max_tokens": 500,
+ "top_p": 1,
+ "frequency_penalty": 0,
+ "presence_penalty": 0,
+}
+
+@cl.action_callback("action_button")
+async def on_action(action: cl.Action):
+ print("The user clicked on the action button!")
+
+ return "Thank you for clicking on the action button!"
+
+@cl.set_chat_profiles
+async def chat_profile():
+ return [
+ cl.ChatProfile(
+ name="GPT-3.5",
+ markdown_description="The underlying LLM model is **GPT-3.5**.",
+ icon="https://picsum.photos/200",
+ ),
+ cl.ChatProfile(
+ name="GPT-4",
+ markdown_description="The underlying LLM model is **GPT-4**.",
+ icon="https://picsum.photos/250",
+ ),
+ ]
+
+@cl.on_chat_start
+async def on_chat_start():
+
+ settings = await cl.ChatSettings(
+ [
+ Select(
+ id="Model",
+ label="OpenAI - Model",
+ values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"],
+ initial_index=0,
+ ),
+ Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True),
+ Slider(
+ id="Temperature",
+ label="OpenAI - Temperature",
+ initial=1,
+ min=0,
+ max=2,
+ step=0.1,
+ ),
+ Slider(
+ id="SAI_Steps",
+ label="Stability AI - Steps",
+ initial=30,
+ min=10,
+ max=150,
+ step=1,
+ description="Amount of inference steps performed on image generation.",
+ ),
+ Slider(
+ id="SAI_Cfg_Scale",
+ label="Stability AI - Cfg_Scale",
+ initial=7,
+ min=1,
+ max=35,
+ step=0.1,
+ description="Influences how strongly your generation is guided to match your prompt.",
+ ),
+ Slider(
+ id="SAI_Width",
+ label="Stability AI - Image Width",
+ initial=512,
+ min=256,
+ max=2048,
+ step=64,
+ tooltip="Measured in pixels",
+ ),
+ Slider(
+ id="SAI_Height",
+ label="Stability AI - Image Height",
+ initial=512,
+ min=256,
+ max=2048,
+ step=64,
+ tooltip="Measured in pixels",
+ ),
+ ]
+ ).send()
+
+ chat_profile = cl.user_session.get("chat_profile")
+ await cl.Message(
+ content=f"starting chat using the {chat_profile} chat profile"
+ ).send()
+
+ print("A new chat session has started!")
+ cl.user_session.set("session", {
+ "history": [],
+ "context": []
+ })
+
+ image = cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline")
+
+ # Attach the image to the message
+ await cl.Message(
+ content="You are such a good girl, aren't you?!",
+ elements=[image],
+ ).send()
+
+ text_content = "Hello, this is a text element."
+ elements = [
+ cl.Text(name="simple_text", content=text_content, display="inline")
+ ]
+
+ await cl.Message(
+ content="Check out this text element!",
+ elements=elements,
+ ).send()
+
+ elements = [
+ cl.Audio(path="./assets/audio.mp3", display="inline"),
+ ]
+ await cl.Message(
+ content="Here is an audio file",
+ elements=elements,
+ ).send()
+
+ await cl.Avatar(
+ name="Tool 1",
+ url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
+ ).send()
+
+ await cl.Message(
+ content="This message should not have an avatar!", author="Tool 0"
+ ).send()
+
+ await cl.Message(
+ content="This message should have an avatar!", author="Tool 1"
+ ).send()
+
+ elements = [
+ cl.File(
+ name="quickstart.py",
+ path="./quickstart.py",
+ display="inline",
+ ),
+ ]
+
+ await cl.Message(
+ content="This message has a file element", elements=elements
+ ).send()
+
+ # Sending an action button within a chatbot message
+ actions = [
+ cl.Action(name="action_button", value="example_value", description="Click me!")
+ ]
+
+ await cl.Message(content="Interact with this action button:", actions=actions).send()
+
+ # res = await cl.AskActionMessage(
+ # content="Pick an action!",
+ # actions=[
+ # cl.Action(name="continue", value="continue", label="ā
Continue"),
+ # cl.Action(name="cancel", value="cancel", label="ā Cancel"),
+ # ],
+ # ).send()
+
+ # if res and res.get("value") == "continue":
+ # await cl.Message(
+ # content="Continue!",
+ # ).send()
+
+ # import plotly.graph_objects as go
+ # fig = go.Figure(
+ # data=[go.Bar(y=[2, 1, 3])],
+ # layout_title_text="An example figure",
+ # )
+ # elements = [cl.Plotly(name="chart", figure=fig, display="inline")]
+
+ # await cl.Message(content="This message has a chart", elements=elements).send()
+
+ # Sending a pdf with the local file path
+ # elements = [
+ # cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf")
+ # ]
+
+ # cl.Message(content="Look at this local pdf!", elements=elements).send()
+
+@cl.on_settings_update
+async def setup_agent(settings):
+ print("on_settings_update", settings)
+
+@cl.on_stop
+def on_stop():
+ print("The user wants to stop the task!")
+
+@cl.on_chat_end
+def on_chat_end():
+ print("The user disconnected!")
+
+
+@cl.on_chat_resume
+async def on_chat_resume(thread: ThreadDict):
+ print("The user resumed a previous chat session!")
+
+
+
+
+# @cl.on_message
+async def on_message(message: cl.Message):
+ cl.user_session.get("session")["history"].append({
+ "role": "user",
+ "content": message.content
+ })
+ response = await client.chat.completions.create(
+ messages=[
+ {
+ "content": "You are a helpful bot",
+ "role": "system"
+ },
+ *cl.user_session.get("session")["history"]
+ ],
+ **settings
+ )
+
+
+ # Add assistant message to the history
+ cl.user_session.get("session")["history"].append({
+ "role": "assistant",
+ "content": response.choices[0].message.content
+ })
+
+ # msg.content = response.choices[0].message.content
+ # await msg.update()
+
+ # await cl.Message(content=response.choices[0].message.content).send()
+
+@cl.on_message
+async def on_message(message: cl.Message):
+ cl.user_session.get("session")["history"].append({
+ "role": "user",
+ "content": message.content
+ })
+
+ msg = cl.Message(content="")
+ await msg.send()
+
+ stream = await client.chat.completions.create(
+ messages=[
+ {
+ "content": "You are a helpful bot",
+ "role": "system"
+ },
+ *cl.user_session.get("session")["history"]
+ ],
+ stream = True,
+ **settings
+ )
+
+ async for part in stream:
+ if token := part.choices[0].delta.content or "":
+ await msg.stream_token(token)
+
+ # Add assistant message to the history
+ cl.user_session.get("session")["history"].append({
+ "role": "assistant",
+ "content": msg.content
+ })
+ await msg.update()
+
+if __name__ == "__main__":
+ from chainlit.cli import run_chainlit
+ run_chainlit(__file__)
\ No newline at end of file
diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py
index 6046c9bb..0fdd3772 100644
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -39,6 +39,16 @@ def basic_usage(crawler):
cprint("[LOG] š¦ [bold yellow]Basic crawl result:[/bold yellow]")
print_result(result)
+def screenshot_usage(crawler):
+ cprint("\nšø [bold cyan]Let's take a screenshot of the page![/bold cyan]")
+ result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
+ cprint("[LOG] š¦ [bold yellow]Screenshot result:[/bold yellow]")
+ # Save the screenshot to a file
+ with open("screenshot.png", "wb") as f:
+ f.write(base64.b64decode(result.screenshot))
+ cprint("Screenshot saved to 'screenshot.png'!")
+ print_result(result)
+
def understanding_parameters(crawler):
cprint("\nš§ [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
@@ -191,6 +201,7 @@ def main():
understanding_parameters(crawler)
crawler.always_by_pass_cache = True
+ screenshot_usage(crawler)
add_chunking_strategy(crawler)
add_extraction_strategy(crawler)
add_llm_extraction_strategy(crawler)
diff --git a/docs/examples/research_assistant.py b/docs/examples/research_assistant.py
new file mode 100644
index 00000000..620c5bdd
--- /dev/null
+++ b/docs/examples/research_assistant.py
@@ -0,0 +1,241 @@
+# Make sure to install the required packages: chainlit and groq
+import os, time
+from openai import AsyncOpenAI
+import chainlit as cl
+import re
+import requests
+from io import BytesIO
+from chainlit.element import ElementBased
+from groq import Groq
+
+# Import threadpools to run the crawl_url function in a separate thread
+from concurrent.futures import ThreadPoolExecutor
+
+client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
+
+# Instrument the OpenAI client
+cl.instrument_openai()
+
+settings = {
+ "model": "llama3-8b-8192",
+ "temperature": 0.5,
+ "max_tokens": 500,
+ "top_p": 1,
+ "frequency_penalty": 0,
+ "presence_penalty": 0,
+}
+
+def extract_urls(text):
+ url_pattern = re.compile(r'(https?://\S+)')
+ return url_pattern.findall(text)
+
+def crawl_url(url):
+ data = {
+ "urls": [url],
+ "include_raw_html": True,
+ "word_count_threshold": 10,
+ "extraction_strategy": "NoExtractionStrategy",
+ "chunking_strategy": "RegexChunking"
+ }
+ response = requests.post("https://crawl4ai.com/crawl", json=data)
+ response_data = response.json()
+ response_data = response_data['results'][0]
+ return response_data['markdown']
+
+@cl.on_chat_start
+async def on_chat_start():
+ cl.user_session.set("session", {
+ "history": [],
+ "context": {}
+ })
+ await cl.Message(
+ content="Welcome to the chat! How can I assist you today?"
+ ).send()
+
+@cl.on_message
+async def on_message(message: cl.Message):
+ user_session = cl.user_session.get("session")
+
+ # Extract URLs from the user's message
+ urls = extract_urls(message.content)
+
+
+ futures = []
+ with ThreadPoolExecutor() as executor:
+ for url in urls:
+ futures.append(executor.submit(crawl_url, url))
+
+ results = [future.result() for future in futures]
+
+ for url, result in zip(urls, results):
+ ref_number = f"REF_{len(user_session['context']) + 1}"
+ user_session["context"][ref_number] = {
+ "url": url,
+ "content": result
+ }
+
+ # for url in urls:
+ # # Crawl the content of each URL and add it to the session context with a reference number
+ # ref_number = f"REF_{len(user_session['context']) + 1}"
+ # crawled_content = crawl_url(url)
+ # user_session["context"][ref_number] = {
+ # "url": url,
+ # "content": crawled_content
+ # }
+
+ user_session["history"].append({
+ "role": "user",
+ "content": message.content
+ })
+
+ # Create a system message that includes the context
+ context_messages = [
+ f'
crawler.always_by_pass_cache = True
result = crawler.run(
+ url="https://www.nbcnews.com/business",
+ screenshot=True
+)
+with open("screenshot.png", "wb") as f:
+ f.write(base64.b64decode(result.screenshot))
+ js_code = """
+ js_code = ["""
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
-"""
+"""]
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
result = crawler.run(url="https://www.nbcnews.com/business")
+ Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.