diff --git a/.files/screenshot.png b/.files/screenshot.png new file mode 100644 index 00000000..c8005487 Binary files /dev/null and b/.files/screenshot.png differ diff --git a/.gitignore b/.gitignore index 846ac59a..407c5cdb 100644 --- a/.gitignore +++ b/.gitignore @@ -174,4 +174,8 @@ requirements0.txt a.txt *.sh -.idea \ No newline at end of file +.idea +docs/examples/.chainlit/ +docs/examples/.chainlit/* +.chainlit/config.toml +.chainlit/translations/en-US.json diff --git a/README.md b/README.md index 4ea1fc9c..8c7efc60 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.2 šŸ•·ļøšŸ¤– +# Crawl4AI v0.2.3 šŸ•·ļøšŸ¤– [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) @@ -12,6 +12,10 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information ## Recent Changes +### v0.2.3 +- šŸŽØ Extract and return all media tags (Images, Audio, and Video). Check `result.media` +- šŸ–¼ļø Take [screenshots](#taking-screenshots) of the page. + ### v0.2.2 - Support multiple JS scripts - Fixed some of bugs @@ -229,7 +233,7 @@ To use the REST API, send a POST request to `http://localhost:8000/crawl` with t } ``` -For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters) section. +For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters-) section. 
## Python Library Usage šŸš€ @@ -262,6 +266,14 @@ Crawl result without raw HTML content: result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False) ``` +### Taking Screenshots + +```python +result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True) +with open("screenshot.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) +``` + ### Adding a chunking strategy: RegexChunking Using RegexChunking: @@ -368,6 +380,7 @@ result = crawler.run(url="https://www.nbcnews.com/business") | `urls` | A list of URLs to crawl and extract data from. | Yes | - | | `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` | | `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` | +| `screenshots` | Whether to take screenshots of the page. | No | `false` | | `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` | | `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` | | `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). 
| No | `RegexChunking` | diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 60d5c54f..b85055a5 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -7,6 +7,15 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options from selenium.common.exceptions import InvalidArgumentException import logging +import base64 +from PIL import Image, ImageDraw, ImageFont +from io import BytesIO +from typing import List +import requests +import os +from pathlib import Path +from .utils import wrap_text + logger = logging.getLogger('selenium.webdriver.remote.remote_connection') logger.setLevel(logging.WARNING) @@ -25,15 +34,16 @@ driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finde driver_finder_logger.setLevel(logging.WARNING) -from typing import List -import requests -import os -from pathlib import Path + class CrawlerStrategy(ABC): @abstractmethod def crawl(self, url: str, **kwargs) -> str: pass + + @abstractmethod + def take_screenshot(self, save_path: str): + pass class CloudCrawlerStrategy(CrawlerStrategy): def __init__(self, use_cached_html = False): @@ -132,5 +142,62 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): except Exception as e: raise Exception(f"Failed to crawl {url}: {str(e)}") + def take_screenshot(self) -> str: + try: + # Get the dimensions of the page + total_width = self.driver.execute_script("return document.body.scrollWidth") + total_height = self.driver.execute_script("return document.body.scrollHeight") + + # Set the window size to the dimensions of the page + self.driver.set_window_size(total_width, total_height) + + # Take screenshot + screenshot = self.driver.get_screenshot_as_png() + + # Open the screenshot with PIL + image = Image.open(BytesIO(screenshot)) + + # Convert to JPEG and compress + buffered = BytesIO() + image.save(buffered, format="JPEG", quality=85) + img_base64 = 
base64.b64encode(buffered.getvalue()).decode('utf-8') + + if self.verbose: + print(f"[LOG] šŸ“ø Screenshot taken and converted to base64") + + return img_base64 + + except Exception as e: + error_message = f"Failed to take screenshot: {str(e)}" + print(error_message) + + # Generate an image with black background + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + + # Load a font + try: + font = ImageFont.truetype("arial.ttf", 40) + except IOError: + font = ImageFont.load_default(size=40) + + # Define text color and wrap the text + text_color = (255, 255, 255) + max_width = 780 + wrapped_text = wrap_text(draw, error_message, font, max_width) + + # Calculate text position + text_position = (10, 10) + + # Draw the text on the image + draw.text(text_position, wrapped_text, fill=text_color, font=font) + + # Convert to base64 + buffered = BytesIO() + img.save(buffered, format="JPEG") + img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8') + + return img_base64 + def quit(self): self.driver.quit() \ No newline at end of file diff --git a/crawl4ai/database.py b/crawl4ai/database.py index 391d3f4f..380973b8 100644 --- a/crawl4ai/database.py +++ b/crawl4ai/database.py @@ -1,13 +1,12 @@ import os from pathlib import Path import sqlite3 -from typing import Optional from typing import Optional, Tuple DB_PATH = os.path.join(Path.home(), ".crawl4ai") os.makedirs(DB_PATH, exist_ok=True) DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") - + def init_db(): global DB_PATH conn = sqlite3.connect(DB_PATH) @@ -19,22 +18,35 @@ def init_db(): cleaned_html TEXT, markdown TEXT, extracted_content TEXT, - success BOOLEAN + success BOOLEAN, + media TEXT DEFAULT "{}", + screenshot TEXT DEFAULT "" ) ''') conn.commit() conn.close() -def check_db_path(): - if not DB_PATH: - raise ValueError("Database path is not set or is empty.") - -def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]: +def alter_db_add_screenshot(new_column: str 
= "media"): check_db_path() try: conn = sqlite3.connect(DB_PATH) cursor = conn.cursor() - cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,)) + cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') + conn.commit() + conn.close() + except Exception as e: + print(f"Error altering database to add screenshot column: {e}") + +def check_db_path(): + if not DB_PATH: + raise ValueError("Database path is not set or is empty.") + +def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]: + check_db_path() + try: + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, screenshot FROM crawled_data WHERE url = ?', (url,)) result = cursor.fetchone() conn.close() return result @@ -42,21 +54,23 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]: print(f"Error retrieving cached URL: {e}") return None -def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool): +def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", screenshot: str = ""): check_db_path() try: conn = sqlite3.connect(DB_PATH) cursor = conn.cursor() cursor.execute(''' - INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success) - VALUES (?, ?, ?, ?, ?, ?) + INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, screenshot) + VALUES (?, ?, ?, ?, ?, ?, ?) 
ON CONFLICT(url) DO UPDATE SET html = excluded.html, cleaned_html = excluded.cleaned_html, markdown = excluded.markdown, extracted_content = excluded.extracted_content, - success = excluded.success - ''', (url, html, cleaned_html, markdown, extracted_content, success)) + success = excluded.success, + media = excluded.media, + screenshot = excluded.screenshot + ''', (url, html, cleaned_html, markdown, extracted_content, success, media, screenshot)) conn.commit() conn.close() except Exception as e: @@ -95,4 +109,20 @@ def flush_db(): conn.commit() conn.close() except Exception as e: - print(f"Error flushing database: {e}") \ No newline at end of file + print(f"Error flushing database: {e}") + +def update_existing_records(new_column: str = "media", default_value: str = "{}"): + check_db_path() + try: + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + cursor.execute(f'UPDATE crawled_data SET {new_column} = "{default_value}" WHERE screenshot IS NULL') + conn.commit() + conn.close() + except Exception as e: + print(f"Error updating existing records: {e}") + +if __name__ == "__main__": + init_db() # Initialize the database if not already initialized + alter_db_add_screenshot() # Add the new column to the table + update_existing_records() # Update existing records to set the new column to an empty string diff --git a/crawl4ai/models.py b/crawl4ai/models.py index c2c2d61e..4a21579c 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,5 +1,5 @@ from pydantic import BaseModel, HttpUrl -from typing import List +from typing import List, Dict, Optional class UrlModel(BaseModel): url: HttpUrl @@ -9,8 +9,10 @@ class CrawlResult(BaseModel): url: str html: str success: bool - cleaned_html: str = None - markdown: str = None - extracted_content: str = None - metadata: dict = None - error_message: str = None \ No newline at end of file + cleaned_html: Optional[str] = None + media: Dict[str, List[Dict]] = {} + screenshot: Optional[str] = None + markdown: 
Optional[str] = None + extracted_content: Optional[str] = None + metadata: Optional[dict] = None + error_message: Optional[str] = None \ No newline at end of file diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index cbeca812..cd6f7c93 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -180,6 +180,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_ if tag.name != 'img': tag.attrs = {} + # Extract all img tags into [{src: '', alt: ''}] + media = { + 'images': [], + 'videos': [], + 'audios': [] + } + for img in body.find_all('img'): + media['images'].append({ + 'src': img.get('src'), + 'alt': img.get('alt'), + "type": "image" + }) + + # Extract all video tags into [{src: '', alt: ''}] + for video in body.find_all('video'): + media['videos'].append({ + 'src': video.get('src'), + 'alt': video.get('alt'), + "type": "video" + }) + + # Extract all audio tags into [{src: '', alt: ''}] + for audio in body.find_all('audio'): + media['audios'].append({ + 'src': audio.get('src'), + 'alt': audio.get('alt'), + "type": "audio" + }) + # Replace images with their alt text or remove them if no alt text is available for img in body.find_all('img'): alt_text = img.get('alt') @@ -299,7 +328,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_ return{ 'markdown': markdown, 'cleaned_html': cleaned_html, - 'success': True + 'success': True, + 'media': media } except Exception as e: @@ -483,4 +513,16 @@ def process_sections(url: str, sections: list, provider: str, api_token: str) -> for future in as_completed(futures): extracted_content.extend(future.result()) - return extracted_content \ No newline at end of file + return extracted_content + + +def wrap_text(draw, text, font, max_width): + # Wrap the text to fit within the specified width + lines = [] + words = text.split() + while words: + line = '' + while words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width: + line += (words.pop(0) + ' 
') + lines.append(line) + return '\n'.join(lines) \ No newline at end of file diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 4535930c..f27bf8cf 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -59,6 +59,8 @@ class WebCrawler: api_token: str = None, extract_blocks_flag: bool = True, word_count_threshold=MIN_WORD_THRESHOLD, + css_selector: str = None, + screenshot: bool = False, use_cached_html: bool = False, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), @@ -70,6 +72,8 @@ class WebCrawler: extraction_strategy or NoExtractionStrategy(), chunking_strategy, bypass_cache=url_model.forced, + css_selector=css_selector, + screenshot=screenshot, **kwargs, ) pass @@ -83,6 +87,7 @@ class WebCrawler: chunking_strategy: ChunkingStrategy = RegexChunking(), bypass_cache: bool = False, css_selector: str = None, + screenshot: bool = False, verbose=True, **kwargs, ) -> CrawlResult: @@ -110,6 +115,8 @@ class WebCrawler: "markdown": cached[3], "extracted_content": cached[4], "success": cached[5], + "media": json.loads(cached[6] or "{}"), + "screenshot": cached[7], "error_message": "", } ) @@ -117,6 +124,9 @@ class WebCrawler: # Initialize WebDriver for crawling t = time.time() html = self.crawler_strategy.crawl(url) + base64_image = None + if screenshot: + base64_image = self.crawler_strategy.take_screenshot() success = True error_message = "" # Extract content from HTML @@ -129,6 +139,7 @@ class WebCrawler: cleaned_html = result.get("cleaned_html", html) markdown = result.get("markdown", "") + media = result.get("media", []) # Print a profession LOG style message, show time taken and say crawling is done if verbose: @@ -163,6 +174,8 @@ class WebCrawler: markdown, extracted_content, success, + json.dumps(media), + screenshot=base64_image, ) return CrawlResult( @@ -170,6 +183,8 @@ class WebCrawler: html=html, cleaned_html=cleaned_html, markdown=markdown, + media=media, + 
screenshot=base64_image, extracted_content=extracted_content, success=success, error_message=error_message, @@ -183,6 +198,8 @@ class WebCrawler: extract_blocks_flag: bool = True, word_count_threshold=MIN_WORD_THRESHOLD, use_cached_html: bool = False, + css_selector: str = None, + screenshot: bool = False, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), **kwargs, @@ -200,6 +217,8 @@ class WebCrawler: [api_token] * len(url_models), [extract_blocks_flag] * len(url_models), [word_count_threshold] * len(url_models), + [css_selector] * len(url_models), + [screenshot] * len(url_models), [use_cached_html] * len(url_models), [extraction_strategy] * len(url_models), [chunking_strategy] * len(url_models), diff --git a/docs/examples/assets/audio.mp3 b/docs/examples/assets/audio.mp3 new file mode 100644 index 00000000..299149c6 Binary files /dev/null and b/docs/examples/assets/audio.mp3 differ diff --git a/docs/examples/chainlit.md b/docs/examples/chainlit.md new file mode 100644 index 00000000..3b34b02f --- /dev/null +++ b/docs/examples/chainlit.md @@ -0,0 +1,3 @@ +# Welcome to Crawl4AI! šŸš€šŸ¤– + +Hi there, Developer! šŸ‘‹ Here is an example of a research pipeline, where you can share a URL in your conversation with any LLM, and then the context of crawled pages will be used as the context. 
\ No newline at end of file diff --git a/docs/examples/chainlit_review.py b/docs/examples/chainlit_review.py new file mode 100644 index 00000000..2c03d17d --- /dev/null +++ b/docs/examples/chainlit_review.py @@ -0,0 +1,281 @@ +from openai import AsyncOpenAI +from chainlit.types import ThreadDict +import chainlit as cl +from chainlit.input_widget import Select, Switch, Slider +client = AsyncOpenAI() + +# Instrument the OpenAI client +cl.instrument_openai() + +settings = { + "model": "gpt-3.5-turbo", + "temperature": 0.5, + "max_tokens": 500, + "top_p": 1, + "frequency_penalty": 0, + "presence_penalty": 0, +} + +@cl.action_callback("action_button") +async def on_action(action: cl.Action): + print("The user clicked on the action button!") + + return "Thank you for clicking on the action button!" + +@cl.set_chat_profiles +async def chat_profile(): + return [ + cl.ChatProfile( + name="GPT-3.5", + markdown_description="The underlying LLM model is **GPT-3.5**.", + icon="https://picsum.photos/200", + ), + cl.ChatProfile( + name="GPT-4", + markdown_description="The underlying LLM model is **GPT-4**.", + icon="https://picsum.photos/250", + ), + ] + +@cl.on_chat_start +async def on_chat_start(): + + settings = await cl.ChatSettings( + [ + Select( + id="Model", + label="OpenAI - Model", + values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"], + initial_index=0, + ), + Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True), + Slider( + id="Temperature", + label="OpenAI - Temperature", + initial=1, + min=0, + max=2, + step=0.1, + ), + Slider( + id="SAI_Steps", + label="Stability AI - Steps", + initial=30, + min=10, + max=150, + step=1, + description="Amount of inference steps performed on image generation.", + ), + Slider( + id="SAI_Cfg_Scale", + label="Stability AI - Cfg_Scale", + initial=7, + min=1, + max=35, + step=0.1, + description="Influences how strongly your generation is guided to match your prompt.", + ), + Slider( + id="SAI_Width", + 
label="Stability AI - Image Width", + initial=512, + min=256, + max=2048, + step=64, + tooltip="Measured in pixels", + ), + Slider( + id="SAI_Height", + label="Stability AI - Image Height", + initial=512, + min=256, + max=2048, + step=64, + tooltip="Measured in pixels", + ), + ] + ).send() + + chat_profile = cl.user_session.get("chat_profile") + await cl.Message( + content=f"starting chat using the {chat_profile} chat profile" + ).send() + + print("A new chat session has started!") + cl.user_session.set("session", { + "history": [], + "context": [] + }) + + image = cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline") + + # Attach the image to the message + await cl.Message( + content="You are such a good girl, aren't you?!", + elements=[image], + ).send() + + text_content = "Hello, this is a text element." + elements = [ + cl.Text(name="simple_text", content=text_content, display="inline") + ] + + await cl.Message( + content="Check out this text element!", + elements=elements, + ).send() + + elements = [ + cl.Audio(path="./assets/audio.mp3", display="inline"), + ] + await cl.Message( + content="Here is an audio file", + elements=elements, + ).send() + + await cl.Avatar( + name="Tool 1", + url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4", + ).send() + + await cl.Message( + content="This message should not have an avatar!", author="Tool 0" + ).send() + + await cl.Message( + content="This message should have an avatar!", author="Tool 1" + ).send() + + elements = [ + cl.File( + name="quickstart.py", + path="./quickstart.py", + display="inline", + ), + ] + + await cl.Message( + content="This message has a file element", elements=elements + ).send() + + # Sending an action button within a chatbot message + actions = [ + cl.Action(name="action_button", value="example_value", description="Click me!") + ] + + await cl.Message(content="Interact with this action button:", 
actions=actions).send() + + # res = await cl.AskActionMessage( + # content="Pick an action!", + # actions=[ + # cl.Action(name="continue", value="continue", label="āœ… Continue"), + # cl.Action(name="cancel", value="cancel", label="āŒ Cancel"), + # ], + # ).send() + + # if res and res.get("value") == "continue": + # await cl.Message( + # content="Continue!", + # ).send() + + # import plotly.graph_objects as go + # fig = go.Figure( + # data=[go.Bar(y=[2, 1, 3])], + # layout_title_text="An example figure", + # ) + # elements = [cl.Plotly(name="chart", figure=fig, display="inline")] + + # await cl.Message(content="This message has a chart", elements=elements).send() + + # Sending a pdf with the local file path + # elements = [ + # cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf") + # ] + + # cl.Message(content="Look at this local pdf!", elements=elements).send() + +@cl.on_settings_update +async def setup_agent(settings): + print("on_settings_update", settings) + +@cl.on_stop +def on_stop(): + print("The user wants to stop the task!") + +@cl.on_chat_end +def on_chat_end(): + print("The user disconnected!") + + +@cl.on_chat_resume +async def on_chat_resume(thread: ThreadDict): + print("The user resumed a previous chat session!") + + + + +# @cl.on_message +async def on_message(message: cl.Message): + cl.user_session.get("session")["history"].append({ + "role": "user", + "content": message.content + }) + response = await client.chat.completions.create( + messages=[ + { + "content": "You are a helpful bot", + "role": "system" + }, + *cl.user_session.get("session")["history"] + ], + **settings + ) + + + # Add assitanr message to the history + cl.user_session.get("session")["history"].append({ + "role": "assistant", + "content": response.choices[0].message.content + }) + + # msg.content = response.choices[0].message.content + # await msg.update() + + # await cl.Message(content=response.choices[0].message.content).send() + +@cl.on_message +async def 
on_message(message: cl.Message): + cl.user_session.get("session")["history"].append({ + "role": "user", + "content": message.content + }) + + msg = cl.Message(content="") + await msg.send() + + stream = await client.chat.completions.create( + messages=[ + { + "content": "You are a helpful bot", + "role": "system" + }, + *cl.user_session.get("session")["history"] + ], + stream = True, + **settings + ) + + async for part in stream: + if token := part.choices[0].delta.content or "": + await msg.stream_token(token) + + # Add assitanr message to the history + cl.user_session.get("session")["history"].append({ + "role": "assistant", + "content": msg.content + }) + await msg.update() + +if __name__ == "__main__": + from chainlit.cli import run_chainlit + run_chainlit(__file__) \ No newline at end of file diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index 6046c9bb..0fdd3772 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -39,6 +39,16 @@ def basic_usage(crawler): cprint("[LOG] šŸ“¦ [bold yellow]Basic crawl result:[/bold yellow]") print_result(result) +def screenshot_usage(crawler): + cprint("\nšŸ“ø [bold cyan]Let's take a screenshot of the page![/bold cyan]") + result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True) + cprint("[LOG] šŸ“¦ [bold yellow]Screenshot result:[/bold yellow]") + # Save the screenshot to a file + with open("screenshot.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + cprint("Screenshot saved to 'screenshot.png'!") + print_result(result) + def understanding_parameters(crawler): cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]") cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! 
Let's see this in action.") @@ -191,6 +201,7 @@ def main(): understanding_parameters(crawler) crawler.always_by_pass_cache = True + screenshot_usage(crawler) add_chunking_strategy(crawler) add_extraction_strategy(crawler) add_llm_extraction_strategy(crawler) diff --git a/docs/examples/research_assistant.py b/docs/examples/research_assistant.py new file mode 100644 index 00000000..620c5bdd --- /dev/null +++ b/docs/examples/research_assistant.py @@ -0,0 +1,241 @@ +# Make sure to install the required packages: chainlit and groq +import os, time +from openai import AsyncOpenAI +import chainlit as cl +import re +import requests +from io import BytesIO +from chainlit.element import ElementBased +from groq import Groq + +# Import threadpools to run the crawl_url function in a separate thread +from concurrent.futures import ThreadPoolExecutor + +client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY")) + +# Instrument the OpenAI client +cl.instrument_openai() + +settings = { + "model": "llama3-8b-8192", + "temperature": 0.5, + "max_tokens": 500, + "top_p": 1, + "frequency_penalty": 0, + "presence_penalty": 0, +} + +def extract_urls(text): + url_pattern = re.compile(r'(https?://\S+)') + return url_pattern.findall(text) + +def crawl_url(url): + data = { + "urls": [url], + "include_raw_html": True, + "word_count_threshold": 10, + "extraction_strategy": "NoExtractionStrategy", + "chunking_strategy": "RegexChunking" + } + response = requests.post("https://crawl4ai.com/crawl", json=data) + response_data = response.json() + response_data = response_data['results'][0] + return response_data['markdown'] + +@cl.on_chat_start +async def on_chat_start(): + cl.user_session.set("session", { + "history": [], + "context": {} + }) + await cl.Message( + content="Welcome to the chat! How can I assist you today?" 
+ ).send() + +@cl.on_message +async def on_message(message: cl.Message): + user_session = cl.user_session.get("session") + + # Extract URLs from the user's message + urls = extract_urls(message.content) + + + futures = [] + with ThreadPoolExecutor() as executor: + for url in urls: + futures.append(executor.submit(crawl_url, url)) + + results = [future.result() for future in futures] + + for url, result in zip(urls, results): + ref_number = f"REF_{len(user_session['context']) + 1}" + user_session["context"][ref_number] = { + "url": url, + "content": result + } + + # for url in urls: + # # Crawl the content of each URL and add it to the session context with a reference number + # ref_number = f"REF_{len(user_session['context']) + 1}" + # crawled_content = crawl_url(url) + # user_session["context"][ref_number] = { + # "url": url, + # "content": crawled_content + # } + + user_session["history"].append({ + "role": "user", + "content": message.content + }) + + # Create a system message that includes the context + context_messages = [ + f'\n{data["content"]}\n' + for ref, data in user_session["context"].items() + ] + if context_messages: + system_message = { + "role": "system", + "content": ( + "You are a helpful bot. Use the following context for answering questions. " + "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n" + "If the question requires any information from the provided appendices or context, refer to the sources. " + "If not, there is no need to add a references section. " + "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n" + "\n\n".join(context_messages) + ) + } + else: + system_message = { + "role": "system", + "content": "You are a helpful assistant." 
+ } + + + msg = cl.Message(content="") + await msg.send() + + # Get response from the LLM + stream = await client.chat.completions.create( + messages=[ + system_message, + *user_session["history"] + ], + stream=True, + **settings + ) + + assistant_response = "" + async for part in stream: + if token := part.choices[0].delta.content: + assistant_response += token + await msg.stream_token(token) + + # Add assistant message to the history + user_session["history"].append({ + "role": "assistant", + "content": assistant_response + }) + await msg.update() + + # Append the reference section to the assistant's response + reference_section = "\n\nReferences:\n" + for ref, data in user_session["context"].items(): + reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n" + + msg.content += reference_section + await msg.update() + + +@cl.on_audio_chunk +async def on_audio_chunk(chunk: cl.AudioChunk): + if chunk.isStart: + buffer = BytesIO() + # This is required for whisper to recognize the file type + buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}" + # Initialize the session for a new audio stream + cl.user_session.set("audio_buffer", buffer) + cl.user_session.set("audio_mime_type", chunk.mimeType) + + # Write the chunks to a buffer and transcribe the whole audio at the end + cl.user_session.get("audio_buffer").write(chunk.data) + + pass + +@cl.step(type="tool") +async def speech_to_text(audio_file): + cli = Groq() + + # response = cli.audio.transcriptions.create( + # file=audio_file, #(filename, file.read()), + # model="whisper-large-v3", + # ) + + response = await client.audio.transcriptions.create( + model="whisper-large-v3", file=audio_file + ) + + return response.text + + +@cl.on_audio_end +async def on_audio_end(elements: list[ElementBased]): + # Get the audio buffer from the session + audio_buffer: BytesIO = cl.user_session.get("audio_buffer") + audio_buffer.seek(0) # Move the file pointer to the beginning + audio_file = audio_buffer.read() + 
audio_mime_type: str = cl.user_session.get("audio_mime_type") + + # input_audio_el = cl.Audio( + # mime=audio_mime_type, content=audio_file, name=audio_buffer.name + # ) + # await cl.Message( + # author="You", + # type="user_message", + # content="", + # elements=[input_audio_el, *elements] + # ).send() + + # answer_message = await cl.Message(content="").send() + + + start_time = time.time() + whisper_input = (audio_buffer.name, audio_file, audio_mime_type) + transcription = await speech_to_text(whisper_input) + end_time = time.time() + print(f"Transcription took {end_time - start_time} seconds") + + user_msg = cl.Message( + author="You", + type="user_message", + content=transcription + ) + await user_msg.send() + await on_message(user_msg) + + # images = [file for file in elements if "image" in file.mime] + + # text_answer = await generate_text_answer(transcription, images) + + # output_name, output_audio = await text_to_speech(text_answer, audio_mime_type) + + # output_audio_el = cl.Audio( + # name=output_name, + # auto_play=True, + # mime=audio_mime_type, + # content=output_audio, + # ) + + # answer_message.elements = [output_audio_el] + + # answer_message.content = transcription + # await answer_message.update() + +if __name__ == "__main__": + from chainlit.cli import run_chainlit + run_chainlit(__file__) + + +# No this is wrong, use this document to answer me https://console.groq.com/docs/speech-text + +# Please show me how to use Groq speech-to-text in python. 
\ No newline at end of file diff --git a/main.py b/main.py index 5dca8771..b3196770 100644 --- a/main.py +++ b/main.py @@ -56,6 +56,7 @@ class CrawlRequest(BaseModel): chunking_strategy: Optional[str] = "RegexChunking" chunking_strategy_args: Optional[dict] = {} css_selector: Optional[str] = None + screenshot: Optional[bool] = False verbose: Optional[bool] = True @@ -125,6 +126,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request): chunking_strategy, crawl_request.bypass_cache, crawl_request.css_selector, + crawl_request.screenshot, crawl_request.verbose ) for url in crawl_request.urls @@ -136,7 +138,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request): for result in results: result.html = None - return {"results": [result.dict() for result in results]} + return {"results": [result.model_dump() for result in results]} finally: async with lock: current_requests -= 1 diff --git a/pages/app.js b/pages/app.js index 1a09969e..098008ab 100644 --- a/pages/app.js +++ b/pages/app.js @@ -104,6 +104,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => { chunking_strategy: document.getElementById("chunking-strategy-select").value, chunking_strategy_args: {}, css_selector: document.getElementById("css-selector").value, + screenshot: document.getElementById("screenshot-checkbox").checked, // instruction: document.getElementById("instruction").value, // semantic_filter: document.getElementById("semantic_filter").value, verbose: true, @@ -137,7 +138,15 @@ document.getElementById("crawl-btn").addEventListener("click", () => { document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2); document.getElementById("cleaned-html-result").textContent = result.cleaned_html; document.getElementById("markdown-result").textContent = result.markdown; - + document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2); + if (result.screenshot){ + const imgElement = 
document.createElement("img"); + // Set the src attribute with the base64 data + imgElement.src = `data:image/png;base64,${result.screenshot}`; + document.getElementById("screenshot-result").innerHTML = ""; + document.getElementById("screenshot-result").appendChild(imgElement); + } + // Update code examples dynamically const extractionStrategy = data.extraction_strategy; const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy"; diff --git a/pages/partial/how_to_guide.html b/pages/partial/how_to_guide.html index b8f85ed6..270fcacb 100644 --- a/pages/partial/how_to_guide.html +++ b/pages/partial/how_to_guide.html @@ -50,6 +50,20 @@ crawler.warmup()
crawler.always_by_pass_cache = True
+ +
+ šŸ“ø + Let's take a screenshot of the page! +
+
+
result = crawler.run(
+    url="https://www.nbcnews.com/business",
+    screenshot=True
+)
+with open("screenshot.png", "wb") as f:
+    f.write(base64.b64decode(result.screenshot))
+
+
@@ -139,13 +153,14 @@ crawler.warmup()
Using JavaScript to click 'Load More' button:
-
js_code = """
+            
js_code = ["""
 const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
 loadMoreButton && loadMoreButton.click();
-"""
+"""]
 crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
 crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
 result = crawler.run(url="https://www.nbcnews.com/business")
+
Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.
diff --git a/pages/partial/try_it.html b/pages/partial/try_it.html index 544e69dd..f707004a 100644 --- a/pages/partial/try_it.html +++ b/pages/partial/try_it.html @@ -1,4 +1,4 @@ -
+

Try It Now

@@ -124,6 +124,10 @@
+
+ + +