From 659c8cd95303f18fa031f99338ec85a0c3100083 Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 2 Aug 2024 15:55:32 +0800 Subject: [PATCH] refactor: Update image description minimum word threshold in get_content_of_website_optimized --- CONTRIBUTORS.md | 31 +++++++++++++++++++ README.md | 16 +++++++++- .../examples/llm_extraction_openai_pricing.py | 3 +- docs/md/changelog.md | 19 ++++++++++++ docs/md/index.md | 2 +- docs/md/installation.md | 2 ++ docs/md/introduction.md | 12 ------- setup.py | 2 +- 8 files changed, 71 insertions(+), 16 deletions(-) create mode 100644 CONTRIBUTORS.md diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 00000000..0e45ca85 --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,31 @@ +# Contributors to Crawl4AI + +We would like to thank the following people for their contributions to Crawl4AI: + +## Core Team + +- [Unclecode](https://github.com/unclecode) - Project Creator and Main Developer +- [Nasrin](https://github.com/ntohidi) - Project Manager and Developer + +## Community Contributors + +- [Aravind Karnam](https://github.com/aravindkarnam) - Developed textual description extraction feature +- [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors +- [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies + +## Other Contributors + +- [Gokhan](https://github.com/gkhngyk) +- [Shiv Kumar](https://github.com/shivkumar0757) +- [QIN2DIM](https://github.com/QIN2DIM) + + +## Acknowledgements + +We also want to thank all the users who have reported bugs, suggested features, or helped in any other way to make Crawl4AI better. + +--- + +If you've contributed to Crawl4AI and your name isn't on this list, please [open a pull request](https://github.com/unclecode/crawl4ai/pulls) with your name, link, and contribution, and we'll review it promptly. + +Thank you all for your contributions! 
\ No newline at end of file diff --git a/README.md b/README.md index 39f70ef8..20cca5f3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.75 🕷️🤖 +# Crawl4AI v0.2.76 🕷️🤖 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) @@ -10,6 +10,8 @@ Crawl4AI simplifies web crawling and data extraction, making it accessible for l ## Try it Now! +✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sJPAmeLj5PMrg2VgOwMJ2ubGIcK0cJeX?usp=sharing) + ✨ visit our [Documentation Website](https://crawl4ai.com/mkdocs/) ✨ Check [Demo](https://crawl4ai.com/mkdocs/demo) @@ -31,6 +33,18 @@ Crawl4AI simplifies web crawling and data extraction, making it accessible for l - 🎯 CSS selector support - 📝 Passes instructions/keywords to refine extraction +# Crawl4AI + +## 🌟 Shoutout to Contributors of v0.2.76! + +A big thank you to the amazing contributors who've made this release possible: + +- [@aravindkarnam](https://github.com/aravindkarnam) for the new image description feature +- [@FractalMind](https://github.com/FractalMind) for our official Docker Hub image +- [@ketonkss4](https://github.com/ketonkss4) for helping streamline our Selenium setup + +Your contributions are driving Crawl4AI forward! 
🚀 + ## Cool Examples 🚀 ### Quick Start diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py index 9330ad31..d05a1b6b 100644 --- a/docs/examples/llm_extraction_openai_pricing.py +++ b/docs/examples/llm_extraction_openai_pricing.py @@ -21,7 +21,8 @@ result = crawler.run( url=url, word_count_threshold=1, extraction_strategy= LLMExtractionStrategy( - provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), + # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), + provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'), schema=OpenAIModelFee.model_json_schema(), extraction_type="schema", instruction="From the crawled content, extract all mentioned model names along with their "\ diff --git a/docs/md/changelog.md b/docs/md/changelog.md index c9d78140..301badfd 100644 --- a/docs/md/changelog.md +++ b/docs/md/changelog.md @@ -1,5 +1,24 @@ # Changelog +# Changelog + +## [v0.2.76] - 2024-08-02 + +Major improvements in functionality, performance, and cross-platform compatibility! 🚀 + +- 🐳 **Docker enhancements**: Significantly improved Dockerfile for easy installation on Linux, Mac, and Windows. +- 🌐 **Official Docker Hub image**: Launched our first official image on Docker Hub for streamlined deployment. +- 🔧 **Selenium upgrade**: Removed dependency on ChromeDriver, now using Selenium's built-in capabilities for better compatibility. +- 🖼️ **Image description**: Implemented ability to generate textual descriptions for extracted images from web pages. +- ⚡ **Performance boost**: Various improvements to enhance overall speed and performance. + +A big shoutout to our amazing community contributors: +- [@aravindkarnam](https://github.com/aravindkarnam) for developing the textual description extraction feature. +- [@FractalMind](https://github.com/FractalMind) for creating the first official Docker Hub image and fixing Dockerfile errors. 
+- [@ketonkss4](https://github.com/ketonkss4) for identifying Selenium's new capabilities, helping us reduce dependencies. + +Your contributions are driving Crawl4AI forward! 🙌 + ## [v0.2.75] - 2024-07-19 Minor improvements for a more maintainable codebase: diff --git a/docs/md/index.md b/docs/md/index.md index f0b69c6f..8c4abb48 100644 --- a/docs/md/index.md +++ b/docs/md/index.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.75 +# Crawl4AI v0.2.76 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI. diff --git a/docs/md/installation.md b/docs/md/installation.md index b6d9dc4b..8ff0ba1f 100644 --- a/docs/md/installation.md +++ b/docs/md/installation.md @@ -8,6 +8,8 @@ There are three ways to use Crawl4AI: ## Option 1: Library Installation +You can try this Colab for a quick start: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sJPAmeLj5PMrg2VgOwMJ2ubGIcK0cJeX#scrollTo=g1RrmI4W_rPk) + Crawl4AI offers flexible installation options to suit various use cases. Choose the option that best fits your needs: - **Default Installation** (Basic functionality): diff --git a/docs/md/introduction.md b/docs/md/introduction.md index d4a13081..6d1ad56b 100644 --- a/docs/md/introduction.md +++ b/docs/md/introduction.md @@ -20,18 +20,6 @@ Crawl4AI is designed to simplify the process of crawling web pages and extractin - **🎯 CSS Selector Support**: Extract specific content using CSS selectors. - **📝 Instruction/Keyword Refinement**: Pass instructions or keywords to refine the extraction process. -## Recent Changes (v0.2.5) 🌟 - -- **New Hooks**: Added six important hooks to the crawler: - - 🟢 `on_driver_created`: Called when the driver is ready for initializations. 
- - 🔵 `before_get_url`: Called right before Selenium fetches the URL. - - 🟣 `after_get_url`: Called after Selenium fetches the URL. - - 🟠 `before_return_html`: Called when the data is parsed and ready. - - 🟡 `on_user_agent_updated`: Called when the user changes the user agent, causing the driver to reinitialize. -- **New Example**: Added an example in [`quickstart.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py) in the example folder under the docs. -- **Improved Semantic Context**: Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness. -- **Dockerfile Update**: Updated Dockerfile to ensure compatibility across multiple platforms. - Check the [Changelog](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md) for more details. ## Power and Simplicity of Crawl4AI 🚀 diff --git a/setup.py b/setup.py index bee477d7..5918c721 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran setup( name="Crawl4AI", - version="0.2.74", + version="0.2.76", description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown",