From 853b9d59d88f2eb403df2fecc9cfeec4c7e62261 Mon Sep 17 00:00:00 2001 From: unclecode Date: Tue, 18 Jun 2024 20:00:51 +0800 Subject: [PATCH] feat: Add hooks for enhanced control over Selenium drivers - Added five hooks: on_driver_created, before_get_url, after_get_url, before_return_html, on_user_agent_updated. - Included example usage in quickstart.py. - Updated README and changelog. --- CHANGELOG.md | 12 +++++++++++- README.md | 10 ++++++++++ crawl4ai/crawler_strategy.py | 4 +++- pages/index.html | 2 +- setup.py | 2 +- 5 files changed, 26 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 793353b7..eb854b1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Changelog +## [0.2.5] - 2024-06-18 +### Added +- Added five important hooks to the crawler: + - on_driver_created: Called when the driver is ready for initializations. + - before_get_url: Called right before Selenium fetches the URL. + - after_get_url: Called after Selenium fetches the URL. + - before_return_html: Called when the data is parsed and ready. + - on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize. +- Added an example in `quickstart.py` in the example folder under the docs. + ## [0.2.4] - 2024-06-17 ### Fixed -- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs \ No newline at end of file +- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs diff --git a/README.md b/README.md index ab4cf3f6..795b8f36 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,16 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information ## Recent Changes +### v0.2.5 +- 🌟 Added five important hooks to the crawler: + - 🟢 on_driver_created: Called when the driver is ready for initializations. + - 🔵 before_get_url: Called right before Selenium fetches the URL. + - 🟣 after_get_url: Called after Selenium fetches the URL. 
+ - 🟠 before_return_html: Called when the data is parsed and ready. + - 🟡 on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize. +- 📄 Added an example in `quickstart.py` in the example folder under the docs. + + ### v0.2.4 - 🐞 Resolve the issue with the long url. (Issue #22) diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 8cadd75c..ecf0863a 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -104,6 +104,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): # Hooks self.hooks = { 'on_driver_created': None, + 'on_user_agent_updated': None, 'before_get_url': None, 'after_get_url': None, 'before_return_html': None @@ -114,6 +115,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): self.service = Service(chromedriver_autoinstaller.install()) self.service.log_path = "NUL" self.driver = webdriver.Chrome(service=self.service, options=self.options) + self.driver = self.execute_hook('on_driver_created', self.driver) def set_hook(self, hook_type: str, hook: Callable): if hook_type in self.hooks: @@ -137,7 +139,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): self.options.add_argument(f"user-agent={user_agent}") self.driver.quit() self.driver = webdriver.Chrome(service=self.service, options=self.options) - self.driver = self.execute_hook('on_driver_created', self.driver) + self.driver = self.execute_hook('on_user_agent_updated', self.driver) def set_custom_headers(self, headers: dict): # Enable Network domain for sending headers diff --git a/pages/index.html b/pages/index.html index fa352090..c9e2b54f 100644 --- a/pages/index.html +++ b/pages/index.html @@ -25,7 +25,7 @@
-

🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.4

+

🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.5

📊 Total Website Processed diff --git a/setup.py b/setup.py index 168dfac6..2d05e206 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ class CustomInstallCommand(install): setup( name="Crawl4AI", - version="0.2.4", + version="0.2.5", description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper", long_description=open("README.md").read(), long_description_content_type="text/markdown",