From 853b9d59d88f2eb403df2fecc9cfeec4c7e62261 Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Tue, 18 Jun 2024 20:00:51 +0800
Subject: [PATCH] feat: Add hooks for enhanced control over Selenium drivers

- Added five hooks: on_driver_created, before_get_url, after_get_url, before_return_html, on_user_agent_updated.
- Included example usage in quickstart.py.
- Updated README and changelog.
---
 CHANGELOG.md                 | 12 +++++++++++-
 README.md                    | 10 ++++++++++
 crawl4ai/crawler_strategy.py |  4 +++-
 pages/index.html             |  2 +-
 setup.py                     |  2 +-
 5 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 793353b7..eb854b1d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # Changelog
 
+## [0.2.5] - 2024-06-18
+### Added
+- Added five important hooks to the crawler:
+  - on_driver_created: Called when the driver is ready for initializations.
+  - before_get_url: Called right before Selenium fetches the URL.
+  - after_get_url: Called after Selenium fetches the URL.
+  - before_return_html: Called when the data is parsed and ready.
+  - on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
+- Added an example in `quickstart.py` in the example folder under the docs.
+
 ## [0.2.4] - 2024-06-17
 ### Fixed
-- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
\ No newline at end of file
+- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
diff --git a/README.md b/README.md
index ab4cf3f6..795b8f36 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,16 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
 
 ## Recent Changes 
 
+### v0.2.5
+- 🌟 Added five important hooks to the crawler:
+  - 🟢 on_driver_created: Called when the driver is ready for initializations.
+  - 🔵 before_get_url: Called right before Selenium fetches the URL.
+  - 🟣 after_get_url: Called after Selenium fetches the URL.
+  - 🟠 before_return_html: Called when the data is parsed and ready.
+  - 🟡 on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
+- 📄 Added an example in `quickstart.py` in the example folder under the docs.
+
+
 ### v0.2.4
 - 🐞 Resolve the issue with the long url. (Issue #22)
 
diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index 8cadd75c..ecf0863a 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -104,6 +104,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         # Hooks
         self.hooks = {
             'on_driver_created': None,
+            'on_user_agent_updated': None,
             'before_get_url': None,
             'after_get_url': None,
             'before_return_html': None
@@ -114,6 +115,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.service = Service(chromedriver_autoinstaller.install())
         self.service.log_path = "NUL"
         self.driver = webdriver.Chrome(service=self.service, options=self.options)
+        self.driver = self.execute_hook('on_driver_created', self.driver)
 
     def set_hook(self, hook_type: str, hook: Callable):
         if hook_type in self.hooks:
@@ -137,7 +139,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.options.add_argument(f"user-agent={user_agent}")
         self.driver.quit()
         self.driver = webdriver.Chrome(service=self.service, options=self.options)
-        self.driver = self.execute_hook('on_driver_created', self.driver)
+        self.driver = self.execute_hook('on_user_agent_updated', self.driver)
 
     def set_custom_headers(self, headers: dict):
         # Enable Network domain for sending headers
diff --git a/pages/index.html b/pages/index.html
index fa352090..c9e2b54f 100644
--- a/pages/index.html
+++ b/pages/index.html
@@ -25,7 +25,7 @@
         <header class="bg-zinc-950 text-lime-500 py-4 flex">
             
             <div class="mx-auto px-4">
-                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.4</h1>
+                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.5</h1>
             </div>
             <div class="mx-auto px-4 flex font-bold text-xl gap-2">
                 <span>📊 Total Website Processed</span>
diff --git a/setup.py b/setup.py
index 168dfac6..2d05e206 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@ class CustomInstallCommand(install):
 
 setup(
     name="Crawl4AI",
-    version="0.2.4",
+    version="0.2.5",
     description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",