feat: Add hooks for enhanced control over Selenium drivers
- Added five hooks: on_driver_created, before_get_url, after_get_url, before_return_html, on_user_agent_updated. - Included example usage in quickstart.py. - Updated README and changelog.
This commit is contained in:
12
CHANGELOG.md
12
CHANGELOG.md
@@ -1,5 +1,15 @@
|
||||
# Changelog
|
||||
|
||||
## [0.2.5] - 2024-06-18
|
||||
### Added
|
||||
- Added five important hooks to the crawler:
|
||||
- on_driver_created: Called when the driver is ready for initializations.
|
||||
- before_get_url: Called right before Selenium fetches the URL.
|
||||
- after_get_url: Called after Selenium fetches the URL.
|
||||
- before_return_html: Called when the data is parsed and ready.
|
||||
- on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
|
||||
- Added an example in `quickstart.py` in the example folder under the docs.
|
||||
|
||||
## [0.2.4] - 2024-06-17
|
||||
### Fixed
|
||||
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
|
||||
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
|
||||
|
||||
10
README.md
10
README.md
@@ -13,6 +13,16 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
|
||||
|
||||
## Recent Changes
|
||||
|
||||
### v0.2.5
|
||||
- 🌟 Added five important hooks to the crawler:
|
||||
- 🟢 on_driver_created: Called when the driver is ready for initializations.
|
||||
- 🔵 before_get_url: Called right before Selenium fetches the URL.
|
||||
- 🟣 after_get_url: Called after Selenium fetches the URL.
|
||||
- 🟠 before_return_html: Called when the data is parsed and ready.
|
||||
- 🟡 on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
|
||||
- 📄 Added an example in `quickstart.py` in the example folder under the docs.
|
||||
|
||||
|
||||
### v0.2.4
|
||||
- 🐞 Resolve the issue with the long url. (Issue #22)
|
||||
|
||||
|
||||
@@ -104,6 +104,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
# Hooks
|
||||
self.hooks = {
|
||||
'on_driver_created': None,
|
||||
'on_user_agent_updated': None,
|
||||
'before_get_url': None,
|
||||
'after_get_url': None,
|
||||
'before_return_html': None
|
||||
@@ -114,6 +115,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
self.service = Service(chromedriver_autoinstaller.install())
|
||||
self.service.log_path = "NUL"
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
self.driver = self.execute_hook('on_driver_created', self.driver)
|
||||
|
||||
def set_hook(self, hook_type: str, hook: Callable):
|
||||
if hook_type in self.hooks:
|
||||
@@ -137,7 +139,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
self.options.add_argument(f"user-agent={user_agent}")
|
||||
self.driver.quit()
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
self.driver = self.execute_hook('on_driver_created', self.driver)
|
||||
self.driver = self.execute_hook('on_user_agent_updated', self.driver)
|
||||
|
||||
def set_custom_headers(self, headers: dict):
|
||||
# Enable Network domain for sending headers
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
<header class="bg-zinc-950 text-lime-500 py-4 flex">
|
||||
|
||||
<div class="mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.4</h1>
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.5</h1>
|
||||
</div>
|
||||
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
|
||||
<span>📊 Total Website Processed</span>
|
||||
|
||||
2
setup.py
2
setup.py
@@ -26,7 +26,7 @@ class CustomInstallCommand(install):
|
||||
|
||||
setup(
|
||||
name="Crawl4AI",
|
||||
version="0.2.4",
|
||||
version="0.2.5",
|
||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||
long_description=open("README.md").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
|
||||
Reference in New Issue
Block a user