From 7f41ff4a747a53991a4ce6d82006c00f3cdef4d7 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sat, 6 Jul 2024 14:28:01 +0800 Subject: [PATCH] The `after_get_url` hook is executed after getting the URL, allowing for further customization. --- crawl4ai/crawler_strategy.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 65afbe4f..168d21df 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -215,11 +215,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): WebDriverWait(self.driver, 10).until( EC.presence_of_all_elements_located((By.TAG_NAME, "body")) ) + self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") - html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source + + self.driver = self.execute_hook('after_get_url', self.driver) + html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source can_not_be_done_headless = False # Look at my creativity for naming variables - # TODO: Very ugly way for now but it works + # TODO: Very ugly approach, but promise to change it! if kwargs.get('bypass_headless', False) or html == "": print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...") can_not_be_done_headless = True @@ -229,11 +232,10 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): options.add_argument("--window-size=5,5") driver = webdriver.Chrome(service=self.service, options=options) driver.get(url) + self.driver = self.execute_hook('after_get_url', driver) html = sanitize_input_encode(driver.page_source) driver.quit() - self.driver = self.execute_hook('after_get_url', self.driver) - # Execute JS code if provided if self.js_code and type(self.js_code) == str: self.driver.execute_script(self.js_code)