feat: Add delay after fetching URL in crawler hooks
This commit adds a 5-second delay after the URL is fetched, registered on the `after_get_url` hook of the crawler strategy. The delay is implemented with the `time.sleep()` function. It gives the page extra time to finish loading before the crawler proceeds with further actions.
This commit is contained in:
@@ -249,15 +249,40 @@ def using_crawler_hooks(crawler):
|
|||||||
|
|
||||||
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||||
|
|
||||||
crawler.set_hook('on_driver_created', on_driver_created)
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
crawler.set_hook('before_get_url', before_get_url)
|
crawler_strategy.set_hook('on_driver_created', on_driver_created)
|
||||||
crawler.set_hook('after_get_url', after_get_url)
|
crawler_strategy.set_hook('before_get_url', before_get_url)
|
||||||
crawler.set_hook('before_return_html', before_return_html)
|
crawler_strategy.set_hook('after_get_url', after_get_url)
|
||||||
|
crawler_strategy.set_hook('before_return_html', before_return_html)
|
||||||
|
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
|
crawler.warmup()
|
||||||
result = crawler.run(url="https://example.com")
|
result = crawler.run(url="https://example.com")
|
||||||
|
|
||||||
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||||
print_result(result= result)
|
print_result(result= result)
|
||||||
|
|
||||||
|
def using_crawler_hooks_dleay_example(crawler):
|
||||||
|
def delay(driver):
|
||||||
|
print("Delaying for 5 seconds...")
|
||||||
|
time.sleep(5)
|
||||||
|
print("Resuming...")
|
||||||
|
|
||||||
|
def create_crawler():
|
||||||
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
|
crawler_strategy.set_hook('after_get_url', delay)
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
|
crawler.warmup()
|
||||||
|
return crawler
|
||||||
|
|
||||||
|
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")
|
||||||
|
crawler = create_crawler()
|
||||||
|
result = crawler.run(url="https://google.com", bypass_cache=True)
|
||||||
|
|
||||||
|
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||||
|
print_result(result)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
||||||
|
|||||||
@@ -14,6 +14,9 @@ Let's see how we can customize the crawler using hooks! In this example, we'll:
|
|||||||
### Hook Definitions
|
### Hook Definitions
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
|
from crawl4ai.crawler_strategy import *
|
||||||
|
|
||||||
def on_driver_created(driver):
|
def on_driver_created(driver):
|
||||||
print("[HOOK] on_driver_created")
|
print("[HOOK] on_driver_created")
|
||||||
# Example customization: maximize the window
|
# Example customization: maximize the window
|
||||||
@@ -66,12 +69,13 @@ def before_return_html(driver, html):
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||||
crawler = WebCrawler(verbose=True)
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
|
crawler_strategy.set_hook('on_driver_created', on_driver_created)
|
||||||
|
crawler_strategy.set_hook('before_get_url', before_get_url)
|
||||||
|
crawler_strategy.set_hook('after_get_url', after_get_url)
|
||||||
|
crawler_strategy.set_hook('before_return_html', before_return_html)
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
crawler.warmup()
|
crawler.warmup()
|
||||||
crawler.set_hook('on_driver_created', on_driver_created)
|
|
||||||
crawler.set_hook('before_get_url', before_get_url)
|
|
||||||
crawler.set_hook('after_get_url', after_get_url)
|
|
||||||
crawler.set_hook('before_return_html', before_return_html)
|
|
||||||
|
|
||||||
result = crawler.run(url="https://example.com")
|
result = crawler.run(url="https://example.com")
|
||||||
|
|
||||||
|
|||||||
@@ -176,41 +176,29 @@ print(f"JavaScript Code (Load More button) result: {result}")
|
|||||||
Let's see how we can customize the crawler using hooks!
|
Let's see how we can customize the crawler using hooks!
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def on_driver_created(driver):
|
import time
|
||||||
print("[HOOK] on_driver_created")
|
|
||||||
driver.maximize_window()
|
|
||||||
driver.get('https://example.com/login')
|
|
||||||
driver.find_element(By.NAME, 'username').send_keys('testuser')
|
|
||||||
driver.find_element(By.NAME, 'password').send_keys('password123')
|
|
||||||
driver.find_element(By.NAME, 'login').click()
|
|
||||||
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
|
|
||||||
return driver
|
|
||||||
|
|
||||||
def before_get_url(driver):
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
print("[HOOK] before_get_url")
|
from crawl4ai.crawler_strategy import *
|
||||||
driver.execute_cdp_cmd('Network.enable', {})
|
|
||||||
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
|
|
||||||
return driver
|
|
||||||
|
|
||||||
def after_get_url(driver):
|
def delay(driver):
|
||||||
print("[HOOK] after_get_url")
|
print("Delaying for 5 seconds...")
|
||||||
print(driver.current_url)
|
time.sleep(5)
|
||||||
return driver
|
print("Resuming...")
|
||||||
|
|
||||||
|
def create_crawler():
|
||||||
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
|
crawler_strategy.set_hook('after_get_url', delay)
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
|
crawler.warmup()
|
||||||
|
return crawler
|
||||||
|
|
||||||
def before_return_html(driver, html):
|
crawler = create_crawler()
|
||||||
print("[HOOK] before_return_html")
|
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
|
||||||
print(len(html))
|
|
||||||
return driver
|
|
||||||
|
|
||||||
crawler.set_hook('on_driver_created', on_driver_created)
|
|
||||||
crawler.set_hook('before_get_url', before_get_url)
|
|
||||||
crawler.set_hook('after_get_url', after_get_url)
|
|
||||||
crawler.set_hook('before_return_html', before_return_html)
|
|
||||||
|
|
||||||
result = crawler.run(url="https://example.com")
|
|
||||||
print(f"Crawler Hooks result: {result}")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Check [Hooks](examples/hooks_auth.md) for more examples.
|
||||||
|
|
||||||
## Congratulations! 🎉
|
## Congratulations! 🎉
|
||||||
|
|
||||||
You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️
|
You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️
|
||||||
|
|||||||
Reference in New Issue
Block a user