From ea2f83ac1064219a16935f7c55eb4ad78cfe2121 Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 8 Jul 2024 15:59:59 +0800 Subject: [PATCH] feat: Add delay after fetching URL in crawler hooks This commit adds a delay of 5 seconds after fetching the URL in the `after_get_url` hook of the crawler hooks. The delay is implemented using the `time.sleep()` function. This change ensures that the entire page is fetched before proceeding with further actions. --- docs/examples/quickstart.py | 33 ++++++++++++++++++++--- docs/md/examples/hooks_auth.md | 14 ++++++---- docs/md/quickstart.md | 48 +++++++++++++--------------------- 3 files changed, 56 insertions(+), 39 deletions(-) diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index 24486cc1..89c63139 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -249,15 +249,40 @@ def using_crawler_hooks(crawler): cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True) - crawler.set_hook('on_driver_created', on_driver_created) - crawler.set_hook('before_get_url', before_get_url) - crawler.set_hook('after_get_url', after_get_url) - crawler.set_hook('before_return_html', before_return_html) + crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) + crawler_strategy.set_hook('on_driver_created', on_driver_created) + crawler_strategy.set_hook('before_get_url', before_get_url) + crawler_strategy.set_hook('after_get_url', after_get_url) + crawler_strategy.set_hook('before_return_html', before_return_html) + crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) + crawler.warmup() result = crawler.run(url="https://example.com") cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]") print_result(result= result) + +def using_crawler_hooks_dleay_example(crawler): + def delay(driver): + print("Delaying for 5 seconds...") + time.sleep(5) + print("Resuming...") + + def create_crawler(): +
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) + crawler_strategy.set_hook('after_get_url', delay) + crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) + crawler.warmup() + return crawler + + cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]") + crawler = create_crawler() + result = crawler.run(url="https://google.com", bypass_cache=True) + + cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]") + print_result(result) + + def main(): cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]") diff --git a/docs/md/examples/hooks_auth.md b/docs/md/examples/hooks_auth.md index 154300df..2b4c2701 100644 --- a/docs/md/examples/hooks_auth.md +++ b/docs/md/examples/hooks_auth.md @@ -14,6 +14,9 @@ Let's see how we can customize the crawler using hooks! In this example, we'll: ### Hook Definitions ```python +from crawl4ai.web_crawler import WebCrawler +from crawl4ai.crawler_strategy import * + def on_driver_created(driver): print("[HOOK] on_driver_created") # Example customization: maximize the window @@ -66,12 +69,13 @@ def before_return_html(driver, html): ```python print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True) -crawler = WebCrawler(verbose=True) +crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) +crawler_strategy.set_hook('on_driver_created', on_driver_created) +crawler_strategy.set_hook('before_get_url', before_get_url) +crawler_strategy.set_hook('after_get_url', after_get_url) +crawler_strategy.set_hook('before_return_html', before_return_html) +crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) crawler.warmup() -crawler.set_hook('on_driver_created', on_driver_created) -crawler.set_hook('before_get_url', before_get_url) -crawler.set_hook('after_get_url', after_get_url) 
-crawler.set_hook('before_return_html', before_return_html) result = crawler.run(url="https://example.com") diff --git a/docs/md/quickstart.md b/docs/md/quickstart.md index 9f5bdcd8..a0c1a2c7 100644 --- a/docs/md/quickstart.md +++ b/docs/md/quickstart.md @@ -176,41 +176,29 @@ print(f"JavaScript Code (Load More button) result: {result}") Let's see how we can customize the crawler using hooks! ```python -def on_driver_created(driver): - print("[HOOK] on_driver_created") - driver.maximize_window() - driver.get('https://example.com/login') - driver.find_element(By.NAME, 'username').send_keys('testuser') - driver.find_element(By.NAME, 'password').send_keys('password123') - driver.find_element(By.NAME, 'login').click() - driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'}) - return driver +import time -def before_get_url(driver): - print("[HOOK] before_get_url") - driver.execute_cdp_cmd('Network.enable', {}) - driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}}) - return driver +from crawl4ai.web_crawler import WebCrawler +from crawl4ai.crawler_strategy import * -def after_get_url(driver): - print("[HOOK] after_get_url") - print(driver.current_url) - return driver +def delay(driver): + print("Delaying for 5 seconds...") + time.sleep(5) + print("Resuming...") + +def create_crawler(): + crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) + crawler_strategy.set_hook('after_get_url', delay) + crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) + crawler.warmup() + return crawler -def before_return_html(driver, html): - print("[HOOK] before_return_html") - print(len(html)) - return driver - -crawler.set_hook('on_driver_created', on_driver_created) -crawler.set_hook('before_get_url', before_get_url) -crawler.set_hook('after_get_url', after_get_url) -crawler.set_hook('before_return_html', before_return_html) - -result = crawler.run(url="https://example.com") -print(f"Crawler Hooks result: 
{result}") +crawler = create_crawler() +result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True) ``` +check [Hooks](examples/hooks_auth.md) for more examples. + ## Congratulations! 🎉 You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️