## [v0.2.74] - 2024-07-08

A slew of exciting updates to improve the crawler's stability and robustness! 🎉 - 💻 **UTF encoding fix**: Resolved the Windows "charmap" error by adding UTF-8 encoding. - 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy. - 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy. - 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
2024-07-08 16:33:25 +08:00
parent 3ff2a0d0e7
commit 4d283ab386
18 changed files with 142 additions and 77 deletions
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -36,5 +36,5 @@ model_fees = json.loads(result.extracted_content)

 print(len(model_fees))

-with open(".data/data.json", "w") as f:
+with open(".data/data.json", "w", encoding="utf-8") as f:
    f.write(result.extracted_content)
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -249,15 +249,40 @@ def using_crawler_hooks(crawler):
    
    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
    
-    crawler.set_hook('on_driver_created', on_driver_created)
-    crawler.set_hook('before_get_url', before_get_url)
-    crawler.set_hook('after_get_url', after_get_url)
-    crawler.set_hook('before_return_html', before_return_html)
+    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
+    crawler_strategy.set_hook('on_driver_created', on_driver_created)
+    crawler_strategy.set_hook('before_get_url', before_get_url)
+    crawler_strategy.set_hook('after_get_url', after_get_url)
+    crawler_strategy.set_hook('before_return_html', before_return_html)
    
+    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
+    crawler.warmup()    
    result = crawler.run(url="https://example.com")
    
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result= result)
+    
+def using_crawler_hooks_dleay_example(crawler):
+    def delay(driver):
+        print("Delaying for 5 seconds...")
+        time.sleep(5)
+        print("Resuming...")
+        
+    def create_crawler():
+        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
+        crawler_strategy.set_hook('after_get_url', delay)
+        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
+        crawler.warmup()
+        return crawler
+
+    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")
+    crawler = create_crawler()
+    result = crawler.run(url="https://google.com", bypass_cache=True)    
+    
+    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
+    print_result(result)
+    
+    

 def main():
    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
--- a/docs/examples/summarize_page.py
+++ b/docs/examples/summarize_page.py
@@ -42,5 +42,5 @@ page_summary = json.loads(result.extracted_content)

 print(page_summary)

-with open(".data/page_summary.json", "w") as f:
+with open(".data/page_summary.json", "w", encoding="utf-8") as f:
    f.write(result.extracted_content)
--- a/docs/md/changelog.md
+++ b/docs/md/changelog.md
@@ -1,5 +1,13 @@
 # Changelog

+## [v0.2.74] - 2024-07-08
+A slew of exciting updates to improve the crawler's stability and robustness! 🎉
+
+- 💻 **UTF encoding fix**: Resolved the Windows "charmap" error by adding UTF-8 encoding.
+- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
+- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
+- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
+
 ## [v0.2.73] - 2024-07-03

 💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
--- a/docs/md/examples/hooks_auth.md
+++ b/docs/md/examples/hooks_auth.md
@@ -14,6 +14,9 @@ Let's see how we can customize the crawler using hooks! In this example, we'll:
 ### Hook Definitions

 ```python
+from crawl4ai.web_crawler import WebCrawler
+from crawl4ai.crawler_strategy import *
+
 def on_driver_created(driver):
    print("[HOOK] on_driver_created")
    # Example customization: maximize the window
@@ -66,12 +69,13 @@ def before_return_html(driver, html):

 ```python
 print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
-crawler = WebCrawler(verbose=True)
+crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
+crawler_strategy.set_hook('on_driver_created', on_driver_created)
+crawler_strategy.set_hook('before_get_url', before_get_url)
+crawler_strategy.set_hook('after_get_url', after_get_url)
+crawler_strategy.set_hook('before_return_html', before_return_html)
+crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
 crawler.warmup()
-crawler.set_hook('on_driver_created', on_driver_created)
-crawler.set_hook('before_get_url', before_get_url)
-crawler.set_hook('after_get_url', after_get_url)
-crawler.set_hook('before_return_html', before_return_html)

 result = crawler.run(url="https://example.com")

--- a/docs/md/examples/llm_extraction.md
+++ b/docs/md/examples/llm_extraction.md
@@ -45,7 +45,7 @@ model_fees = json.loads(result.extracted_content)

 print(len(model_fees))

-with open(".data/data.json", "w") as f:
+with open(".data/data.json", "w", encoding="utf-8") as f:
    f.write(result.extracted_content)
 ```

@@ -71,7 +71,7 @@ model_fees = json.loads(result.extracted_content)

 print(len(model_fees))

-with open(".data/data.json", "w") as f:
+with open(".data/data.json", "w", encoding="utf-8") as f:
    f.write(result.extracted_content)
 ```

--- a/docs/md/examples/summarization.md
+++ b/docs/md/examples/summarization.md
@@ -91,7 +91,7 @@ This example demonstrates how to use `Crawl4AI` to extract a summary from a web
    Save the extracted data to a file for further use.

    ```python
-    with open(".data/page_summary.json", "w") as f:
+    with open(".data/page_summary.json", "w", encoding="utf-8") as f:
        f.write(result.extracted_content)
    ```

--- a/docs/md/index.md
+++ b/docs/md/index.md
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.73
+# Crawl4AI v0.2.74

 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.

--- a/docs/md/quickstart.md
+++ b/docs/md/quickstart.md
@@ -176,41 +176,29 @@ print(f"JavaScript Code (Load More button) result: {result}")
 Let's see how we can customize the crawler using hooks!

 ```python
-def on_driver_created(driver):
-    print("[HOOK] on_driver_created")
-    driver.maximize_window()
-    driver.get('https://example.com/login')
-    driver.find_element(By.NAME, 'username').send_keys('testuser')
-    driver.find_element(By.NAME, 'password').send_keys('password123')
-    driver.find_element(By.NAME, 'login').click()
-    driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
-    return driver        
+import time

-def before_get_url(driver):
-    print("[HOOK] before_get_url")
-    driver.execute_cdp_cmd('Network.enable', {})
-    driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
-    return driver
+from crawl4ai.web_crawler import WebCrawler
+from crawl4ai.crawler_strategy import *

-def after_get_url(driver):
-    print("[HOOK] after_get_url")
-    print(driver.current_url)
-    return driver
+def delay(driver):
+    print("Delaying for 5 seconds...")
+    time.sleep(5)
+    print("Resuming...")
+    
+def create_crawler():
+    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
+    crawler_strategy.set_hook('after_get_url', delay)
+    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
+    crawler.warmup()
+    return crawler

-def before_return_html(driver, html):
-    print("[HOOK] before_return_html")
-    print(len(html))
-    return driver
-
-crawler.set_hook('on_driver_created', on_driver_created)
-crawler.set_hook('before_get_url', before_get_url)
-crawler.set_hook('after_get_url', after_get_url)
-crawler.set_hook('before_return_html', before_return_html)
-
-result = crawler.run(url="https://example.com")
-print(f"Crawler Hooks result: {result}")
+crawler = create_crawler()
+result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
 ```

+Check out the [Hooks](examples/hooks_auth.md) page for more examples.
+
 ## Congratulations! 🎉

 You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️