Merge branch 'main' of https://github.com/unclecode/crawl4ai

2024-06-18 12:02:29 +00:00
parent 77df6db453 480902bd66
commit 5d3fef45f7
9 changed files with 147 additions and 28 deletions
--- a/.files/screenshot.png
+++ b/.files/screenshot.png
--- a/.gitignore
+++ b/.gitignore
@@ -179,3 +179,6 @@ docs/examples/.chainlit/
 docs/examples/.chainlit/*
 .chainlit/config.toml
 .chainlit/translations/en-US.json
+
+local/
+.files/
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,15 @@
 # Changelog

-## TODO:
- User agent: "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36",
+## [0.2.5] - 2024-06-18
+### Added
+- Added five important hooks to the crawler:
+  - on_driver_created: Called when the driver is ready for initializations.
+  - before_get_url: Called right before Selenium fetches the URL.
+  - after_get_url: Called after Selenium fetches the URL.
+  - before_return_html: Called when the data is parsed and ready.
+  - on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
+- Added an example in `quickstart.py` in the example folder under the docs.
+
+## [0.2.4] - 2024-06-17
+### Fixed
+- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.3 🕷️🤖
+# Crawl4AI v0.2.5 🕷️🤖

 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
 [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
@@ -8,10 +8,23 @@

 Crawl4AI has one clear task: to simplify crawling and extract useful information from web pages, making it accessible for large language models (LLMs) and AI applications. 🆓🌐

-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
+- Use as REST API: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
+- Use as Python library: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)

 ## Recent Changes 

+### v0.2.5
+- 🌟 Added five important hooks to the crawler:
+  - 🟢 on_driver_created: Called when the driver is ready for initializations.
+  - 🔵 before_get_url: Called right before Selenium fetches the URL.
+  - 🟣 after_get_url: Called after Selenium fetches the URL.
+  - 🟠 before_return_html: Called when the data is parsed and ready.
+  - 🟡 on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
+- 📄 Added an example in [`quickstart.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py) in the example folder under the docs.
+
+### v0.2.4
+- 🐞 Resolved the issue with long URLs. (Issue #22)
+
 ### v0.2.3
 - 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
 - 🔗 Extrat all external and internal links. Check `result.links`
@@ -50,9 +63,12 @@ data = {
 response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally 
 response_data = response.json()
 print(response_data['results'][0].keys())
+# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media', 
+# 'links', 'screenshot', 'markdown', 'extracted_content', 
+# 'metadata', 'error_message'])
 ```

-To show the simplicity take a look at the first example:
+If you need more control, take a look at the first example of using the Python library.

 ```python
 from crawl4ai import WebCrawler
@@ -62,24 +78,7 @@ crawler = WebCrawler()

 # Run the crawler with keyword filtering and CSS selector
 result = crawler.run(url="https://www.nbcnews.com/business")
-print(result) # {url, html, markdown, extracted_content, metadata}
-```
-
-If you don't want to install Selenium, you can use the REST API or local server. 
-
-```python
-import requests
-
-data = {
-  "urls": [
-    "https://www.nbcnews.com/business"
-  ],
-  "word_count_threshold": 10,
-  "extraction_strategy": "NoExtractionStrategy",
-}
-
-response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally 
-print(response.json())
+print(result) # {url, html, cleaned_html, markdown, media, links, extracted_content, metadata, screenshot}
 ```

 Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -10,7 +10,7 @@ import logging
 import base64
 from PIL import Image, ImageDraw, ImageFont
 from io import BytesIO
-from typing import List
+from typing import List, Callable
 import requests
 import os
 from pathlib import Path
@@ -48,6 +48,10 @@ class CrawlerStrategy(ABC):
    @abstractmethod
    def update_user_agent(self, user_agent: str):
        pass
+    
+    @abstractmethod
+    def set_hook(self, hook_type: str, hook: Callable):
+        pass

 class CloudCrawlerStrategy(CrawlerStrategy):
    def __init__(self, use_cached_html = False):
@@ -96,32 +100,74 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        self.use_cached_html = use_cached_html
        self.js_code = js_code
        self.verbose = kwargs.get("verbose", False)
+        
+        # Hooks
+        self.hooks = {
+            'on_driver_created': None,
+            'on_user_agent_updated': None,
+            'before_get_url': None,
+            'after_get_url': None,
+            'before_return_html': None
+        }

        # chromedriver_autoinstaller.install()
        import chromedriver_autoinstaller
        self.service = Service(chromedriver_autoinstaller.install())
        self.service.log_path = "NUL"
        self.driver = webdriver.Chrome(service=self.service, options=self.options)
+        self.driver = self.execute_hook('on_driver_created', self.driver)
+
+    def set_hook(self, hook_type: str, hook: Callable):
+        if hook_type in self.hooks:
+            self.hooks[hook_type] = hook
+        else:
+            raise ValueError(f"Invalid hook type: {hook_type}")
+    
+    def execute_hook(self, hook_type: str, *args):
+        hook = self.hooks.get(hook_type)
+        if hook:
+            result = hook(*args)
+            if result is not None:
+                if isinstance(result, webdriver.Chrome):
+                    return result
+                else:
+                    raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
+        # If the hook returns None or there is no hook, return self.driver
+        return self.driver

    def update_user_agent(self, user_agent: str):
        self.options.add_argument(f"user-agent={user_agent}")
        self.driver.quit()
        self.driver = webdriver.Chrome(service=self.service, options=self.options)
+        self.driver = self.execute_hook('on_user_agent_updated', self.driver)
+
+    def set_custom_headers(self, headers: dict):
+        # Enable Network domain for sending headers
+        self.driver.execute_cdp_cmd('Network.enable', {})
+        # Set extra HTTP headers
+        self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
+

    def crawl(self, url: str) -> str:
+        # Create md5 hash of the URL
+        import hashlib
+        url_hash = hashlib.md5(url.encode()).hexdigest()
+        
        if self.use_cached_html:
-            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
+            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
            if os.path.exists(cache_file_path):
                with open(cache_file_path, "r") as f:
                    return f.read()

        try:
+            self.driver = self.execute_hook('before_get_url', self.driver)
            if self.verbose:
                print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
            self.driver.get(url)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
            )
+            self.driver = self.execute_hook('after_get_url', self.driver)
            
            # Execute JS code if provided
            if self.js_code and type(self.js_code) == str:
@@ -138,9 +184,10 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
                    )
            
            html = self.driver.page_source
+            self.driver = self.execute_hook('before_return_html', self.driver, html)
            
            # Store in cache
-            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
+            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
            with open(cache_file_path, "w") as f:
                f.write(html)
                
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -192,6 +192,64 @@ def multiple_scrip(crawler):
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)

+def using_crawler_hooks(crawler):
+    # Example usage of the hooks for authentication and setting a cookie
+    def on_driver_created(driver):
+        print("[HOOK] on_driver_created")
+        # Example customization: maximize the window
+        driver.maximize_window()
+        
+        # Example customization: logging in to a hypothetical website
+        driver.get('https://example.com/login')
+        
+        from selenium.webdriver.support.ui import WebDriverWait
+        WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.NAME, 'username'))
+        )
+        driver.find_element(By.NAME, 'username').send_keys('testuser')
+        driver.find_element(By.NAME, 'password').send_keys('password123')
+        driver.find_element(By.NAME, 'login').click()
+        WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.ID, 'welcome'))
+        )
+        # Add a custom cookie
+        driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
+        return driver        
+        
+
+    def before_get_url(driver):
+        print("[HOOK] before_get_url")
+        # Example customization: add a custom header
+        # Enable Network domain for sending headers
+        driver.execute_cdp_cmd('Network.enable', {})
+        # Add a custom header
+        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
+        return driver
+    
+    def after_get_url(driver):
+        print("[HOOK] after_get_url")
+        # Example customization: log the URL
+        print(driver.current_url)
+        return driver
+
+    def before_return_html(driver, html):
+        print("[HOOK] before_return_html")
+        # Example customization: log the HTML
+        print(len(html))
+        return driver
+    
+    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
+    
+    crawler.set_hook('on_driver_created', on_driver_created)
+    crawler.set_hook('before_get_url', before_get_url)
+    crawler.set_hook('after_get_url', after_get_url)
+    crawler.set_hook('before_return_html', before_return_html)
+    
+    result = crawler.run(url="https://example.com")
+    
+    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
+    print_result(result= result)
+
 def main():
    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
    cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
--- a/pages/index.html
+++ b/pages/index.html
@@ -25,7 +25,7 @@
        <header class="bg-zinc-950 text-lime-500 py-4 flex">
            
            <div class="mx-auto px-4">
-                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.3</h1>
+                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.5</h1>
            </div>
            <div class="mx-auto px-4 flex font-bold text-xl gap-2">
                <span>📊 Total Website Processed</span>
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,3 +18,4 @@ chromedriver-autoinstaller
 torch
 onnxruntime
 tokenizers
+pillow
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@ class CustomInstallCommand(install):

 setup(
    name="Crawl4AI",
-    version="0.2.3",
+    version="0.2.5",
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
    long_description=open("README.md").read(),
    long_description_content_type="text/markdown",