chore: Add custom headers to LocalSeleniumCrawlerStrategy

chore: Add hooks for customizing the LocalSeleniumCrawlerStrategy
Fix typo in README
2024-06-17 15:50:03 +08:00 · 2024-06-17 15:37:18 +08:00 · 2024-06-17 15:15:37 +08:00
3 changed files with 101 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
 ## Recent Changes 
-### v0.2.34
+### v0.2.4
 - 🐞 Resolve the issue with the long url. (Issue #22)
 ### v0.2.3
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -10,7 +10,7 @@ import logging
 import base64
 from PIL import Image, ImageDraw, ImageFont
 from io import BytesIO
-from typing import List
+from typing import List, Callable
 import requests
 import os
 from pathlib import Path
@@ -48,6 +48,10 @@ class CrawlerStrategy(ABC):
    @abstractmethod
    def update_user_agent(self, user_agent: str):
        """Apply a new user agent string to this crawler strategy.

        Abstract declaration only: the body is empty and every concrete
        strategy has to supply its own implementation.

        Args:
            user_agent: The user agent string to apply.
        """
        pass
    @abstractmethod
    def set_hook(self, hook_type: str, hook: Callable):
        """Attach a callback to a named hook point of this strategy.

        Abstract declaration only: the body is empty and every concrete
        strategy has to supply its own implementation.

        Args:
            hook_type: Name identifying the hook point.
            hook: Callable to associate with that hook point.
        """
        pass
 class CloudCrawlerStrategy(CrawlerStrategy):
    def __init__(self, use_cached_html = False):
@@ -96,6 +100,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        self.use_cached_html = use_cached_html
        self.js_code = js_code
        self.verbose = kwargs.get("verbose", False)
        # Hooks
        self.hooks = {
            'on_driver_created': None,
            'before_get_url': None,
            'after_get_url': None,
            'before_return_html': None
        }
        # chromedriver_autoinstaller.install()
        import chromedriver_autoinstaller
@@ -103,10 +115,36 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        self.service.log_path = "NUL"
        self.driver = webdriver.Chrome(service=self.service, options=self.options)
    def set_hook(self, hook_type: str, hook: Callable):
        """Register, replace or clear the callback for one hook point.

        Args:
            hook_type: Name of the hook point; must be a key of ``self.hooks``.
            hook: Callable to run at that hook point, or ``None`` to clear it.

        Raises:
            ValueError: If ``hook_type`` is not a known hook point.
            TypeError: If ``hook`` is neither callable nor ``None``.
        """
        if hook_type not in self.hooks:
            raise ValueError(f"Invalid hook type: {hook_type}")
        # Reject non-callables at registration time so a bad hook is reported
        # here instead of failing later, when the hook would be invoked.
        if hook is not None and not callable(hook):
            raise TypeError(
                f"Hook for {hook_type} must be callable or None, got {type(hook).__name__}."
            )
        self.hooks[hook_type] = hook
    def execute_hook(self, hook_type: str, *args):
        """Run the callback stored for ``hook_type`` and pick the driver to use.

        Args:
            hook_type: Key looked up in ``self.hooks``.
            *args: Positional arguments forwarded unchanged to the callback.

        Returns:
            The ``webdriver.Chrome`` instance returned by the callback, or
            ``self.driver`` when no callback is set or it returns ``None``.

        Raises:
            TypeError: If the callback returns anything other than ``None``
                or a ``webdriver.Chrome`` instance.
        """
        callback = self.hooks.get(hook_type)
        # Nothing registered for this hook point: keep the current driver.
        if not callback:
            return self.driver

        outcome = callback(*args)
        # A callback that returns None leaves the current driver in place.
        if outcome is None:
            return self.driver

        if not isinstance(outcome, webdriver.Chrome):
            raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
        return outcome
    def update_user_agent(self, user_agent: str):
        """Restart the Chrome driver with a ``user-agent`` argument added.

        The current driver is quit and a new ``webdriver.Chrome`` is built
        from ``self.service`` and ``self.options``; the ``on_driver_created``
        hook is then run with the new driver.

        Args:
            user_agent: Value placed in the ``user-agent=...`` Chrome argument.
        """
        # The argument is added to self.options, which is handed to the
        # replacement driver created below.
        self.options.add_argument(f"user-agent={user_agent}")
        self.driver.quit()
        self.driver = webdriver.Chrome(service=self.service, options=self.options)
        # execute_hook returns the hook's driver, or self.driver when no hook
        # is registered or the hook returns None.
        self.driver = self.execute_hook('on_driver_created', self.driver)
    def set_custom_headers(self, headers: dict):
        """Send the given extra HTTP headers to the driver through CDP.

        Args:
            headers: Mapping passed as the ``headers`` parameter of the
                ``Network.setExtraHTTPHeaders`` command.
        """
        # The Network domain is enabled first, then the headers are set.
        cdp_sequence = (
            ('Network.enable', {}),
            ('Network.setExtraHTTPHeaders', {'headers': headers}),
        )
        for command, params in cdp_sequence:
            self.driver.execute_cdp_cmd(command, params)
    def crawl(self, url: str) -> str:
        # Create md5 hash of the URL
@@ -120,12 +158,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
                    return f.read()
        try:
            self.driver = self.execute_hook('before_get_url', self.driver)
            if self.verbose:
                print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
            self.driver.get(url)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
            )
            self.driver = self.execute_hook('after_get_url', self.driver)
            # Execute JS code if provided
            if self.js_code and type(self.js_code) == str:
@@ -142,6 +182,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
                    )
            html = self.driver.page_source
            self.driver = self.execute_hook('before_return_html', self.driver, html)
            # Store in cache
            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -192,6 +192,64 @@ def multiple_scrip(crawler):
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)
 def using_crawler_hooks(crawler):
    """Show the crawler hooks: register one callback per hook point, then crawl.

    The callbacks illustrate authentication, setting a cookie, adding a
    request header and logging. Each one returns the driver it was given.
    """
    def on_driver_created(driver):
        """Maximize the window, log in to a hypothetical site, add a cookie."""
        print("[HOOK] on_driver_created")
        driver.maximize_window()
        # Log in to a hypothetical website.
        driver.get('https://example.com/login')
        from selenium.webdriver.support.ui import WebDriverWait
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, 'username'))
        )
        credentials = (('username', 'testuser'), ('password', 'password123'))
        for field_name, text in credentials:
            driver.find_element(By.NAME, field_name).send_keys(text)
        driver.find_element(By.NAME, 'login').click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'welcome'))
        )
        # Add a custom cookie.
        test_cookie = {'name': 'test_cookie', 'value': 'cookie_value'}
        driver.add_cookie(test_cookie)
        return driver

    def before_get_url(driver):
        """Add a custom request header through CDP."""
        print("[HOOK] before_get_url")
        extra_headers = {'X-Test-Header': 'test'}
        # Enable the Network domain before sending the header.
        driver.execute_cdp_cmd('Network.enable', {})
        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': extra_headers})
        return driver

    def after_get_url(driver):
        """Log the URL the driver is currently on."""
        print("[HOOK] after_get_url")
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        """Log the length of the HTML."""
        print("[HOOK] before_return_html")
        print(len(html))
        return driver

    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)

    # Registration order matches the order of the mapping below.
    callbacks = {
        'on_driver_created': on_driver_created,
        'before_get_url': before_get_url,
        'after_get_url': after_get_url,
        'before_return_html': before_return_html,
    }
    for hook_type, callback in callbacks.items():
        crawler.set_hook(hook_type, callback)

    result = crawler.run(url="https://example.com")
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result=result)
 def main():
    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
    cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
Author	SHA1	Message	Date
unclecode	77da48050d	chore: Add custom headers to LocalSeleniumCrawlerStrategy	2024-06-17 15:50:03 +08:00
unclecode	9a97aacd85	chore: Add hooks for customizing the LocalSeleniumCrawlerStrategy	2024-06-17 15:37:18 +08:00
unclecode	52daf3936a	Fix typo in README	2024-06-17 15:15:37 +08:00