Add driver hooks to LocalSeleniumCrawlerStrategy.

Summary of what the hunks below do:
  * CrawlerStrategy gains an abstract set_hook(hook_type, hook).
  * LocalSeleniumCrawlerStrategy gains a `hooks` dict with four hook points
    (on_driver_created, before_get_url, after_get_url, before_return_html),
    set_hook(), execute_hook() and set_custom_headers().
  * execute_hook() returns the hook's result when it is a webdriver.Chrome,
    raises TypeError for any other non-None result, and otherwise returns
    self.driver.
  * docs/examples/quickstart.py gains a using_crawler_hooks() example.

NOTE(review): this patch was reconstructed from a copy whose line breaks and
indentation had been collapsed. The +/- line counts were checked against every
@@ header, but indentation and the position of blank context lines are
inferred. Verify against the target files, or apply with
`git apply --ignore-whitespace`.
NOTE(review): three mis-encoded emoji in log strings were restored to their
presumed originals (spider web, package, link) -- confirm against the repo.
NOTE(review): the quickstart hook uses `EC` and `By` but only imports
WebDriverWait locally; presumably both are imported at the top of
quickstart.py -- confirm.
NOTE(review): this patch calls the on_driver_created hook only from
update_user_agent(); whether the driver built in __init__ should also trigger
it cannot be told from these hunks -- confirm intent.

diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index cfd307e9..8cadd75c 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -10,7 +10,7 @@ import logging
 import base64
 from PIL import Image, ImageDraw, ImageFont
 from io import BytesIO
-from typing import List
+from typing import List, Callable
 import requests
 import os
 from pathlib import Path
@@ -48,6 +48,10 @@ class CrawlerStrategy(ABC):
     @abstractmethod
     def update_user_agent(self, user_agent: str):
         pass
+
+    @abstractmethod
+    def set_hook(self, hook_type: str, hook: Callable):
+        pass

 class CloudCrawlerStrategy(CrawlerStrategy):
     def __init__(self, use_cached_html = False):
@@ -96,6 +100,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.use_cached_html = use_cached_html
         self.js_code = js_code
         self.verbose = kwargs.get("verbose", False)
+
+        # Hooks
+        self.hooks = {
+            'on_driver_created': None,
+            'before_get_url': None,
+            'after_get_url': None,
+            'before_return_html': None
+        }

         # chromedriver_autoinstaller.install()
         import chromedriver_autoinstaller
@@ -103,10 +115,36 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.service.log_path = "NUL"
         self.driver = webdriver.Chrome(service=self.service, options=self.options)

+    def set_hook(self, hook_type: str, hook: Callable):
+        if hook_type in self.hooks:
+            self.hooks[hook_type] = hook
+        else:
+            raise ValueError(f"Invalid hook type: {hook_type}")
+
+    def execute_hook(self, hook_type: str, *args):
+        hook = self.hooks.get(hook_type)
+        if hook:
+            result = hook(*args)
+            if result is not None:
+                if isinstance(result, webdriver.Chrome):
+                    return result
+                else:
+                    raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
+        # If the hook returns None or there is no hook, return self.driver
+        return self.driver
+
     def update_user_agent(self, user_agent: str):
         self.options.add_argument(f"user-agent={user_agent}")
         self.driver.quit()
         self.driver = webdriver.Chrome(service=self.service, options=self.options)
+        self.driver = self.execute_hook('on_driver_created', self.driver)
+
+    def set_custom_headers(self, headers: dict):
+        # Enable Network domain for sending headers
+        self.driver.execute_cdp_cmd('Network.enable', {})
+        # Set extra HTTP headers
+        self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
+

     def crawl(self, url: str) -> str:
         # Create md5 hash of the URL
@@ -120,12 +158,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
                     return f.read()

         try:
+            self.driver = self.execute_hook('before_get_url', self.driver)
             if self.verbose:
                 print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
             self.driver.get(url)
             WebDriverWait(self.driver, 10).until(
                 EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
             )
+            self.driver = self.execute_hook('after_get_url', self.driver)

             # Execute JS code if provided
             if self.js_code and type(self.js_code) == str:
@@ -142,6 +182,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
                     )

             html = self.driver.page_source
+            self.driver = self.execute_hook('before_return_html', self.driver, html)

             # Store in cache
             cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py
index 44b0c0e9..a6139f0a 100644
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -192,6 +192,64 @@ def multiple_scrip(crawler):
     cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
     print_result(result)

+def using_crawler_hooks(crawler):
+    # Example usage of the hooks for authentication and setting a cookie
+    def on_driver_created(driver):
+        print("[HOOK] on_driver_created")
+        # Example customization: maximize the window
+        driver.maximize_window()
+
+        # Example customization: logging in to a hypothetical website
+        driver.get('https://example.com/login')
+
+        from selenium.webdriver.support.ui import WebDriverWait
+        WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.NAME, 'username'))
+        )
+        driver.find_element(By.NAME, 'username').send_keys('testuser')
+        driver.find_element(By.NAME, 'password').send_keys('password123')
+        driver.find_element(By.NAME, 'login').click()
+        WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.ID, 'welcome'))
+        )
+        # Add a custom cookie
+        driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
+        return driver
+
+
+    def before_get_url(driver):
+        print("[HOOK] before_get_url")
+        # Example customization: add a custom header
+        # Enable Network domain for sending headers
+        driver.execute_cdp_cmd('Network.enable', {})
+        # Add a custom header
+        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
+        return driver
+
+    def after_get_url(driver):
+        print("[HOOK] after_get_url")
+        # Example customization: log the URL
+        print(driver.current_url)
+        return driver
+
+    def before_return_html(driver, html):
+        print("[HOOK] before_return_html")
+        # Example customization: log the HTML
+        print(len(html))
+        return driver
+
+    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
+
+    crawler.set_hook('on_driver_created', on_driver_created)
+    crawler.set_hook('before_get_url', before_get_url)
+    crawler.set_hook('after_get_url', after_get_url)
+    crawler.set_hook('before_return_html', before_return_html)
+
+    result = crawler.run(url="https://example.com")
+
+    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
+    print_result(result= result)
+
 def main():
     cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
     cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")