From 77da48050d5c463d837746e307dfcda9db5a2bea Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 17 Jun 2024 15:50:03 +0800 Subject: [PATCH] chore: Add custom headers to LocalSeleniumCrawlerStrategy --- crawl4ai/crawler_strategy.py | 7 +++++++ docs/examples/quickstart.py | 25 +++++++++++++++++-------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 2a6793e3..8cadd75c 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -139,6 +139,13 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): self.driver = webdriver.Chrome(service=self.service, options=self.options) self.driver = self.execute_hook('on_driver_created', self.driver) + def set_custom_headers(self, headers: dict): + # Enable Network domain for sending headers + self.driver.execute_cdp_cmd('Network.enable', {}) + # Set extra HTTP headers + self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers}) + + def crawl(self, url: str) -> str: # Create md5 hash of the URL import hashlib diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index 8e035d8d..a6139f0a 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -198,12 +198,11 @@ def using_crawler_hooks(crawler): print("[HOOK] on_driver_created") # Example customization: maximize the window driver.maximize_window() - return driver - - def before_get_url(driver): - print("[HOOK] before_get_url") + # Example customization: logging in to a hypothetical website driver.get('https://example.com/login') + + from selenium.webdriver.support.ui import WebDriverWait WebDriverWait(driver, 10).until( EC.presence_of_element_located((By.NAME, 'username')) ) @@ -215,8 +214,18 @@ def using_crawler_hooks(crawler): ) # Add a custom cookie driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'}) - return driver + return driver + + def before_get_url(driver): + print("[HOOK] before_get_url") + # 
Example customization: add a custom header + # Enable Network domain for sending headers + driver.execute_cdp_cmd('Network.enable', {}) + # Add a custom header + driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}}) + return driver + def after_get_url(driver): print("[HOOK] after_get_url") # Example customization: log the URL @@ -225,9 +234,9 @@ def using_crawler_hooks(crawler): def before_return_html(driver, html): print("[HOOK] before_return_html") - # Example customization: modify the HTML (for demonstration purposes) - modified_html = html.replace('Example Domain', 'Test Domain') - return driver, modified_html + # Example customization: log the HTML + print(len(html)) + return driver cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)