Compare commits

..

3 Commits

Author SHA1 Message Date
unclecode
77da48050d chore: Add custom headers to LocalSeleniumCrawlerStrategy 2024-06-17 15:50:03 +08:00
unclecode
9a97aacd85 chore: Add hooks for customizing the LocalSeleniumCrawlerStrategy 2024-06-17 15:37:18 +08:00
unclecode
52daf3936a Fix typo in README 2024-06-17 15:15:37 +08:00
3 changed files with 101 additions and 2 deletions

View File

@@ -13,7 +13,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
## Recent Changes ## Recent Changes
### v0.2.34 ### v0.2.4
- 🐞 Resolve the issue with the long url. (Issue #22) - 🐞 Resolve the issue with the long url. (Issue #22)
### v0.2.3 ### v0.2.3

View File

@@ -10,7 +10,7 @@ import logging
import base64 import base64
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
from io import BytesIO from io import BytesIO
from typing import List from typing import List, Callable
import requests import requests
import os import os
from pathlib import Path from pathlib import Path
@@ -48,6 +48,10 @@ class CrawlerStrategy(ABC):
@abstractmethod @abstractmethod
def update_user_agent(self, user_agent: str): def update_user_agent(self, user_agent: str):
pass pass
@abstractmethod
def set_hook(self, hook_type: str, hook: Callable):
    """Attach ``hook`` to the customization point named ``hook_type``.

    Abstract: this declaration has no behavior of its own; concrete
    crawler strategies supply the implementation.

    Args:
        hook_type: Name of the hook slot to fill.
        hook: Callable to associate with that slot.
    """
class CloudCrawlerStrategy(CrawlerStrategy): class CloudCrawlerStrategy(CrawlerStrategy):
def __init__(self, use_cached_html = False): def __init__(self, use_cached_html = False):
@@ -96,6 +100,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.use_cached_html = use_cached_html self.use_cached_html = use_cached_html
self.js_code = js_code self.js_code = js_code
self.verbose = kwargs.get("verbose", False) self.verbose = kwargs.get("verbose", False)
# Hooks
self.hooks = {
'on_driver_created': None,
'before_get_url': None,
'after_get_url': None,
'before_return_html': None
}
# chromedriver_autoinstaller.install() # chromedriver_autoinstaller.install()
import chromedriver_autoinstaller import chromedriver_autoinstaller
@@ -103,10 +115,36 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.service.log_path = "NUL" self.service.log_path = "NUL"
self.driver = webdriver.Chrome(service=self.service, options=self.options) self.driver = webdriver.Chrome(service=self.service, options=self.options)
def set_hook(self, hook_type: str, hook: Callable):
    """Store ``hook`` under one of the hook names known to this strategy.

    Args:
        hook_type: Must already be a key of ``self.hooks``.
        hook: Object saved for that key, replacing any previous value.

    Raises:
        ValueError: If ``hook_type`` is not a key of ``self.hooks``.
    """
    # Reject unknown names up front so the registry never grows new keys.
    if hook_type not in self.hooks:
        raise ValueError(f"Invalid hook type: {hook_type}")
    self.hooks[hook_type] = hook
def execute_hook(self, hook_type: str, *args):
    """Run the hook stored for ``hook_type`` and pick the driver to use next.

    Args:
        hook_type: Key looked up in ``self.hooks``; an unknown key is
            treated the same as an unset hook.
        *args: Positional arguments forwarded unchanged to the hook.

    Returns:
        The hook's return value when it is a ``webdriver.Chrome`` instance;
        otherwise ``self.driver`` (no hook set, or the hook returned None).

    Raises:
        TypeError: If the hook returns anything other than None or a
            ``webdriver.Chrome`` instance.
    """
    callback = self.hooks.get(hook_type)
    outcome = callback(*args) if callback else None

    # No hook, or a hook that returned nothing: keep the current driver.
    if outcome is None:
        return self.driver

    if not isinstance(outcome, webdriver.Chrome):
        raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
    return outcome
def update_user_agent(self, user_agent: str):
    """Restart the Chrome driver with ``user_agent`` added to its options.

    The current driver is quit and a new ``webdriver.Chrome`` is created
    from ``self.service`` and ``self.options``; the ``on_driver_created``
    hook is then run on the new driver.

    Args:
        user_agent: Value placed in the ``user-agent=`` Chrome argument.
    """
    ua_argument = f"user-agent={user_agent}"
    self.options.add_argument(ua_argument)

    # The argument is only read at launch, so the browser is recreated.
    self.driver.quit()
    self.driver = webdriver.Chrome(service=self.service, options=self.options)

    # self.driver must already point at the new browser here: execute_hook
    # falls back to self.driver when the hook is unset or returns None.
    self.driver = self.execute_hook('on_driver_created', self.driver)
def set_custom_headers(self, headers: dict):
    """Send ``headers`` to the browser as extra HTTP headers via CDP.

    Issues two DevTools commands on ``self.driver``, in order:
    ``Network.enable`` and then ``Network.setExtraHTTPHeaders``.

    Args:
        headers: Mapping passed as the ``headers`` parameter of
            ``Network.setExtraHTTPHeaders``.
    """
    cdp_calls = (
        # Enable the Network domain first, then install the headers.
        ('Network.enable', {}),
        ('Network.setExtraHTTPHeaders', {'headers': headers}),
    )
    for command, params in cdp_calls:
        self.driver.execute_cdp_cmd(command, params)
def crawl(self, url: str) -> str: def crawl(self, url: str) -> str:
# Create md5 hash of the URL # Create md5 hash of the URL
@@ -120,12 +158,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
return f.read() return f.read()
try: try:
self.driver = self.execute_hook('before_get_url', self.driver)
if self.verbose: if self.verbose:
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...") print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
self.driver.get(url) self.driver.get(url)
WebDriverWait(self.driver, 10).until( WebDriverWait(self.driver, 10).until(
EC.presence_of_all_elements_located((By.TAG_NAME, "html")) EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
) )
self.driver = self.execute_hook('after_get_url', self.driver)
# Execute JS code if provided # Execute JS code if provided
if self.js_code and type(self.js_code) == str: if self.js_code and type(self.js_code) == str:
@@ -142,6 +182,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
) )
html = self.driver.page_source html = self.driver.page_source
self.driver = self.execute_hook('before_return_html', self.driver, html)
# Store in cache # Store in cache
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)

View File

@@ -192,6 +192,64 @@ def multiple_scrip(crawler):
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result) print_result(result)
def using_crawler_hooks(crawler):
    """Demonstrate the four crawler hooks by registering one example for each.

    Defines ``on_driver_created``, ``before_get_url``, ``after_get_url`` and
    ``before_return_html`` callbacks, registers them with
    ``crawler.set_hook``, runs a crawl of https://example.com and prints the
    result.

    Args:
        crawler: Object exposing ``set_hook(hook_type, hook)`` and
            ``run(url=...)``.
    """

    def on_driver_created(driver):
        """Example: log in to a hypothetical site and add a cookie."""
        # Import every selenium helper this hook uses locally, so the example
        # does not depend on By/EC happening to be in module scope.
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.support.ui import WebDriverWait

        print("[HOOK] on_driver_created")
        # Example customization: maximize the window
        driver.maximize_window()

        # Example customization: logging in to a hypothetical website
        driver.get('https://example.com/login')
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, 'username'))
        )
        driver.find_element(By.NAME, 'username').send_keys('testuser')
        driver.find_element(By.NAME, 'password').send_keys('password123')
        driver.find_element(By.NAME, 'login').click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'welcome'))
        )

        # Add a custom cookie
        driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
        return driver

    def before_get_url(driver):
        """Example: install a custom request header through CDP."""
        print("[HOOK] before_get_url")
        # Enable Network domain for sending headers
        driver.execute_cdp_cmd('Network.enable', {})
        # Add a custom header
        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
        return driver

    def after_get_url(driver):
        """Example: log the URL the driver is currently on."""
        print("[HOOK] after_get_url")
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        """Example: log the length of the HTML passed to the hook."""
        print("[HOOK] before_return_html")
        print(len(html))
        return driver

    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)

    crawler.set_hook('on_driver_created', on_driver_created)
    crawler.set_hook('before_get_url', before_get_url)
    crawler.set_hook('after_get_url', after_get_url)
    crawler.set_hook('before_return_html', before_return_html)

    result = crawler.run(url="https://example.com")

    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result=result)
def main(): def main():
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]") cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]") cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")