Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 77da48050d | | |
| | 9a97aacd85 | | |
| | 52daf3936a | | |
@@ -13,7 +13,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
|
|||||||
|
|
||||||
## Recent Changes
|
## Recent Changes
|
||||||
|
|
||||||
### v0.2.34
|
### v0.2.4
|
||||||
- 🐞 Resolve the issue with the long url. (Issue #22)
|
- 🐞 Resolve the issue with the long url. (Issue #22)
|
||||||
|
|
||||||
### v0.2.3
|
### v0.2.3
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import logging
|
|||||||
import base64
|
import base64
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import List
|
from typing import List, Callable
|
||||||
import requests
|
import requests
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -48,6 +48,10 @@ class CrawlerStrategy(ABC):
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def update_user_agent(self, user_agent: str):
|
def update_user_agent(self, user_agent: str):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Abstract hook-registration method added to CrawlerStrategy: every concrete
# strategy must provide set_hook(hook_type, hook).
# hook_type is the name of the hook slot (str); hook is the callable to store.
@abstractmethod
|
||||||
|
def set_hook(self, hook_type: str, hook: Callable):
|
||||||
|
pass
|
||||||
|
|
||||||
class CloudCrawlerStrategy(CrawlerStrategy):
|
class CloudCrawlerStrategy(CrawlerStrategy):
|
||||||
def __init__(self, use_cached_html = False):
|
def __init__(self, use_cached_html = False):
|
||||||
@@ -96,6 +100,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
self.use_cached_html = use_cached_html
|
self.use_cached_html = use_cached_html
|
||||||
self.js_code = js_code
|
self.js_code = js_code
|
||||||
self.verbose = kwargs.get("verbose", False)
|
self.verbose = kwargs.get("verbose", False)
|
||||||
|
|
||||||
|
# Hooks
|
||||||
|
self.hooks = {
|
||||||
|
'on_driver_created': None,
|
||||||
|
'before_get_url': None,
|
||||||
|
'after_get_url': None,
|
||||||
|
'before_return_html': None
|
||||||
|
}
|
||||||
|
|
||||||
# chromedriver_autoinstaller.install()
|
# chromedriver_autoinstaller.install()
|
||||||
import chromedriver_autoinstaller
|
import chromedriver_autoinstaller
|
||||||
@@ -103,10 +115,36 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
self.service.log_path = "NUL"
|
self.service.log_path = "NUL"
|
||||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||||
|
|
||||||
|
# Register `hook` under the slot named `hook_type`, replacing whatever was
# stored there before.
# Only names that already exist as keys of self.hooks are accepted
# ('on_driver_created', 'before_get_url', 'after_get_url',
# 'before_return_html' in the dict built in __init__).
# Raises ValueError when hook_type is not one of those keys.
def set_hook(self, hook_type: str, hook: Callable):
|
||||||
|
if hook_type in self.hooks:
|
||||||
|
self.hooks[hook_type] = hook
|
||||||
|
else:
|
||||||
|
# Unknown names are rejected instead of silently adding a new slot.
raise ValueError(f"Invalid hook type: {hook_type}")
|
||||||
|
|
||||||
|
# Call the hook stored under `hook_type` (if any) with *args and return the
# driver object the caller should assign back to self.driver.
# Returns: the hook's return value when it is a webdriver.Chrome instance,
# otherwise self.driver (no hook registered, or the hook returned None).
# Raises TypeError when the hook returns anything else.
def execute_hook(self, hook_type: str, *args):
|
||||||
|
# .get() means an unknown hook_type is treated the same as "no hook set".
hook = self.hooks.get(hook_type)
|
||||||
|
if hook:
|
||||||
|
result = hook(*args)
|
||||||
|
if result is not None:
|
||||||
|
# A hook may hand back a replacement driver, but only a Chrome driver.
if isinstance(result, webdriver.Chrome):
|
||||||
|
return result
|
||||||
|
else:
|
||||||
|
raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
|
||||||
|
# If the hook returns None or there is no hook, return self.driver
|
||||||
|
return self.driver
|
||||||
|
|
||||||
# Apply a new user-agent string by appending it to self.options and
# recreating the Chrome driver with those options.
# (Old and new sides of the diff are both shown; only the final line is new.)
def update_user_agent(self, user_agent: str):
|
def update_user_agent(self, user_agent: str):
|
||||||
self.options.add_argument(f"user-agent={user_agent}")
|
self.options.add_argument(f"user-agent={user_agent}")
|
||||||
# The current driver is shut down before a new one is constructed.
self.driver.quit()
|
self.driver.quit()
|
||||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||||
|
# New in this change: give the 'on_driver_created' hook a chance to
# customize or replace the freshly created driver.
self.driver = self.execute_hook('on_driver_created', self.driver)
|
||||||
|
|
||||||
|
# Send `headers` to the browser as extra HTTP headers by issuing two CDP
# commands on self.driver.
# headers: dict passed through unchanged as the 'headers' parameter of
# Network.setExtraHTTPHeaders (presumably header name -> value; confirm
# against the CDP documentation).
def set_custom_headers(self, headers: dict):
|
||||||
|
# Enable Network domain for sending headers
|
||||||
|
self.driver.execute_cdp_cmd('Network.enable', {})
|
||||||
|
# Set extra HTTP headers
|
||||||
|
self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
|
||||||
|
|
||||||
|
|
||||||
def crawl(self, url: str) -> str:
|
def crawl(self, url: str) -> str:
|
||||||
# Create md5 hash of the URL
|
# Create md5 hash of the URL
|
||||||
@@ -120,12 +158,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
return f.read()
|
return f.read()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
self.driver = self.execute_hook('before_get_url', self.driver)
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
|
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
|
||||||
self.driver.get(url)
|
self.driver.get(url)
|
||||||
WebDriverWait(self.driver, 10).until(
|
WebDriverWait(self.driver, 10).until(
|
||||||
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
|
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
|
||||||
)
|
)
|
||||||
|
self.driver = self.execute_hook('after_get_url', self.driver)
|
||||||
|
|
||||||
# Execute JS code if provided
|
# Execute JS code if provided
|
||||||
if self.js_code and type(self.js_code) == str:
|
if self.js_code and type(self.js_code) == str:
|
||||||
@@ -142,6 +182,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
)
|
)
|
||||||
|
|
||||||
html = self.driver.page_source
|
html = self.driver.page_source
|
||||||
|
self.driver = self.execute_hook('before_return_html', self.driver, html)
|
||||||
|
|
||||||
# Store in cache
|
# Store in cache
|
||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||||
|
|||||||
@@ -192,6 +192,64 @@ def multiple_scrip(crawler):
|
|||||||
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
|
# Quickstart example: defines one callback per hook name, registers all four
# on `crawler` via crawler.set_hook, runs a crawl of https://example.com and
# prints the result.
# NOTE(review): in the visible strategy code 'on_driver_created' is executed
# only from update_user_agent; confirm it also fires for a driver that was
# created before these hooks are registered.
def using_crawler_hooks(crawler):
|
||||||
|
# Example usage of the hooks for authentication and setting a cookie
|
||||||
|
# Hook: receives the driver, customizes it, and returns the same driver.
def on_driver_created(driver):
|
||||||
|
print("[HOOK] on_driver_created")
|
||||||
|
# Example customization: maximize the window
|
||||||
|
driver.maximize_window()
|
||||||
|
|
||||||
|
# Example customization: logging in to a hypothetical website
|
||||||
|
driver.get('https://example.com/login')
|
||||||
|
|
||||||
|
# NOTE(review): only WebDriverWait is imported here; EC and By are
# presumably imported at module level — confirm.
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
# Wait up to 10 seconds for the username field before typing.
WebDriverWait(driver, 10).until(
|
||||||
|
EC.presence_of_element_located((By.NAME, 'username'))
|
||||||
|
)
|
||||||
|
# Placeholder credentials for the hypothetical login form.
driver.find_element(By.NAME, 'username').send_keys('testuser')
|
||||||
|
driver.find_element(By.NAME, 'password').send_keys('password123')
|
||||||
|
driver.find_element(By.NAME, 'login').click()
|
||||||
|
# Wait up to 10 seconds for an element with id 'welcome' after submitting.
WebDriverWait(driver, 10).until(
|
||||||
|
EC.presence_of_element_located((By.ID, 'welcome'))
|
||||||
|
)
|
||||||
|
# Add a custom cookie
|
||||||
|
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
|
||||||
|
return driver
|
||||||
|
|
||||||
|
|
||||||
|
# Hook: enables the CDP Network domain and sets one extra request header.
def before_get_url(driver):
|
||||||
|
print("[HOOK] before_get_url")
|
||||||
|
# Example customization: add a custom header
|
||||||
|
# Enable Network domain for sending headers
|
||||||
|
driver.execute_cdp_cmd('Network.enable', {})
|
||||||
|
# Add a custom header
|
||||||
|
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
|
||||||
|
return driver
|
||||||
|
|
||||||
|
# Hook: prints the driver's current URL and returns the driver unchanged.
def after_get_url(driver):
|
||||||
|
print("[HOOK] after_get_url")
|
||||||
|
# Example customization: log the URL
|
||||||
|
print(driver.current_url)
|
||||||
|
return driver
|
||||||
|
|
||||||
|
# Hook: receives the driver and the page HTML; only the HTML length is
# printed, and the driver (not the HTML) is returned.
def before_return_html(driver, html):
|
||||||
|
print("[HOOK] before_return_html")
|
||||||
|
# Example customization: log the HTML
|
||||||
|
print(len(html))
|
||||||
|
return driver
|
||||||
|
|
||||||
|
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||||
|
|
||||||
|
# Register each callback under its matching hook name.
crawler.set_hook('on_driver_created', on_driver_created)
|
||||||
|
crawler.set_hook('before_get_url', before_get_url)
|
||||||
|
crawler.set_hook('after_get_url', after_get_url)
|
||||||
|
crawler.set_hook('before_return_html', before_return_html)
|
||||||
|
|
||||||
|
result = crawler.run(url="https://example.com")
|
||||||
|
|
||||||
|
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||||
|
print_result(result= result)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
||||||
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
|
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
|
||||||
|
|||||||
Reference in New Issue
Block a user