Compare commits
3 Commits
format-inl
...
hooks
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
77da48050d | ||
|
|
9a97aacd85 | ||
|
|
52daf3936a |
@@ -1,10 +1,5 @@
|
||||
# Changelog
|
||||
|
||||
## [0.2.5] - 2024-06-17
|
||||
### Added
|
||||
- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLM.
|
||||
|
||||
|
||||
## [0.2.4] - 2024-06-17
|
||||
### Fixed
|
||||
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
|
||||
@@ -13,9 +13,6 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
|
||||
|
||||
## Recent Changes
|
||||
|
||||
### v0.2.5
|
||||
- ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness.
|
||||
|
||||
### v0.2.4
|
||||
- 🐞 Resolve the issue with the long url. (Issue #22)
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ import logging
|
||||
import base64
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from io import BytesIO
|
||||
from typing import List
|
||||
from typing import List, Callable
|
||||
import requests
|
||||
import os
|
||||
from pathlib import Path
|
||||
@@ -48,6 +48,10 @@ class CrawlerStrategy(ABC):
|
||||
@abstractmethod
|
||||
def update_user_agent(self, user_agent: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def set_hook(self, hook_type: str, hook: Callable):
|
||||
pass
|
||||
|
||||
class CloudCrawlerStrategy(CrawlerStrategy):
|
||||
def __init__(self, use_cached_html = False):
|
||||
@@ -96,6 +100,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
self.use_cached_html = use_cached_html
|
||||
self.js_code = js_code
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
# Hooks
|
||||
self.hooks = {
|
||||
'on_driver_created': None,
|
||||
'before_get_url': None,
|
||||
'after_get_url': None,
|
||||
'before_return_html': None
|
||||
}
|
||||
|
||||
# chromedriver_autoinstaller.install()
|
||||
import chromedriver_autoinstaller
|
||||
@@ -103,10 +115,36 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
self.service.log_path = "NUL"
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
|
||||
def set_hook(self, hook_type: str, hook: Callable):
|
||||
if hook_type in self.hooks:
|
||||
self.hooks[hook_type] = hook
|
||||
else:
|
||||
raise ValueError(f"Invalid hook type: {hook_type}")
|
||||
|
||||
def execute_hook(self, hook_type: str, *args):
|
||||
hook = self.hooks.get(hook_type)
|
||||
if hook:
|
||||
result = hook(*args)
|
||||
if result is not None:
|
||||
if isinstance(result, webdriver.Chrome):
|
||||
return result
|
||||
else:
|
||||
raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
|
||||
# If the hook returns None or there is no hook, return self.driver
|
||||
return self.driver
|
||||
|
||||
def update_user_agent(self, user_agent: str):
|
||||
self.options.add_argument(f"user-agent={user_agent}")
|
||||
self.driver.quit()
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
self.driver = self.execute_hook('on_driver_created', self.driver)
|
||||
|
||||
def set_custom_headers(self, headers: dict):
|
||||
# Enable Network domain for sending headers
|
||||
self.driver.execute_cdp_cmd('Network.enable', {})
|
||||
# Set extra HTTP headers
|
||||
self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
|
||||
|
||||
|
||||
def crawl(self, url: str) -> str:
|
||||
# Create md5 hash of the URL
|
||||
@@ -120,12 +158,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
return f.read()
|
||||
|
||||
try:
|
||||
self.driver = self.execute_hook('before_get_url', self.driver)
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
|
||||
self.driver.get(url)
|
||||
WebDriverWait(self.driver, 10).until(
|
||||
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
|
||||
)
|
||||
self.driver = self.execute_hook('after_get_url', self.driver)
|
||||
|
||||
# Execute JS code if provided
|
||||
if self.js_code and type(self.js_code) == str:
|
||||
@@ -142,6 +182,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
)
|
||||
|
||||
html = self.driver.page_source
|
||||
self.driver = self.execute_hook('before_return_html', self.driver, html)
|
||||
|
||||
# Store in cache
|
||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||
|
||||
@@ -151,38 +151,6 @@ class CustomHTML2Text(HTML2Text):
|
||||
|
||||
super().handle_tag(tag, attrs, start)
|
||||
|
||||
def replace_inline_tags(soup, tags):
|
||||
tag_replacements = {
|
||||
'b': lambda tag: f"**{tag.text}**",
|
||||
'i': lambda tag: f"*{tag.text}*",
|
||||
'u': lambda tag: f"__{tag.text}__",
|
||||
'span': lambda tag: f"{tag.text}",
|
||||
'del': lambda tag: f"~~{tag.text}~~",
|
||||
'ins': lambda tag: f"++{tag.text}++",
|
||||
'sub': lambda tag: f"~{tag.text}~",
|
||||
'sup': lambda tag: f"^^{tag.text}^^",
|
||||
'strong': lambda tag: f"**{tag.text}**",
|
||||
'em': lambda tag: f"*{tag.text}*",
|
||||
'code': lambda tag: f"`{tag.text}`",
|
||||
'kbd': lambda tag: f"`{tag.text}`",
|
||||
'var': lambda tag: f"_{tag.text}_",
|
||||
's': lambda tag: f"~~{tag.text}~~",
|
||||
'q': lambda tag: f'"{tag.text}"',
|
||||
'abbr': lambda tag: f"{tag.text} ({tag.get('title', '')})",
|
||||
'cite': lambda tag: f"_{tag.text}_",
|
||||
'dfn': lambda tag: f"_{tag.text}_",
|
||||
'time': lambda tag: f"{tag.text}",
|
||||
'small': lambda tag: f"<small>{tag.text}</small>",
|
||||
'mark': lambda tag: f"=={tag.text}=="
|
||||
}
|
||||
|
||||
for tag_name in tags:
|
||||
for tag in soup.find_all(tag_name):
|
||||
replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag)
|
||||
tag.replace_with(replacement_text)
|
||||
|
||||
return soup
|
||||
|
||||
def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
|
||||
try:
|
||||
if not html:
|
||||
@@ -281,9 +249,6 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
|
||||
|
||||
# Replace all "pre" tags with their inner text
|
||||
body = replace_pre_tags_with_text(body)
|
||||
|
||||
# Replace inline tags with their text content
|
||||
body = replace_inline_tags(body, ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'])
|
||||
|
||||
# Recursively remove empty elements, their parent elements, and elements with word count below threshold
|
||||
def remove_empty_and_low_word_count_elements(node, word_count_threshold):
|
||||
|
||||
@@ -192,6 +192,64 @@ def multiple_scrip(crawler):
|
||||
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def using_crawler_hooks(crawler):
|
||||
# Example usage of the hooks for authentication and setting a cookie
|
||||
def on_driver_created(driver):
|
||||
print("[HOOK] on_driver_created")
|
||||
# Example customization: maximize the window
|
||||
driver.maximize_window()
|
||||
|
||||
# Example customization: logging in to a hypothetical website
|
||||
driver.get('https://example.com/login')
|
||||
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.NAME, 'username'))
|
||||
)
|
||||
driver.find_element(By.NAME, 'username').send_keys('testuser')
|
||||
driver.find_element(By.NAME, 'password').send_keys('password123')
|
||||
driver.find_element(By.NAME, 'login').click()
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.ID, 'welcome'))
|
||||
)
|
||||
# Add a custom cookie
|
||||
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
|
||||
return driver
|
||||
|
||||
|
||||
def before_get_url(driver):
|
||||
print("[HOOK] before_get_url")
|
||||
# Example customization: add a custom header
|
||||
# Enable Network domain for sending headers
|
||||
driver.execute_cdp_cmd('Network.enable', {})
|
||||
# Add a custom header
|
||||
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
|
||||
return driver
|
||||
|
||||
def after_get_url(driver):
|
||||
print("[HOOK] after_get_url")
|
||||
# Example customization: log the URL
|
||||
print(driver.current_url)
|
||||
return driver
|
||||
|
||||
def before_return_html(driver, html):
|
||||
print("[HOOK] before_return_html")
|
||||
# Example customization: log the HTML
|
||||
print(len(html))
|
||||
return driver
|
||||
|
||||
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||
|
||||
crawler.set_hook('on_driver_created', on_driver_created)
|
||||
crawler.set_hook('before_get_url', before_get_url)
|
||||
crawler.set_hook('after_get_url', after_get_url)
|
||||
crawler.set_hook('before_return_html', before_return_html)
|
||||
|
||||
result = crawler.run(url="https://example.com")
|
||||
|
||||
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||
print_result(result= result)
|
||||
|
||||
def main():
|
||||
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
||||
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
<header class="bg-zinc-950 text-lime-500 py-4 flex">
|
||||
|
||||
<div class="mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.5</h1>
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.4</h1>
|
||||
</div>
|
||||
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
|
||||
<span>📊 Total Website Processed</span>
|
||||
|
||||
2
setup.py
2
setup.py
@@ -26,7 +26,7 @@ class CustomInstallCommand(install):
|
||||
|
||||
setup(
|
||||
name="Crawl4AI",
|
||||
version="0.2.5",
|
||||
version="0.2.4",
|
||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||
long_description=open("README.md").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
|
||||
Reference in New Issue
Block a user