This commit is contained in:
Unclecode
2024-06-18 12:02:29 +00:00
9 changed files with 147 additions and 28 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.5 MiB

3
.gitignore vendored
View File

@@ -179,3 +179,6 @@ docs/examples/.chainlit/
docs/examples/.chainlit/*
.chainlit/config.toml
.chainlit/translations/en-US.json
local/
.files/

View File

@@ -1,4 +1,15 @@
# Changelog
## TODO:
- User agent: "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36",
## [0.2.5] - 2024-06-18
### Added
- Added five important hooks to the crawler:
- on_driver_created: Called when the driver is ready for initializations.
- before_get_url: Called right before Selenium fetches the URL.
- after_get_url: Called after Selenium fetches the URL.
- before_return_html: Called when the data is parsed and ready.
- on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
- Added an example in `quickstart.py` in the example folder under the docs.
## [0.2.4] - 2024-06-17
### Fixed
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs

View File

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.3 🕷️🤖
# Crawl4AI v0.2.5 🕷️🤖
[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
@@ -8,10 +8,23 @@
Crawl4AI has one clear task: to simplify crawling and extract useful information from web pages, making it accessible for large language models (LLMs) and AI applications. 🆓🌐
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
- Use as REST API: Check [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
- Use as Python library: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
## Recent Changes
### v0.2.5
- 🌟 Added six important hooks to the crawler:
- 🟢 on_driver_created: Called when the driver is ready for initializations.
- 🔵 before_get_url: Called right before Selenium fetches the URL.
- 🟣 after_get_url: Called after Selenium fetches the URL.
- 🟠 before_return_html: Called when the data is parsed and ready.
- 🟡 on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
- 📄 Added an example in [`quickstart.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py) in the example folder under the docs.
### v0.2.4
- 🐞 Resolve the issue with the long url. (Issue #22)
### v0.2.3
- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
- 🔗 Extrat all external and internal links. Check `result.links`
@@ -50,9 +63,12 @@ data = {
response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally
response_data = response.json()
print(response_data['results'][0].keys())
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
# 'links', 'screenshot', 'markdown', 'extracted_content',
# 'metadata', 'error_message'])
```
To show the simplicity take a look at the first example:
But you muore control then take a look at the first example of using the Python library.
```python
from crawl4ai import WebCrawler
@@ -62,24 +78,7 @@ crawler = WebCrawler()
# Run the crawler with keyword filtering and CSS selector
result = crawler.run(url="https://www.nbcnews.com/business")
print(result) # {url, html, markdown, extracted_content, metadata}
```
If you don't want to install Selenium, you can use the REST API or local server.
```python
import requests
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"word_count_threshold": 10,
"extraction_strategy": "NoExtractionStrategy",
}
response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally
print(response.json())
print(result) # {url, html, cleaned_html, markdown, media, links, extracted_content, metadata, screenshots}
```
Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!

View File

@@ -10,7 +10,7 @@ import logging
import base64
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
from typing import List
from typing import List, Callable
import requests
import os
from pathlib import Path
@@ -48,6 +48,10 @@ class CrawlerStrategy(ABC):
@abstractmethod
def update_user_agent(self, user_agent: str):
pass
@abstractmethod
def set_hook(self, hook_type: str, hook: Callable):
pass
class CloudCrawlerStrategy(CrawlerStrategy):
def __init__(self, use_cached_html = False):
@@ -96,32 +100,74 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.use_cached_html = use_cached_html
self.js_code = js_code
self.verbose = kwargs.get("verbose", False)
# Hooks
self.hooks = {
'on_driver_created': None,
'on_user_agent_updated': None,
'before_get_url': None,
'after_get_url': None,
'before_return_html': None
}
# chromedriver_autoinstaller.install()
import chromedriver_autoinstaller
self.service = Service(chromedriver_autoinstaller.install())
self.service.log_path = "NUL"
self.driver = webdriver.Chrome(service=self.service, options=self.options)
self.driver = self.execute_hook('on_driver_created', self.driver)
def set_hook(self, hook_type: str, hook: Callable):
if hook_type in self.hooks:
self.hooks[hook_type] = hook
else:
raise ValueError(f"Invalid hook type: {hook_type}")
def execute_hook(self, hook_type: str, *args):
hook = self.hooks.get(hook_type)
if hook:
result = hook(*args)
if result is not None:
if isinstance(result, webdriver.Chrome):
return result
else:
raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
# If the hook returns None or there is no hook, return self.driver
return self.driver
def update_user_agent(self, user_agent: str):
self.options.add_argument(f"user-agent={user_agent}")
self.driver.quit()
self.driver = webdriver.Chrome(service=self.service, options=self.options)
self.driver = self.execute_hook('on_user_agent_updated', self.driver)
def set_custom_headers(self, headers: dict):
# Enable Network domain for sending headers
self.driver.execute_cdp_cmd('Network.enable', {})
# Set extra HTTP headers
self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
def crawl(self, url: str) -> str:
# Create md5 hash of the URL
import hashlib
url_hash = hashlib.md5(url.encode()).hexdigest()
if self.use_cached_html:
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
if os.path.exists(cache_file_path):
with open(cache_file_path, "r") as f:
return f.read()
try:
self.driver = self.execute_hook('before_get_url', self.driver)
if self.verbose:
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
self.driver.get(url)
WebDriverWait(self.driver, 10).until(
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
)
self.driver = self.execute_hook('after_get_url', self.driver)
# Execute JS code if provided
if self.js_code and type(self.js_code) == str:
@@ -138,9 +184,10 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
)
html = self.driver.page_source
self.driver = self.execute_hook('before_return_html', self.driver, html)
# Store in cache
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
with open(cache_file_path, "w") as f:
f.write(html)

View File

@@ -192,6 +192,64 @@ def multiple_scrip(crawler):
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result)
def using_crawler_hooks(crawler):
# Example usage of the hooks for authentication and setting a cookie
def on_driver_created(driver):
print("[HOOK] on_driver_created")
# Example customization: maximize the window
driver.maximize_window()
# Example customization: logging in to a hypothetical website
driver.get('https://example.com/login')
from selenium.webdriver.support.ui import WebDriverWait
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.NAME, 'username'))
)
driver.find_element(By.NAME, 'username').send_keys('testuser')
driver.find_element(By.NAME, 'password').send_keys('password123')
driver.find_element(By.NAME, 'login').click()
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, 'welcome'))
)
# Add a custom cookie
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
return driver
def before_get_url(driver):
print("[HOOK] before_get_url")
# Example customization: add a custom header
# Enable Network domain for sending headers
driver.execute_cdp_cmd('Network.enable', {})
# Add a custom header
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
return driver
def after_get_url(driver):
print("[HOOK] after_get_url")
# Example customization: log the URL
print(driver.current_url)
return driver
def before_return_html(driver, html):
print("[HOOK] before_return_html")
# Example customization: log the HTML
print(len(html))
return driver
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
crawler.set_hook('on_driver_created', on_driver_created)
crawler.set_hook('before_get_url', before_get_url)
crawler.set_hook('after_get_url', after_get_url)
crawler.set_hook('before_return_html', before_return_html)
result = crawler.run(url="https://example.com")
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
print_result(result= result)
def main():
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")

View File

@@ -25,7 +25,7 @@
<header class="bg-zinc-950 text-lime-500 py-4 flex">
<div class="mx-auto px-4">
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.3</h1>
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.5</h1>
</div>
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
<span>📊 Total Website Processed</span>

View File

@@ -18,3 +18,4 @@ chromedriver-autoinstaller
torch
onnxruntime
tokenizers
pillow

View File

@@ -26,7 +26,7 @@ class CustomInstallCommand(install):
setup(
name="Crawl4AI",
version="0.2.3",
version="0.2.5",
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
long_description=open("README.md").read(),
long_description_content_type="text/markdown",