diff --git a/.files/screenshot.png b/.files/screenshot.png deleted file mode 100644 index c8005487..00000000 Binary files a/.files/screenshot.png and /dev/null differ diff --git a/.gitignore b/.gitignore index 407c5cdb..3330f2e6 100644 --- a/.gitignore +++ b/.gitignore @@ -179,3 +179,6 @@ docs/examples/.chainlit/ docs/examples/.chainlit/* .chainlit/config.toml .chainlit/translations/en-US.json + +local/ +.files/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index d4504e40..eb854b1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,15 @@ # Changelog -## TODO: -- User agent: "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36", \ No newline at end of file +## [0.2.5] - 2024-06-18 +### Added +- Added five important hooks to the crawler: + - on_driver_created: Called when the driver is ready for initializations. + - before_get_url: Called right before Selenium fetches the URL. + - after_get_url: Called after Selenium fetches the URL. + - before_return_html: Called when the data is parsed and ready. + - on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize. +- Added an example in `quickstart.py` in the example folder under the docs. 
+ +## [0.2.4] - 2024-06-17 +### Fixed +- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs diff --git a/README.md b/README.md index 10d81ed9..b0b12510 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.3 πŸ•·οΈπŸ€– +# Crawl4AI v0.2.5 πŸ•·οΈπŸ€– [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) @@ -8,10 +8,23 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information from web pages, making it accessible for large language models (LLMs) and AI applications. πŸ†“πŸŒ -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk) +- Use as REST API: Check [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing) +- Use as Python library: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk) ## Recent Changes +### v0.2.5 +- 🌟 Added five important hooks to the crawler: + - 🟒 on_driver_created: Called when the driver is ready for initializations. + - πŸ”΅ before_get_url: Called right before Selenium fetches the URL. + - 🟣 after_get_url: Called after Selenium fetches the URL. + - 🟠 before_return_html: Called when the data is parsed and ready. + - 🟑 on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize. +- πŸ“„ Added an example in [`quickstart.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py) in the example folder under the docs. + +### v0.2.4 +- 🐞 Resolved the issue with long URLs. 
(Issue #22) + ### v0.2.3 - 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media` - πŸ”— Extrat all external and internal links. Check `result.links` @@ -50,9 +63,12 @@ data = { response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally response_data = response.json() print(response_data['results'][0].keys()) +# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media', +# 'links', 'screenshot', 'markdown', 'extracted_content', +# 'metadata', 'error_message']) ``` -To show the simplicity take a look at the first example: +But if you want more control, then take a look at the first example of using the Python library. ```python from crawl4ai import WebCrawler @@ -62,24 +78,7 @@ crawler = WebCrawler() # Run the crawler with keyword filtering and CSS selector result = crawler.run(url="https://www.nbcnews.com/business") -print(result) # {url, html, markdown, extracted_content, metadata} -``` - -If you don't want to install Selenium, you can use the REST API or local server. - -```python -import requests - -data = { - "urls": [ - "https://www.nbcnews.com/business" - ], - "word_count_threshold": 10, - "extraction_strategy": "NoExtractionStrategy", -} - -response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally -print(response.json()) +print(result) # {url, html, cleaned_html, markdown, media, links, extracted_content, metadata, screenshot} ``` Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific contentβ€”all in one go! 
diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 1f258613..ecf0863a 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -10,7 +10,7 @@ import logging import base64 from PIL import Image, ImageDraw, ImageFont from io import BytesIO -from typing import List +from typing import List, Callable import requests import os from pathlib import Path @@ -48,6 +48,10 @@ class CrawlerStrategy(ABC): @abstractmethod def update_user_agent(self, user_agent: str): pass + + @abstractmethod + def set_hook(self, hook_type: str, hook: Callable): + pass class CloudCrawlerStrategy(CrawlerStrategy): def __init__(self, use_cached_html = False): @@ -96,32 +100,74 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): self.use_cached_html = use_cached_html self.js_code = js_code self.verbose = kwargs.get("verbose", False) + + # Hooks + self.hooks = { + 'on_driver_created': None, + 'on_user_agent_updated': None, + 'before_get_url': None, + 'after_get_url': None, + 'before_return_html': None + } # chromedriver_autoinstaller.install() import chromedriver_autoinstaller self.service = Service(chromedriver_autoinstaller.install()) self.service.log_path = "NUL" self.driver = webdriver.Chrome(service=self.service, options=self.options) + self.driver = self.execute_hook('on_driver_created', self.driver) + + def set_hook(self, hook_type: str, hook: Callable): + if hook_type in self.hooks: + self.hooks[hook_type] = hook + else: + raise ValueError(f"Invalid hook type: {hook_type}") + + def execute_hook(self, hook_type: str, *args): + hook = self.hooks.get(hook_type) + if hook: + result = hook(*args) + if result is not None: + if isinstance(result, webdriver.Chrome): + return result + else: + raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.") + # If the hook returns None or there is no hook, return self.driver + return self.driver def update_user_agent(self, user_agent: str): 
self.options.add_argument(f"user-agent={user_agent}") self.driver.quit() self.driver = webdriver.Chrome(service=self.service, options=self.options) + self.driver = self.execute_hook('on_user_agent_updated', self.driver) + + def set_custom_headers(self, headers: dict): + # Enable Network domain for sending headers + self.driver.execute_cdp_cmd('Network.enable', {}) + # Set extra HTTP headers + self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers}) + def crawl(self, url: str) -> str: + # Create md5 hash of the URL + import hashlib + url_hash = hashlib.md5(url.encode()).hexdigest() + if self.use_cached_html: - cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_")) + cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) if os.path.exists(cache_file_path): with open(cache_file_path, "r") as f: return f.read() try: + self.driver = self.execute_hook('before_get_url', self.driver) if self.verbose: print(f"[LOG] πŸ•ΈοΈ Crawling {url} using LocalSeleniumCrawlerStrategy...") self.driver.get(url) WebDriverWait(self.driver, 10).until( EC.presence_of_all_elements_located((By.TAG_NAME, "html")) ) + self.driver = self.execute_hook('after_get_url', self.driver) # Execute JS code if provided if self.js_code and type(self.js_code) == str: @@ -138,9 +184,10 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): ) html = self.driver.page_source + self.driver = self.execute_hook('before_return_html', self.driver, html) # Store in cache - cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_")) + cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) with open(cache_file_path, "w") as f: f.write(html) diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index 44b0c0e9..a6139f0a 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -192,6 +192,64 @@ def multiple_scrip(crawler): cprint("[LOG] πŸ“¦ [bold 
yellow]JavaScript Code (Load More button) result:[/bold yellow]") print_result(result) +def using_crawler_hooks(crawler): + # Example usage of the hooks for authentication and setting a cookie + def on_driver_created(driver): + print("[HOOK] on_driver_created") + # Example customization: maximize the window + driver.maximize_window() + + # Example customization: logging in to a hypothetical website + driver.get('https://example.com/login') + + from selenium.webdriver.support.ui import WebDriverWait + WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.NAME, 'username')) + ) + driver.find_element(By.NAME, 'username').send_keys('testuser') + driver.find_element(By.NAME, 'password').send_keys('password123') + driver.find_element(By.NAME, 'login').click() + WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.ID, 'welcome')) + ) + # Add a custom cookie + driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'}) + return driver + + + def before_get_url(driver): + print("[HOOK] before_get_url") + # Example customization: add a custom header + # Enable Network domain for sending headers + driver.execute_cdp_cmd('Network.enable', {}) + # Add a custom header + driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}}) + return driver + + def after_get_url(driver): + print("[HOOK] after_get_url") + # Example customization: log the URL + print(driver.current_url) + return driver + + def before_return_html(driver, html): + print("[HOOK] before_return_html") + # Example customization: log the HTML + print(len(html)) + return driver + + cprint("\nπŸ”— [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True) + + crawler.set_hook('on_driver_created', on_driver_created) + crawler.set_hook('before_get_url', before_get_url) + crawler.set_hook('after_get_url', after_get_url) + crawler.set_hook('before_return_html', before_return_html) + + result = 
crawler.run(url="https://example.com") + + cprint("[LOG] πŸ“¦ [bold yellow]Crawler Hooks result:[/bold yellow]") + print_result(result= result) + def main(): cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]") cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]") diff --git a/pages/index.html b/pages/index.html index 43d4394a..c9e2b54f 100644 --- a/pages/index.html +++ b/pages/index.html @@ -25,7 +25,7 @@
-

πŸ”₯πŸ•·οΈ Crawl4AI: Web Data for your Thoughts v0.2.3

+

πŸ”₯πŸ•·οΈ Crawl4AI: Web Data for your Thoughts v0.2.5

πŸ“Š Total Website Processed diff --git a/requirements.txt b/requirements.txt index f4fdce65..20f7a0e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ chromedriver-autoinstaller torch onnxruntime tokenizers +pillow \ No newline at end of file diff --git a/setup.py b/setup.py index 8fb10997..2d05e206 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ class CustomInstallCommand(install): setup( name="Crawl4AI", - version="0.2.3", + version="0.2.5", description="πŸ”₯πŸ•·οΈ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper", long_description=open("README.md").read(), long_description_content_type="text/markdown",