## [v0.2.74] - 2024-07-08

A slew of exciting updates to improve the crawler's stability and robustness! 🎉

- 💻 **UTF encoding fix**: Resolved the Windows \"charmap\" error by adding UTF encoding.
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
This commit is contained in:
unclecode
2024-07-08 16:33:25 +08:00
parent 3ff2a0d0e7
commit 4d283ab386
18 changed files with 142 additions and 77 deletions

View File

@@ -1,5 +1,14 @@
# Changelog
## [v0.2.74] - 2024-07-08
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
- 💻 **UTF encoding fix**: Resolved the Windows \"charmap\" error by adding UTF encoding.
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
## [v0.2.73] - 2024-07-03
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.

View File

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.73 🕷️🤖
# Crawl4AI v0.2.74 🕷️🤖
[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)

View File

@@ -3,6 +3,7 @@ import re
from collections import Counter
import string
from .model_loader import load_nltk_punkt
from .utils import *
# Define the abstract base class for chunking strategies
class ChunkingStrategy(ABC):

View File

@@ -8,6 +8,7 @@ from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from urllib3.exceptions import MaxRetryError
from .config import *
import logging, time
@@ -18,7 +19,7 @@ from typing import List, Callable
import requests
import os
from pathlib import Path
from .utils import wrap_text
from .utils import *
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
logger.setLevel(logging.WARNING)
@@ -73,7 +74,7 @@ class CloudCrawlerStrategy(CrawlerStrategy):
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
response = response.json()
html = response["results"][0]["html"]
return html
return sanitize_input_encode(html)
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
@@ -200,7 +201,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
if os.path.exists(cache_file_path):
with open(cache_file_path, "r") as f:
return f.read()
return sanitize_input_encode(f.read())
try:
self.driver = self.execute_hook('before_get_url', self.driver)
@@ -214,11 +215,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
WebDriverWait(self.driver, 10).until(
EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
)
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
html = self._ensure_page_load() # self.driver.page_source
self.driver = self.execute_hook('after_get_url', self.driver)
html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source
can_not_be_done_headless = False # Look at my creativity for naming variables
# TODO: Very ugly way for now but it works
if not kwargs.get('bypass_headless', False) and html == "<html><head></head><body></body></html>":
# TODO: Very ugly approach, but promise to change it!
if kwargs.get('bypass_headless', False) or html == "<html><head></head><body></body></html>":
print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
can_not_be_done_headless = True
options = Options()
@@ -227,11 +232,10 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
options.add_argument("--window-size=5,5")
driver = webdriver.Chrome(service=self.service, options=options)
driver.get(url)
html = driver.page_source
self.driver = self.execute_hook('after_get_url', driver)
html = sanitize_input_encode(driver.page_source)
driver.quit()
self.driver = self.execute_hook('after_get_url', self.driver)
# Execute JS code if provided
if self.js_code and type(self.js_code) == str:
self.driver.execute_script(self.js_code)
@@ -247,12 +251,12 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
)
if not can_not_be_done_headless:
html = self.driver.page_source
html = sanitize_input_encode(self.driver.page_source)
self.driver = self.execute_hook('before_return_html', self.driver, html)
# Store in cache
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
with open(cache_file_path, "w") as f:
with open(cache_file_path, "w", encoding="utf-8") as f:
f.write(html)
if self.verbose:
@@ -261,16 +265,16 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
return html
except InvalidArgumentException:
if not hasattr(e, 'msg'):
e.msg = str(e)
e.msg = sanitize_input_encode(str(e))
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
except WebDriverException as e:
# If e does nlt have msg attribute create it and set it to str(e)
if not hasattr(e, 'msg'):
e.msg = str(e)
e.msg = sanitize_input_encode(str(e))
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
except Exception as e:
if not hasattr(e, 'msg'):
e.msg = str(e)
e.msg = sanitize_input_encode(str(e))
raise Exception(f"Failed to crawl {url}: {e.msg}")
def take_screenshot(self) -> str:
@@ -299,7 +303,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
return img_base64
except Exception as e:
error_message = f"Failed to take screenshot: {str(e)}"
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
print(error_message)
# Generate an image with black background

View File

@@ -20,7 +20,7 @@ def init_db():
extracted_content TEXT,
success BOOLEAN,
media TEXT DEFAULT "{}",
link TEXT DEFAULT "{}",
links TEXT DEFAULT "{}",
metadata TEXT DEFAULT "{}",
screenshot TEXT DEFAULT ""
)
@@ -127,6 +127,9 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
print(f"Error updating existing records: {e}")
if __name__ == "__main__":
init_db() # Initialize the database if not already initialized
alter_db_add_screenshot("metadata") # Add the new column to the table
update_existing_records("metadata") # Update existing records to set the new column to an empty string
# Delete the existing database file
if os.path.exists(DB_PATH):
os.remove(DB_PATH)
init_db()
# alter_db_add_screenshot("COL_NAME")

View File

@@ -116,7 +116,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
for block in blocks:
block['error'] = False
except Exception as e:
print("Error extracting blocks:", str(e))
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
blocks = parsed
if unparsed:
@@ -192,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
# Sequential processing with a delay
for ix, section in enumerate(merged_sections):
extract_func = partial(self.extract, url)
extracted_content.extend(extract_func(ix, section))
extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
time.sleep(0.5) # 500 ms delay between each processing
else:
# Parallel processing using ThreadPoolExecutor
@@ -202,10 +201,21 @@ class LLMExtractionStrategy(ExtractionStrategy):
with ThreadPoolExecutor(max_workers=4) as executor:
extract_func = partial(self.extract, url)
futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)]
futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
for future in as_completed(futures):
extracted_content.extend(future.result())
try:
extracted_content.extend(future.result())
except Exception as e:
if self.verbose:
print(f"Error in thread execution: {e}")
# Add error information to extracted_content
extracted_content.append({
"index": 0,
"error": True,
"tags": ["error"],
"content": str(e)
})
return extracted_content

View File

@@ -96,6 +96,16 @@ def sanitize_html(html):
return sanitized_html
def sanitize_input_encode(text: str) -> str:
"""Sanitize input to handle potential encoding issues."""
try:
# Attempt to encode and decode as UTF-8 to handle potential encoding issues
return text.encode('utf-8', errors='ignore').decode('utf-8')
except UnicodeEncodeError as e:
print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
# Fall back to ASCII if UTF-8 fails
return text.encode('ascii', errors='ignore').decode('ascii')
def escape_json_string(s):
"""
Escapes characters in a string to be JSON safe.
@@ -664,7 +674,6 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
for block in blocks:
block['error'] = False
except Exception as e:
print("Error extracting blocks:", str(e))
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
blocks = parsed
# Append all unparsed segments as onr error block and content is list of unparsed segments
@@ -710,7 +719,6 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
blocks = json.loads(blocks)
except Exception as e:
print("Error extracting blocks:", str(e))
blocks = [{
"index": 0,
"tags": ["error"],

View File

@@ -155,8 +155,8 @@ class WebCrawler:
return None
if cached:
html = cached[1]
extracted_content = cached[4]
html = sanitize_input_encode(cached[1])
extracted_content = sanitize_input_encode(cached[4])
if screenshot:
screenshot_data = cached[9]
if not screenshot_data:
@@ -166,7 +166,7 @@ class WebCrawler:
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
t1 = time.time()
html = self.crawler_strategy.crawl(url, **kwargs)
html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
t2 = time.time()
if verbose:
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
@@ -213,8 +213,8 @@ class WebCrawler:
except InvalidCSSSelectorError as e:
raise ValueError(str(e))
cleaned_html = result.get("cleaned_html", "")
markdown = result.get("markdown", "")
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))
media = result.get("media", [])
links = result.get("links", [])
metadata = result.get("metadata", {})

View File

@@ -36,5 +36,5 @@ model_fees = json.loads(result.extracted_content)
print(len(model_fees))
with open(".data/data.json", "w") as f:
with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content)

View File

@@ -249,15 +249,40 @@ def using_crawler_hooks(crawler):
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
crawler.set_hook('on_driver_created', on_driver_created)
crawler.set_hook('before_get_url', before_get_url)
crawler.set_hook('after_get_url', after_get_url)
crawler.set_hook('before_return_html', before_return_html)
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
crawler_strategy.set_hook('on_driver_created', on_driver_created)
crawler_strategy.set_hook('before_get_url', before_get_url)
crawler_strategy.set_hook('after_get_url', after_get_url)
crawler_strategy.set_hook('before_return_html', before_return_html)
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
crawler.warmup()
result = crawler.run(url="https://example.com")
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
print_result(result= result)
def using_crawler_hooks_dleay_example(crawler):
def delay(driver):
print("Delaying for 5 seconds...")
time.sleep(5)
print("Resuming...")
def create_crawler():
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
crawler_strategy.set_hook('after_get_url', delay)
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
crawler.warmup()
return crawler
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")
crawler = create_crawler()
result = crawler.run(url="https://google.com", bypass_cache=True)
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
print_result(result)
def main():
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")

View File

@@ -42,5 +42,5 @@ page_summary = json.loads(result.extracted_content)
print(page_summary)
with open(".data/page_summary.json", "w") as f:
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content)

View File

@@ -1,5 +1,13 @@
# Changelog
## v0.2.74 - 2024-07-08
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
- 💻 **UTF encoding fix**: Resolved the Windows \"charmap\" error by adding UTF encoding.
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
## [v0.2.73] - 2024-07-03
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.

View File

@@ -14,6 +14,9 @@ Let's see how we can customize the crawler using hooks! In this example, we'll:
### Hook Definitions
```python
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.crawler_strategy import *
def on_driver_created(driver):
print("[HOOK] on_driver_created")
# Example customization: maximize the window
@@ -66,12 +69,13 @@ def before_return_html(driver, html):
```python
print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
crawler = WebCrawler(verbose=True)
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
crawler_strategy.set_hook('on_driver_created', on_driver_created)
crawler_strategy.set_hook('before_get_url', before_get_url)
crawler_strategy.set_hook('after_get_url', after_get_url)
crawler_strategy.set_hook('before_return_html', before_return_html)
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
crawler.warmup()
crawler.set_hook('on_driver_created', on_driver_created)
crawler.set_hook('before_get_url', before_get_url)
crawler.set_hook('after_get_url', after_get_url)
crawler.set_hook('before_return_html', before_return_html)
result = crawler.run(url="https://example.com")

View File

@@ -45,7 +45,7 @@ model_fees = json.loads(result.extracted_content)
print(len(model_fees))
with open(".data/data.json", "w") as f:
with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content)
```
@@ -71,7 +71,7 @@ model_fees = json.loads(result.extracted_content)
print(len(model_fees))
with open(".data/data.json", "w") as f:
with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content)
```

View File

@@ -91,7 +91,7 @@ This example demonstrates how to use `Crawl4AI` to extract a summary from a web
Save the extracted data to a file for further use.
```python
with open(".data/page_summary.json", "w") as f:
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content)
```

View File

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.73
# Crawl4AI v0.2.74
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.

View File

@@ -176,41 +176,29 @@ print(f"JavaScript Code (Load More button) result: {result}")
Let's see how we can customize the crawler using hooks!
```python
def on_driver_created(driver):
print("[HOOK] on_driver_created")
driver.maximize_window()
driver.get('https://example.com/login')
driver.find_element(By.NAME, 'username').send_keys('testuser')
driver.find_element(By.NAME, 'password').send_keys('password123')
driver.find_element(By.NAME, 'login').click()
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
return driver
import time
def before_get_url(driver):
print("[HOOK] before_get_url")
driver.execute_cdp_cmd('Network.enable', {})
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
return driver
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.crawler_strategy import *
def after_get_url(driver):
print("[HOOK] after_get_url")
print(driver.current_url)
return driver
def delay(driver):
print("Delaying for 5 seconds...")
time.sleep(5)
print("Resuming...")
def create_crawler():
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
crawler_strategy.set_hook('after_get_url', delay)
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
crawler.warmup()
return crawler
def before_return_html(driver, html):
print("[HOOK] before_return_html")
print(len(html))
return driver
crawler.set_hook('on_driver_created', on_driver_created)
crawler.set_hook('before_get_url', before_get_url)
crawler.set_hook('after_get_url', after_get_url)
crawler.set_hook('before_return_html', before_return_html)
result = crawler.run(url="https://example.com")
print(f"Crawler Hooks result: {result}")
crawler = create_crawler()
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
```
check [Hooks](examples/hooks_auth.md) for more examples.
## Congratulations! 🎉
You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️

View File

@@ -5,10 +5,15 @@ import subprocess
from setuptools.command.install import install
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
# If the folder already exists, remove the cache folder
crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
if os.path.exists(f"{crawl4ai_folder}/cache"):
subprocess.run(["rm", "-rf", f"{crawl4ai_folder}/cache"])
os.makedirs(crawl4ai_folder, exist_ok=True)
os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True)
# Read the requirements from requirements.txt
with open("requirements.txt") as f:
requirements = f.read().splitlines()
@@ -20,7 +25,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran
setup(
name="Crawl4AI",
version="0.2.73",
version="0.2.74",
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",