## [v0.2.74] - 2024-07-08
A slew of exciting updates to improve the crawler's stability and robustness! 🎉 - 💻 **UTF encoding fix**: Resolved the Windows "charmap" error by adding UTF encoding. - 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy. - 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy. - 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
This commit is contained in:
@@ -1,5 +1,14 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [v0.2.74] - 2024-07-08
|
||||||
|
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
||||||
|
|
||||||
|
- 💻 **UTF encoding fix**: Resolved the Windows \"charmap\" error by adding UTF encoding.
|
||||||
|
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
|
||||||
|
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
|
||||||
|
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
|
||||||
|
|
||||||
|
|
||||||
## [v0.2.73] - 2024-07-03
|
## [v0.2.73] - 2024-07-03
|
||||||
|
|
||||||
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# Crawl4AI v0.2.73 🕷️🤖
|
# Crawl4AI v0.2.74 🕷️🤖
|
||||||
|
|
||||||
[](https://github.com/unclecode/crawl4ai/stargazers)
|
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||||
[](https://github.com/unclecode/crawl4ai/network/members)
|
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import re
|
|||||||
from collections import Counter
|
from collections import Counter
|
||||||
import string
|
import string
|
||||||
from .model_loader import load_nltk_punkt
|
from .model_loader import load_nltk_punkt
|
||||||
|
from .utils import *
|
||||||
|
|
||||||
# Define the abstract base class for chunking strategies
|
# Define the abstract base class for chunking strategies
|
||||||
class ChunkingStrategy(ABC):
|
class ChunkingStrategy(ABC):
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from selenium.webdriver.chrome.options import Options
|
|||||||
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
||||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
from urllib3.exceptions import MaxRetryError
|
||||||
|
|
||||||
from .config import *
|
from .config import *
|
||||||
import logging, time
|
import logging, time
|
||||||
@@ -18,7 +19,7 @@ from typing import List, Callable
|
|||||||
import requests
|
import requests
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from .utils import wrap_text
|
from .utils import *
|
||||||
|
|
||||||
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
|
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
|
||||||
logger.setLevel(logging.WARNING)
|
logger.setLevel(logging.WARNING)
|
||||||
@@ -73,7 +74,7 @@ class CloudCrawlerStrategy(CrawlerStrategy):
|
|||||||
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
|
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
|
||||||
response = response.json()
|
response = response.json()
|
||||||
html = response["results"][0]["html"]
|
html = response["results"][0]["html"]
|
||||||
return html
|
return sanitize_input_encode(html)
|
||||||
|
|
||||||
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||||
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
||||||
@@ -200,7 +201,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||||
if os.path.exists(cache_file_path):
|
if os.path.exists(cache_file_path):
|
||||||
with open(cache_file_path, "r") as f:
|
with open(cache_file_path, "r") as f:
|
||||||
return f.read()
|
return sanitize_input_encode(f.read())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.driver = self.execute_hook('before_get_url', self.driver)
|
self.driver = self.execute_hook('before_get_url', self.driver)
|
||||||
@@ -214,11 +215,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
WebDriverWait(self.driver, 10).until(
|
WebDriverWait(self.driver, 10).until(
|
||||||
EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
|
EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
|
||||||
)
|
)
|
||||||
|
|
||||||
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
||||||
html = self._ensure_page_load() # self.driver.page_source
|
|
||||||
|
self.driver = self.execute_hook('after_get_url', self.driver)
|
||||||
|
html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source
|
||||||
can_not_be_done_headless = False # Look at my creativity for naming variables
|
can_not_be_done_headless = False # Look at my creativity for naming variables
|
||||||
# TODO: Very ugly way for now but it works
|
|
||||||
if not kwargs.get('bypass_headless', False) and html == "<html><head></head><body></body></html>":
|
# TODO: Very ugly approach, but promise to change it!
|
||||||
|
if kwargs.get('bypass_headless', False) or html == "<html><head></head><body></body></html>":
|
||||||
print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
|
print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
|
||||||
can_not_be_done_headless = True
|
can_not_be_done_headless = True
|
||||||
options = Options()
|
options = Options()
|
||||||
@@ -227,11 +232,10 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
options.add_argument("--window-size=5,5")
|
options.add_argument("--window-size=5,5")
|
||||||
driver = webdriver.Chrome(service=self.service, options=options)
|
driver = webdriver.Chrome(service=self.service, options=options)
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
html = driver.page_source
|
self.driver = self.execute_hook('after_get_url', driver)
|
||||||
|
html = sanitize_input_encode(driver.page_source)
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
||||||
self.driver = self.execute_hook('after_get_url', self.driver)
|
|
||||||
|
|
||||||
# Execute JS code if provided
|
# Execute JS code if provided
|
||||||
if self.js_code and type(self.js_code) == str:
|
if self.js_code and type(self.js_code) == str:
|
||||||
self.driver.execute_script(self.js_code)
|
self.driver.execute_script(self.js_code)
|
||||||
@@ -247,12 +251,12 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if not can_not_be_done_headless:
|
if not can_not_be_done_headless:
|
||||||
html = self.driver.page_source
|
html = sanitize_input_encode(self.driver.page_source)
|
||||||
self.driver = self.execute_hook('before_return_html', self.driver, html)
|
self.driver = self.execute_hook('before_return_html', self.driver, html)
|
||||||
|
|
||||||
# Store in cache
|
# Store in cache
|
||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||||
with open(cache_file_path, "w") as f:
|
with open(cache_file_path, "w", encoding="utf-8") as f:
|
||||||
f.write(html)
|
f.write(html)
|
||||||
|
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
@@ -261,16 +265,16 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
return html
|
return html
|
||||||
except InvalidArgumentException:
|
except InvalidArgumentException:
|
||||||
if not hasattr(e, 'msg'):
|
if not hasattr(e, 'msg'):
|
||||||
e.msg = str(e)
|
e.msg = sanitize_input_encode(str(e))
|
||||||
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
||||||
except WebDriverException as e:
|
except WebDriverException as e:
|
||||||
# If e does nlt have msg attribute create it and set it to str(e)
|
# If e does nlt have msg attribute create it and set it to str(e)
|
||||||
if not hasattr(e, 'msg'):
|
if not hasattr(e, 'msg'):
|
||||||
e.msg = str(e)
|
e.msg = sanitize_input_encode(str(e))
|
||||||
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
|
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if not hasattr(e, 'msg'):
|
if not hasattr(e, 'msg'):
|
||||||
e.msg = str(e)
|
e.msg = sanitize_input_encode(str(e))
|
||||||
raise Exception(f"Failed to crawl {url}: {e.msg}")
|
raise Exception(f"Failed to crawl {url}: {e.msg}")
|
||||||
|
|
||||||
def take_screenshot(self) -> str:
|
def take_screenshot(self) -> str:
|
||||||
@@ -299,7 +303,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
return img_base64
|
return img_base64
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_message = f"Failed to take screenshot: {str(e)}"
|
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
|
||||||
print(error_message)
|
print(error_message)
|
||||||
|
|
||||||
# Generate an image with black background
|
# Generate an image with black background
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ def init_db():
|
|||||||
extracted_content TEXT,
|
extracted_content TEXT,
|
||||||
success BOOLEAN,
|
success BOOLEAN,
|
||||||
media TEXT DEFAULT "{}",
|
media TEXT DEFAULT "{}",
|
||||||
link TEXT DEFAULT "{}",
|
links TEXT DEFAULT "{}",
|
||||||
metadata TEXT DEFAULT "{}",
|
metadata TEXT DEFAULT "{}",
|
||||||
screenshot TEXT DEFAULT ""
|
screenshot TEXT DEFAULT ""
|
||||||
)
|
)
|
||||||
@@ -127,6 +127,9 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
|
|||||||
print(f"Error updating existing records: {e}")
|
print(f"Error updating existing records: {e}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
init_db() # Initialize the database if not already initialized
|
# Delete the existing database file
|
||||||
alter_db_add_screenshot("metadata") # Add the new column to the table
|
if os.path.exists(DB_PATH):
|
||||||
update_existing_records("metadata") # Update existing records to set the new column to an empty string
|
os.remove(DB_PATH)
|
||||||
|
init_db()
|
||||||
|
# alter_db_add_screenshot("COL_NAME")
|
||||||
|
|
||||||
|
|||||||
@@ -116,7 +116,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
for block in blocks:
|
for block in blocks:
|
||||||
block['error'] = False
|
block['error'] = False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Error extracting blocks:", str(e))
|
|
||||||
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
||||||
blocks = parsed
|
blocks = parsed
|
||||||
if unparsed:
|
if unparsed:
|
||||||
@@ -192,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
# Sequential processing with a delay
|
# Sequential processing with a delay
|
||||||
for ix, section in enumerate(merged_sections):
|
for ix, section in enumerate(merged_sections):
|
||||||
extract_func = partial(self.extract, url)
|
extract_func = partial(self.extract, url)
|
||||||
extracted_content.extend(extract_func(ix, section))
|
extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
|
||||||
time.sleep(0.5) # 500 ms delay between each processing
|
time.sleep(0.5) # 500 ms delay between each processing
|
||||||
else:
|
else:
|
||||||
# Parallel processing using ThreadPoolExecutor
|
# Parallel processing using ThreadPoolExecutor
|
||||||
@@ -202,10 +201,21 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=4) as executor:
|
with ThreadPoolExecutor(max_workers=4) as executor:
|
||||||
extract_func = partial(self.extract, url)
|
extract_func = partial(self.extract, url)
|
||||||
futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)]
|
futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
|
||||||
|
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
extracted_content.extend(future.result())
|
try:
|
||||||
|
extracted_content.extend(future.result())
|
||||||
|
except Exception as e:
|
||||||
|
if self.verbose:
|
||||||
|
print(f"Error in thread execution: {e}")
|
||||||
|
# Add error information to extracted_content
|
||||||
|
extracted_content.append({
|
||||||
|
"index": 0,
|
||||||
|
"error": True,
|
||||||
|
"tags": ["error"],
|
||||||
|
"content": str(e)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
return extracted_content
|
return extracted_content
|
||||||
|
|||||||
@@ -96,6 +96,16 @@ def sanitize_html(html):
|
|||||||
|
|
||||||
return sanitized_html
|
return sanitized_html
|
||||||
|
|
||||||
|
def sanitize_input_encode(text: str) -> str:
|
||||||
|
"""Sanitize input to handle potential encoding issues."""
|
||||||
|
try:
|
||||||
|
# Attempt to encode and decode as UTF-8 to handle potential encoding issues
|
||||||
|
return text.encode('utf-8', errors='ignore').decode('utf-8')
|
||||||
|
except UnicodeEncodeError as e:
|
||||||
|
print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
|
||||||
|
# Fall back to ASCII if UTF-8 fails
|
||||||
|
return text.encode('ascii', errors='ignore').decode('ascii')
|
||||||
|
|
||||||
def escape_json_string(s):
|
def escape_json_string(s):
|
||||||
"""
|
"""
|
||||||
Escapes characters in a string to be JSON safe.
|
Escapes characters in a string to be JSON safe.
|
||||||
@@ -664,7 +674,6 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
|
|||||||
for block in blocks:
|
for block in blocks:
|
||||||
block['error'] = False
|
block['error'] = False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Error extracting blocks:", str(e))
|
|
||||||
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
||||||
blocks = parsed
|
blocks = parsed
|
||||||
# Append all unparsed segments as onr error block and content is list of unparsed segments
|
# Append all unparsed segments as onr error block and content is list of unparsed segments
|
||||||
@@ -710,7 +719,6 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
|
|||||||
blocks = json.loads(blocks)
|
blocks = json.loads(blocks)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Error extracting blocks:", str(e))
|
|
||||||
blocks = [{
|
blocks = [{
|
||||||
"index": 0,
|
"index": 0,
|
||||||
"tags": ["error"],
|
"tags": ["error"],
|
||||||
|
|||||||
@@ -155,8 +155,8 @@ class WebCrawler:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
if cached:
|
if cached:
|
||||||
html = cached[1]
|
html = sanitize_input_encode(cached[1])
|
||||||
extracted_content = cached[4]
|
extracted_content = sanitize_input_encode(cached[4])
|
||||||
if screenshot:
|
if screenshot:
|
||||||
screenshot_data = cached[9]
|
screenshot_data = cached[9]
|
||||||
if not screenshot_data:
|
if not screenshot_data:
|
||||||
@@ -166,7 +166,7 @@ class WebCrawler:
|
|||||||
if user_agent:
|
if user_agent:
|
||||||
self.crawler_strategy.update_user_agent(user_agent)
|
self.crawler_strategy.update_user_agent(user_agent)
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
html = self.crawler_strategy.crawl(url, **kwargs)
|
html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
|
||||||
t2 = time.time()
|
t2 = time.time()
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
||||||
@@ -213,8 +213,8 @@ class WebCrawler:
|
|||||||
except InvalidCSSSelectorError as e:
|
except InvalidCSSSelectorError as e:
|
||||||
raise ValueError(str(e))
|
raise ValueError(str(e))
|
||||||
|
|
||||||
cleaned_html = result.get("cleaned_html", "")
|
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||||
markdown = result.get("markdown", "")
|
markdown = sanitize_input_encode(result.get("markdown", ""))
|
||||||
media = result.get("media", [])
|
media = result.get("media", [])
|
||||||
links = result.get("links", [])
|
links = result.get("links", [])
|
||||||
metadata = result.get("metadata", {})
|
metadata = result.get("metadata", {})
|
||||||
|
|||||||
@@ -36,5 +36,5 @@ model_fees = json.loads(result.extracted_content)
|
|||||||
|
|
||||||
print(len(model_fees))
|
print(len(model_fees))
|
||||||
|
|
||||||
with open(".data/data.json", "w") as f:
|
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
@@ -249,15 +249,40 @@ def using_crawler_hooks(crawler):
|
|||||||
|
|
||||||
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||||
|
|
||||||
crawler.set_hook('on_driver_created', on_driver_created)
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
crawler.set_hook('before_get_url', before_get_url)
|
crawler_strategy.set_hook('on_driver_created', on_driver_created)
|
||||||
crawler.set_hook('after_get_url', after_get_url)
|
crawler_strategy.set_hook('before_get_url', before_get_url)
|
||||||
crawler.set_hook('before_return_html', before_return_html)
|
crawler_strategy.set_hook('after_get_url', after_get_url)
|
||||||
|
crawler_strategy.set_hook('before_return_html', before_return_html)
|
||||||
|
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
|
crawler.warmup()
|
||||||
result = crawler.run(url="https://example.com")
|
result = crawler.run(url="https://example.com")
|
||||||
|
|
||||||
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||||
print_result(result= result)
|
print_result(result= result)
|
||||||
|
|
||||||
|
def using_crawler_hooks_dleay_example(crawler):
|
||||||
|
def delay(driver):
|
||||||
|
print("Delaying for 5 seconds...")
|
||||||
|
time.sleep(5)
|
||||||
|
print("Resuming...")
|
||||||
|
|
||||||
|
def create_crawler():
|
||||||
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
|
crawler_strategy.set_hook('after_get_url', delay)
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
|
crawler.warmup()
|
||||||
|
return crawler
|
||||||
|
|
||||||
|
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")
|
||||||
|
crawler = create_crawler()
|
||||||
|
result = crawler.run(url="https://google.com", bypass_cache=True)
|
||||||
|
|
||||||
|
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||||
|
print_result(result)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
||||||
|
|||||||
@@ -42,5 +42,5 @@ page_summary = json.loads(result.extracted_content)
|
|||||||
|
|
||||||
print(page_summary)
|
print(page_summary)
|
||||||
|
|
||||||
with open(".data/page_summary.json", "w") as f:
|
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
|
|||||||
@@ -1,5 +1,13 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [v0.2.74] - 2024-07-08
|
||||||
|
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
||||||
|
|
||||||
|
- 💻 **UTF encoding fix**: Resolved the Windows \"charmap\" error by adding UTF encoding.
|
||||||
|
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
|
||||||
|
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
|
||||||
|
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
|
||||||
|
|
||||||
## [v0.2.73] - 2024-07-03
|
## [v0.2.73] - 2024-07-03
|
||||||
|
|
||||||
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
||||||
|
|||||||
@@ -14,6 +14,9 @@ Let's see how we can customize the crawler using hooks! In this example, we'll:
|
|||||||
### Hook Definitions
|
### Hook Definitions
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
|
from crawl4ai.crawler_strategy import *
|
||||||
|
|
||||||
def on_driver_created(driver):
|
def on_driver_created(driver):
|
||||||
print("[HOOK] on_driver_created")
|
print("[HOOK] on_driver_created")
|
||||||
# Example customization: maximize the window
|
# Example customization: maximize the window
|
||||||
@@ -66,12 +69,13 @@ def before_return_html(driver, html):
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||||
crawler = WebCrawler(verbose=True)
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
|
crawler_strategy.set_hook('on_driver_created', on_driver_created)
|
||||||
|
crawler_strategy.set_hook('before_get_url', before_get_url)
|
||||||
|
crawler_strategy.set_hook('after_get_url', after_get_url)
|
||||||
|
crawler_strategy.set_hook('before_return_html', before_return_html)
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
crawler.warmup()
|
crawler.warmup()
|
||||||
crawler.set_hook('on_driver_created', on_driver_created)
|
|
||||||
crawler.set_hook('before_get_url', before_get_url)
|
|
||||||
crawler.set_hook('after_get_url', after_get_url)
|
|
||||||
crawler.set_hook('before_return_html', before_return_html)
|
|
||||||
|
|
||||||
result = crawler.run(url="https://example.com")
|
result = crawler.run(url="https://example.com")
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ model_fees = json.loads(result.extracted_content)
|
|||||||
|
|
||||||
print(len(model_fees))
|
print(len(model_fees))
|
||||||
|
|
||||||
with open(".data/data.json", "w") as f:
|
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -71,7 +71,7 @@ model_fees = json.loads(result.extracted_content)
|
|||||||
|
|
||||||
print(len(model_fees))
|
print(len(model_fees))
|
||||||
|
|
||||||
with open(".data/data.json", "w") as f:
|
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -91,7 +91,7 @@ This example demonstrates how to use `Crawl4AI` to extract a summary from a web
|
|||||||
Save the extracted data to a file for further use.
|
Save the extracted data to a file for further use.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
with open(".data/page_summary.json", "w") as f:
|
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# Crawl4AI v0.2.73
|
# Crawl4AI v0.2.74
|
||||||
|
|
||||||
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
|
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
|
||||||
|
|
||||||
|
|||||||
@@ -176,41 +176,29 @@ print(f"JavaScript Code (Load More button) result: {result}")
|
|||||||
Let's see how we can customize the crawler using hooks!
|
Let's see how we can customize the crawler using hooks!
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def on_driver_created(driver):
|
import time
|
||||||
print("[HOOK] on_driver_created")
|
|
||||||
driver.maximize_window()
|
|
||||||
driver.get('https://example.com/login')
|
|
||||||
driver.find_element(By.NAME, 'username').send_keys('testuser')
|
|
||||||
driver.find_element(By.NAME, 'password').send_keys('password123')
|
|
||||||
driver.find_element(By.NAME, 'login').click()
|
|
||||||
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
|
|
||||||
return driver
|
|
||||||
|
|
||||||
def before_get_url(driver):
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
print("[HOOK] before_get_url")
|
from crawl4ai.crawler_strategy import *
|
||||||
driver.execute_cdp_cmd('Network.enable', {})
|
|
||||||
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
|
|
||||||
return driver
|
|
||||||
|
|
||||||
def after_get_url(driver):
|
def delay(driver):
|
||||||
print("[HOOK] after_get_url")
|
print("Delaying for 5 seconds...")
|
||||||
print(driver.current_url)
|
time.sleep(5)
|
||||||
return driver
|
print("Resuming...")
|
||||||
|
|
||||||
|
def create_crawler():
|
||||||
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
|
crawler_strategy.set_hook('after_get_url', delay)
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
|
crawler.warmup()
|
||||||
|
return crawler
|
||||||
|
|
||||||
def before_return_html(driver, html):
|
crawler = create_crawler()
|
||||||
print("[HOOK] before_return_html")
|
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
|
||||||
print(len(html))
|
|
||||||
return driver
|
|
||||||
|
|
||||||
crawler.set_hook('on_driver_created', on_driver_created)
|
|
||||||
crawler.set_hook('before_get_url', before_get_url)
|
|
||||||
crawler.set_hook('after_get_url', after_get_url)
|
|
||||||
crawler.set_hook('before_return_html', before_return_html)
|
|
||||||
|
|
||||||
result = crawler.run(url="https://example.com")
|
|
||||||
print(f"Crawler Hooks result: {result}")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Check [Hooks](examples/hooks_auth.md) for more examples.
|
||||||
|
|
||||||
## Congratulations! 🎉
|
## Congratulations! 🎉
|
||||||
|
|
||||||
You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️
|
You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️
|
||||||
|
|||||||
7
setup.py
7
setup.py
@@ -5,10 +5,15 @@ import subprocess
|
|||||||
from setuptools.command.install import install
|
from setuptools.command.install import install
|
||||||
|
|
||||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||||
|
# If the folder already exists, remove the cache folder
|
||||||
crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||||
|
if os.path.exists(f"{crawl4ai_folder}/cache"):
|
||||||
|
subprocess.run(["rm", "-rf", f"{crawl4ai_folder}/cache"])
|
||||||
os.makedirs(crawl4ai_folder, exist_ok=True)
|
os.makedirs(crawl4ai_folder, exist_ok=True)
|
||||||
os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True)
|
os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Read the requirements from requirements.txt
|
# Read the requirements from requirements.txt
|
||||||
with open("requirements.txt") as f:
|
with open("requirements.txt") as f:
|
||||||
requirements = f.read().splitlines()
|
requirements = f.read().splitlines()
|
||||||
@@ -20,7 +25,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="Crawl4AI",
|
name="Crawl4AI",
|
||||||
version="0.2.73",
|
version="0.2.74",
|
||||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||||
long_description=open("README.md", encoding="utf-8").read(),
|
long_description=open("README.md", encoding="utf-8").read(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
|
|||||||
Reference in New Issue
Block a user