Compare commits


4 Commits

unclecode  d11a83c232  2024-06-26 15:34:15 +08:00
    ## [0.2.71] 2024-06-26
    - Refactored `crawler_strategy.py` to handle exceptions and improve error messages
    - Improved `get_content_of_website_optimized` function in `utils.py` for better performance
    - Updated `utils.py` with latest changes
    - Migrated to `ChromeDriverManager` for resolving Chrome driver download issues

unclecode  3255c7a3fa  2024-06-26 15:20:34 +08:00
    Update CHANGELOG.md with recent commits

unclecode  4756d0a532  2024-06-26 15:04:33 +08:00
    Refactor crawler_strategy.py to handle exceptions and improve error messages

unclecode  7ba2142363  2024-06-26 14:43:09 +08:00
    chore: Refactor get_content_of_website_optimized function in utils.py
9 changed files with 109 additions and 66 deletions

.gitignore

@@ -185,4 +185,6 @@ local/
 a.txt
 .lambda_function.py
 ec2*
+ec2*
+update_changelog.sh

CHANGELOG.md

@@ -1,5 +1,16 @@
 # Changelog
+## [0.2.71] 2024-06-26
+- Refactored `crawler_strategy.py` to handle exceptions and improve error messages
+- Improved `get_content_of_website_optimized` function in `utils.py` for better performance
+- Updated `utils.py` with latest changes
+- Migrated to `ChromeDriverManager` for resolving Chrome driver download issues
 ## [0.2.71] - 2024-06-25
 ### Fixed
 - Speed up twice the extraction function.
 ## [0.2.6] - 2024-06-22
 ### Fixed
 - Fix issue #19: Update Dockerfile to ensure compatibility across multiple platforms.

README.md

@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.7 🕷️🤖
+# Crawl4AI v0.2.71 🕷️🤖
 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
 [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)

crawler_strategy.py

@@ -5,7 +5,7 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
-from selenium.common.exceptions import InvalidArgumentException
+from selenium.common.exceptions import InvalidArgumentException, WebDriverException
 from selenium.webdriver.chrome.service import Service as ChromeService
 from webdriver_manager.chrome import ChromeDriverManager
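The `ChromeDriverManager` import above is what resolves the Chrome driver download issues named in the commit message: it fetches a chromedriver binary matching the locally installed Chrome at runtime, so no driver has to be installed by hand. A minimal standalone sketch of that setup pattern (generic Selenium usage, not crawl4ai's exact initialization code):

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless")  # crawl without opening a browser window

# install() downloads (or reuses) a chromedriver compatible with the local
# Chrome and returns the path to the binary.
service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

driver.get("https://example.com")
print(driver.title)
driver.quit()
```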
@@ -220,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
             return html
-        except InvalidArgumentException:
-            raise InvalidArgumentException(f"Invalid URL {url}")
+        except InvalidArgumentException as e:
+            if not hasattr(e, 'msg'):
+                e.msg = str(e)
+            raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
+        except WebDriverException as e:
+            # If e does not have a msg attribute, create it and set it to str(e)
+            if not hasattr(e, 'msg'):
+                e.msg = str(e)
+            raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
         except Exception as e:
-            raise Exception(f"Failed to crawl {url}: {str(e)}")
+            if not hasattr(e, 'msg'):
+                e.msg = str(e)
+            raise Exception(f"Failed to crawl {url}: {e.msg}")

     def take_screenshot(self) -> str:
         try:

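All three handlers above share one normalization step: Selenium's exceptions carry a `msg` attribute while ordinary exceptions do not, so the code backfills `msg` with `str(e)` before re-raising with the failing URL attached. The same idea as a standalone sketch (`normalize_msg` is a hypothetical helper, not part of crawl4ai):

```python
def normalize_msg(e: Exception) -> str:
    # Selenium exceptions expose .msg; backfill it for everything else
    # so callers can format one uniform error message.
    if not hasattr(e, "msg"):
        e.msg = str(e)
    return e.msg

try:
    raise ValueError("connection reset")
except Exception as e:
    print(f"Failed to crawl https://example.com: {normalize_msg(e)}")
# prints: Failed to crawl https://example.com: connection reset
```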
utils.py

@@ -438,18 +438,17 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
     links = {'internal': [], 'external': []}
     media = {'images': [], 'videos': [], 'audios': []}

-    def process_element(element: element.PageElement) -> None:
+    def process_element(element: element.PageElement) -> bool:
         if isinstance(element, NavigableString):
             if isinstance(element, Comment):
                 element.extract()
-            return
-
-        # if not isinstance(element, element.Tag):
-        #     return
+            return False

         if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
             element.decompose()
-            return
+            return False
+
+        keep_element = False

         if element.name == 'a' and element.get('href'):
             href = element['href']
@@ -459,6 +458,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 links['external'].append(link_data)
             else:
                 links['internal'].append(link_data)
+            keep_element = True

         elif element.name == 'img':
             media['images'].append({
@@ -466,12 +466,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 'alt': element.get('alt'),
                 'type': 'image'
             })
-            alt_text = element.get('alt')
-            if alt_text:
-                element.replace_with(soup.new_string(alt_text))
-            else:
-                element.decompose()
-            return
+            return True  # Always keep image elements

         elif element.name in ['video', 'audio']:
             media[f"{element.name}s"].append({
@@ -479,6 +474,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 'alt': element.get('alt'),
                 'type': element.name
             })
+            return True  # Always keep video and audio elements

         if element.name != 'pre':
             if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
@@ -489,17 +485,26 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
         elif element.name != 'img':
             element.attrs = {}

-        word_count = len(element.get_text(strip=True).split())
-        if word_count < word_count_threshold:
-            element.decompose()
-            return
-
+        # Process children
         for child in list(element.children):
-            process_element(child)
+            if isinstance(child, NavigableString) and not isinstance(child, Comment):
+                if len(child.strip()) > 0:
+                    keep_element = True
+            else:
+                if process_element(child):
+                    keep_element = True

-        if not element.contents and not element.get_text(strip=True):
+        # Check word count
+        if not keep_element:
+            word_count = len(element.get_text(strip=True).split())
+            keep_element = word_count >= word_count_threshold
+
+        if not keep_element:
             element.decompose()
+
+        return keep_element

     process_element(body)

     def flatten_nested_elements(node):

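The refactor above turns `process_element` from a destructive pass (decompose any element under the word threshold) into a bottom-up keep/discard decision: an element survives if it is a link, image, or media tag, if any of its children survive, or, failing that, if its own text meets the word threshold. A simplified sketch of that recursion (link/media bookkeeping and inline-tag handling omitted; the threshold value is illustrative):

```python
from bs4 import BeautifulSoup, Comment, NavigableString

def process_element(el, threshold=5) -> bool:
    # Text nodes: drop comments; non-empty text counts as worth keeping.
    if isinstance(el, NavigableString):
        if isinstance(el, Comment):
            el.extract()
            return False
        return len(el.strip()) > 0
    if el.name in ("script", "style"):
        el.decompose()
        return False
    if el.name in ("img", "video", "audio"):
        return True  # media elements are always kept
    # Snapshot children before mutating; the list comprehension visits
    # every child (any() over a generator would stop at the first hit).
    keep = any([process_element(child, threshold) for child in list(el.children)])
    if not keep:
        # Fall back to this element's own word count.
        keep = len(el.get_text(strip=True).split()) >= threshold
    if not keep:
        el.decompose()
    return keep

html = "<body><div><p>Keep this text.</p></div><div><span></span></div></body>"
soup = BeautifulSoup(html, "html.parser")
process_element(soup.body)
print(soup)  # the empty second <div> has been pruned
```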
web_crawler.py

@@ -129,47 +129,57 @@ class WebCrawler:
         verbose=True,
         **kwargs,
     ) -> CrawlResult:
-        extraction_strategy = extraction_strategy or NoExtractionStrategy()
-        extraction_strategy.verbose = verbose
-        if not isinstance(extraction_strategy, ExtractionStrategy):
-            raise ValueError("Unsupported extraction strategy")
-        if not isinstance(chunking_strategy, ChunkingStrategy):
-            raise ValueError("Unsupported chunking strategy")
-        if word_count_threshold < MIN_WORD_THRESHOLD:
-            word_count_threshold = MIN_WORD_THRESHOLD
-
-        # Check cache first
-        cached = None
-        screenshot_data = None
-        extracted_content = None
-        if not bypass_cache and not self.always_by_pass_cache:
-            cached = get_cached_url(url)
-
-        if kwargs.get("warmup", True) and not self.ready:
-            return None
-
-        if cached:
-            html = cached[1]
-            extracted_content = cached[4]
-            if screenshot:
-                screenshot_data = cached[9]
-                if not screenshot_data:
-                    cached = None
-
-        if not cached or not html:
-            if user_agent:
-                self.crawler_strategy.update_user_agent(user_agent)
-            t1 = time.time()
-            html = self.crawler_strategy.crawl(url)
-            t2 = time.time()
-            if verbose:
-                print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
-            if screenshot:
-                screenshot_data = self.crawler_strategy.take_screenshot()
-
-        return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
+        try:
+            extraction_strategy = extraction_strategy or NoExtractionStrategy()
+            extraction_strategy.verbose = verbose
+            if not isinstance(extraction_strategy, ExtractionStrategy):
+                raise ValueError("Unsupported extraction strategy")
+            if not isinstance(chunking_strategy, ChunkingStrategy):
+                raise ValueError("Unsupported chunking strategy")
+            # if word_count_threshold < MIN_WORD_THRESHOLD:
+            #     word_count_threshold = MIN_WORD_THRESHOLD
+            word_count_threshold = max(word_count_threshold, 0)
+
+            # Check cache first
+            cached = None
+            screenshot_data = None
+            extracted_content = None
+            if not bypass_cache and not self.always_by_pass_cache:
+                cached = get_cached_url(url)
+
+            if kwargs.get("warmup", True) and not self.ready:
+                return None
+
+            if cached:
+                html = cached[1]
+                extracted_content = cached[4]
+                if screenshot:
+                    screenshot_data = cached[9]
+                    if not screenshot_data:
+                        cached = None
+
+            if not cached or not html:
+                if user_agent:
+                    self.crawler_strategy.update_user_agent(user_agent)
+                t1 = time.time()
+                html = self.crawler_strategy.crawl(url)
+                t2 = time.time()
+                if verbose:
+                    print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
+                if screenshot:
+                    screenshot_data = self.crawler_strategy.take_screenshot()
+
+            crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
+            crawl_result.success = bool(html)
+            return crawl_result
+        except Exception as e:
+            if not hasattr(e, "msg"):
+                e.msg = str(e)
+            print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
+            return CrawlResult(url=url, html="", success=False, error_message=e.msg)

     def process_html(
         self,

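Wrapping `run` in try/except changes the failure contract: instead of an uncaught exception, callers now always get a `CrawlResult`, with `success=False` and an `error_message` set when the crawl failed. A usage sketch under that contract (assumes the 0.2.71-era `WebCrawler` API; only fields visible in the diff are used):

```python
from crawl4ai import WebCrawler

crawler = WebCrawler()
crawler.warmup()  # prepare the crawler so self.ready is set; otherwise run() returns None

result = crawler.run(url="https://example.com")
if result.success:
    print(f"Fetched {len(result.html)} characters of HTML")
else:
    print(f"Crawl failed: {result.error_message}")
```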
docs changelog

@@ -1,6 +1,12 @@
 # Changelog
-## [0.2.7] - 2024-06-27
+## [0.2.71] 2024-06-26
+- Refactored `crawler_strategy.py` to handle exceptions and improve error messages
+- Improved `get_content_of_website_optimized` function in `utils.py` for better performance
+- Updated `utils.py` with latest changes
+- Migrated to `ChromeDriverManager` for resolving Chrome driver download issues
+## [0.2.71] - 2024-06-25
 ### Fixed
 - Speed up twice the extraction function.

docs index

@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.7
+# Crawl4AI v0.2.71
 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.

setup.py

@@ -33,7 +33,7 @@ class CustomInstallCommand(install):
 setup(
     name="Crawl4AI",
-    version="0.2.7",
+    version="0.2.71",
     description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",