This commit is contained in:
Unclecode
2024-06-26 07:35:06 +00:00
11 changed files with 127 additions and 71 deletions

4
.gitignore vendored
View File

@@ -185,4 +185,6 @@ local/
a.txt a.txt
.lambda_function.py .lambda_function.py
ec2* ec2*
update_changelog.sh

View File

@@ -1,5 +1,16 @@
# Changelog # Changelog
## [0.2.71] - 2024-06-26
### Changed
- Refactored `crawler_strategy.py` to handle exceptions and improve error messages
- Improved `get_content_of_website_optimized` function in `utils.py` for better performance
- Updated `utils.py` with latest changes
- Migrated to `ChromeDriverManager` for resolving Chrome driver download issues
## [0.2.7] - 2024-06-25
### Fixed
- Doubled the speed of the extraction function.
## [0.2.6] - 2024-06-22 ## [0.2.6] - 2024-06-22
### Fixed ### Fixed
- Fix issue #19: Update Dockerfile to ensure compatibility across multiple platforms. - Fix issue #19: Update Dockerfile to ensure compatibility across multiple platforms.

View File

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.7 🕷️🤖 # Crawl4AI v0.2.71 🕷️🤖
[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)

View File

@@ -5,7 +5,10 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import InvalidArgumentException from selenium.common.exceptions import InvalidArgumentException, WebDriverException
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import logging import logging
import base64 import base64
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
@@ -118,10 +121,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
} }
# chromedriver_autoinstaller.install() # chromedriver_autoinstaller.install()
import chromedriver_autoinstaller # import chromedriver_autoinstaller
crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver(crawl4ai_folder, False) # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
# chromedriver_path = chromedriver_autoinstaller.install()
# chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
# self.service = Service(chromedriver_autoinstaller.install()) # self.service = Service(chromedriver_autoinstaller.install())
chromedriver_path = ChromeDriverManager().install()
self.service = Service(chromedriver_path) self.service = Service(chromedriver_path)
self.service.log_path = "NUL" self.service.log_path = "NUL"
self.driver = webdriver.Chrome(service=self.service, options=self.options) self.driver = webdriver.Chrome(service=self.service, options=self.options)
@@ -212,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
return html return html
except InvalidArgumentException: except InvalidArgumentException as e:
raise InvalidArgumentException(f"Invalid URL {url}") if not hasattr(e, 'msg'):
e.msg = str(e)
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
except WebDriverException as e:
# If e does not have a msg attribute, create it and set it to str(e)
if not hasattr(e, 'msg'):
e.msg = str(e)
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
except Exception as e: except Exception as e:
raise Exception(f"Failed to crawl {url}: {str(e)}") if not hasattr(e, 'msg'):
e.msg = str(e)
raise Exception(f"Failed to crawl {url}: {e.msg}")
def take_screenshot(self) -> str: def take_screenshot(self) -> str:
try: try:

View File

@@ -438,18 +438,17 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
links = {'internal': [], 'external': []} links = {'internal': [], 'external': []}
media = {'images': [], 'videos': [], 'audios': []} media = {'images': [], 'videos': [], 'audios': []}
def process_element(element: element.PageElement) -> None: def process_element(element: element.PageElement) -> bool:
if isinstance(element, NavigableString): if isinstance(element, NavigableString):
if isinstance(element, Comment): if isinstance(element, Comment):
element.extract() element.extract()
return return False
# if not isinstance(element, element.Tag):
# return
if element.name in ['script', 'style', 'link', 'meta', 'noscript']: if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
element.decompose() element.decompose()
return return False
keep_element = False
if element.name == 'a' and element.get('href'): if element.name == 'a' and element.get('href'):
href = element['href'] href = element['href']
@@ -459,6 +458,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
links['external'].append(link_data) links['external'].append(link_data)
else: else:
links['internal'].append(link_data) links['internal'].append(link_data)
keep_element = True
elif element.name == 'img': elif element.name == 'img':
media['images'].append({ media['images'].append({
@@ -466,12 +466,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
'alt': element.get('alt'), 'alt': element.get('alt'),
'type': 'image' 'type': 'image'
}) })
alt_text = element.get('alt') return True # Always keep image elements
if alt_text:
element.replace_with(soup.new_string(alt_text))
else:
element.decompose()
return
elif element.name in ['video', 'audio']: elif element.name in ['video', 'audio']:
media[f"{element.name}s"].append({ media[f"{element.name}s"].append({
@@ -479,6 +474,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
'alt': element.get('alt'), 'alt': element.get('alt'),
'type': element.name 'type': element.name
}) })
return True # Always keep video and audio elements
if element.name != 'pre': if element.name != 'pre':
if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']: if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
@@ -489,17 +485,26 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
elif element.name != 'img': elif element.name != 'img':
element.attrs = {} element.attrs = {}
word_count = len(element.get_text(strip=True).split()) # Process children
if word_count < word_count_threshold:
element.decompose()
return
for child in list(element.children): for child in list(element.children):
process_element(child) if isinstance(child, NavigableString) and not isinstance(child, Comment):
if len(child.strip()) > 0:
keep_element = True
else:
if process_element(child):
keep_element = True
if not element.contents and not element.get_text(strip=True): # Check word count
if not keep_element:
word_count = len(element.get_text(strip=True).split())
keep_element = word_count >= word_count_threshold
if not keep_element:
element.decompose() element.decompose()
return keep_element
process_element(body) process_element(body)
def flatten_nested_elements(node): def flatten_nested_elements(node):
@@ -770,4 +775,6 @@ def wrap_text(draw, text, font, max_width):
def format_html(html_string): def format_html(html_string):
soup = BeautifulSoup(html_string, 'html.parser') soup = BeautifulSoup(html_string, 'html.parser')
return soup.prettify() return soup.prettify()

View File

@@ -129,47 +129,57 @@ class WebCrawler:
verbose=True, verbose=True,
**kwargs, **kwargs,
) -> CrawlResult: ) -> CrawlResult:
extraction_strategy = extraction_strategy or NoExtractionStrategy() try:
extraction_strategy.verbose = verbose extraction_strategy = extraction_strategy or NoExtractionStrategy()
if not isinstance(extraction_strategy, ExtractionStrategy): extraction_strategy.verbose = verbose
raise ValueError("Unsupported extraction strategy") if not isinstance(extraction_strategy, ExtractionStrategy):
if not isinstance(chunking_strategy, ChunkingStrategy): raise ValueError("Unsupported extraction strategy")
raise ValueError("Unsupported chunking strategy") if not isinstance(chunking_strategy, ChunkingStrategy):
raise ValueError("Unsupported chunking strategy")
if word_count_threshold < MIN_WORD_THRESHOLD:
word_count_threshold = MIN_WORD_THRESHOLD # if word_count_threshold < MIN_WORD_THRESHOLD:
# word_count_threshold = MIN_WORD_THRESHOLD
word_count_threshold = max(word_count_threshold, 0)
# Check cache first # Check cache first
cached = None cached = None
screenshot_data = None screenshot_data = None
extracted_content = None extracted_content = None
if not bypass_cache and not self.always_by_pass_cache: if not bypass_cache and not self.always_by_pass_cache:
cached = get_cached_url(url) cached = get_cached_url(url)
if kwargs.get("warmup", True) and not self.ready: if kwargs.get("warmup", True) and not self.ready:
return None return None
if cached: if cached:
html = cached[1] html = cached[1]
extracted_content = cached[4] extracted_content = cached[4]
if screenshot: if screenshot:
screenshot_data = cached[9] screenshot_data = cached[9]
if not screenshot_data: if not screenshot_data:
cached = None cached = None
if not cached or not html: if not cached or not html:
if user_agent: if user_agent:
self.crawler_strategy.update_user_agent(user_agent) self.crawler_strategy.update_user_agent(user_agent)
t1 = time.time() t1 = time.time()
html = self.crawler_strategy.crawl(url) html = self.crawler_strategy.crawl(url)
t2 = time.time() t2 = time.time()
if verbose: if verbose:
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds") print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
if screenshot: if screenshot:
screenshot_data = self.crawler_strategy.take_screenshot() screenshot_data = self.crawler_strategy.take_screenshot()
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs) crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
crawl_result.success = bool(html)
return crawl_result
except Exception as e:
if not hasattr(e, "msg"):
e.msg = str(e)
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
return CrawlResult(url=url, html="", success=False, error_message=e.msg)
def process_html( def process_html(
self, self,

View File

@@ -1,6 +1,12 @@
# Changelog # Changelog
## [0.2.7] - 2024-06-27 ## [0.2.71] - 2024-06-26
### Changed
- Refactored `crawler_strategy.py` to handle exceptions and improve error messages
- Improved `get_content_of_website_optimized` function in `utils.py` for better performance
- Updated `utils.py` with latest changes
- Migrated to `ChromeDriverManager` for resolving Chrome driver download issues
## [0.2.7] - 2024-06-25
### Fixed ### Fixed
- Speed up twice the extraction function. - Speed up twice the extraction function.

View File

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.7 # Crawl4AI v0.2.71
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI. Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.

View File

@@ -49,7 +49,9 @@ templates = Jinja2Templates(directory=__location__ + "/pages")
@lru_cache() @lru_cache()
def get_crawler(): def get_crawler():
# Initialize and return a WebCrawler instance # Initialize and return a WebCrawler instance
return WebCrawler(verbose = True) crawler = WebCrawler(verbose = True)
crawler.warmup()
return crawler
class CrawlRequest(BaseModel): class CrawlRequest(BaseModel):
urls: List[str] urls: List[str]

View File

@@ -20,3 +20,4 @@ torch==2.3.1
onnxruntime==1.18.0 onnxruntime==1.18.0
tokenizers==0.19.1 tokenizers==0.19.1
pillow==10.3.0 pillow==10.3.0
webdriver-manager==4.0.1

View File

@@ -33,7 +33,7 @@ class CustomInstallCommand(install):
setup( setup(
name="Crawl4AI", name="Crawl4AI",
version="0.2.7", version="0.2.71",
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper", description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
long_description=open("README.md").read(), long_description=open("README.md").read(),
long_description_content_type="text/markdown", long_description_content_type="text/markdown",