Unclecode
2024-06-24 14:40:48 +00:00
6 changed files with 164 additions and 16 deletions

View File

@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.6 🕷️🤖
+# Crawl4AI v0.2.7 🕷️🤖
 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
 [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)

View File

@@ -10,6 +10,7 @@ from html2text import HTML2Text
 from .prompts import PROMPT_EXTRACT_BLOCKS
 from .config import *
 from pathlib import Path
+from typing import Dict, Any

 class InvalidCSSSelectorError(Exception):
     pass
@@ -176,16 +177,25 @@ def replace_inline_tags(soup, tags, only_text=False):
         'mark': lambda tag: f"=={tag.text}=="
     }
-    for tag_name in tags:
-        for tag in soup.find_all(tag_name):
-            if not only_text:
-                replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag)
-                tag.replace_with(replacement_text)
-            else:
-                tag.replace_with(tag.text)
+    replacement_data = [(tag, tag_replacements.get(tag, lambda t: t.text)) for tag in tags]
+    for tag_name, replacement_func in replacement_data:
+        for tag in soup.find_all(tag_name):
+            replacement_text = tag.text if only_text else replacement_func(tag)
+            tag.replace_with(replacement_text)
     return soup
+
+    # for tag_name in tags:
+    #     for tag in soup.find_all(tag_name):
+    #         if not only_text:
+    #             replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag)
+    #             tag.replace_with(replacement_text)
+    #         else:
+    #             tag.replace_with(tag.text)
+    # return soup

 def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs):
     try:
         if not html:
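Editor's note: the rewrite above precomputes each tag's replacement function once, instead of repeating the dictionary lookup inside the loop. A minimal sketch of the helper in isolation, assuming crawl4ai is installed and that the full tag_replacements dict maps 'b' to **bold** and 'code' to backticks, consistent with the 'mark' entry visible above:

from bs4 import BeautifulSoup
from crawl4ai.utils import replace_inline_tags

# Illustrative input, not from the commit.
soup = BeautifulSoup("<p>This is <b>bold</b> and <code>x = 1</code>.</p>", "html.parser")
replace_inline_tags(soup, ["b", "code"], only_text=False)
print(soup)  # expected roughly: <p>This is **bold** and `x = 1`.</p>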
@@ -388,13 +398,21 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
         markdown = h.handle(cleaned_html)
         markdown = markdown.replace('    ```', '```')
+
+        try:
+            meta = extract_metadata(html, soup)
+        except Exception as e:
+            print('Error extracting metadata:', str(e))
+            meta = {}
+
         # Return the Markdown content
         return{
             'markdown': markdown,
             'cleaned_html': cleaned_html,
             'success': True,
             'media': media,
-            'links': links
+            'links': links,
+            'metadata': meta
         }

     except Exception as e:
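Editor's note: two things changed in this hunk. extract_metadata can now reuse the soup the caller already built, skipping a second parse of the same HTML, and a metadata failure degrades to an empty dict instead of failing the whole extraction. A small sketch of the reuse pattern; the HTML is illustrative, and metadata keys beyond the title depend on parts of extract_metadata not shown in this diff:

from bs4 import BeautifulSoup
from crawl4ai.utils import extract_metadata

html = "<html><head><title>Example Page</title></head><body><p>Hello</p></body></html>"
soup = BeautifulSoup(html, "html.parser")  # parsed once by the caller...

meta = extract_metadata(html, soup)  # ...and reused here, no second parse
print(meta.get("title"))  # expected: "Example Page"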
@@ -402,15 +420,131 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
         raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e

+def get_content_of_website_optimized(url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
+    if not html:
+        return None
+
+    soup = BeautifulSoup(html, 'html.parser')
+    body = soup.body
+
+    if css_selector:
+        selected_elements = body.select(css_selector)
+        if not selected_elements:
+            raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
+        body = soup.new_tag('div')
+        for el in selected_elements:
+            body.append(el)
+
+    links = {'internal': [], 'external': []}
+    media = {'images': [], 'videos': [], 'audios': []}
+
+    def process_element(element: element.PageElement) -> None:
+        if isinstance(element, NavigableString):
+            if isinstance(element, Comment):
+                element.extract()
+            return
+
+        # if not isinstance(element, element.Tag):
+        #     return
+
+        if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
+            element.decompose()
+            return
+
+        if element.name == 'a' and element.get('href'):
+            href = element['href']
+            url_base = url.split('/')[2]
+            link_data = {'href': href, 'text': element.get_text()}
+            if href.startswith('http') and url_base not in href:
+                links['external'].append(link_data)
+            else:
+                links['internal'].append(link_data)
+
+        elif element.name == 'img':
+            media['images'].append({
+                'src': element.get('src'),
+                'alt': element.get('alt'),
+                'type': 'image'
+            })
+            alt_text = element.get('alt')
+            if alt_text:
+                element.replace_with(soup.new_string(alt_text))
+            else:
+                element.decompose()
+            return
+
+        elif element.name in ['video', 'audio']:
+            media[f"{element.name}s"].append({
+                'src': element.get('src'),
+                'alt': element.get('alt'),
+                'type': element.name
+            })
+
+        if element.name != 'pre':
+            if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
+                if kwargs.get('only_text', False):
+                    element.replace_with(element.get_text())
+                else:
+                    element.unwrap()
+            elif element.name != 'img':
+                element.attrs = {}
+
+        word_count = len(element.get_text(strip=True).split())
+        if word_count < word_count_threshold:
+            element.decompose()
+            return
+
+        for child in list(element.children):
+            process_element(child)
+
+        if not element.contents and not element.get_text(strip=True):
+            element.decompose()
+
+    process_element(body)
+
+    def flatten_nested_elements(node):
+        if isinstance(node, NavigableString):
+            return node
+        if len(node.contents) == 1 and isinstance(node.contents[0], element.Tag) and node.contents[0].name == node.name:
+            return flatten_nested_elements(node.contents[0])
+        node.contents = [flatten_nested_elements(child) for child in node.contents]
+        return node
+
+    body = flatten_nested_elements(body)
+
+    cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ')
+    cleaned_html = sanitize_html(cleaned_html)
+
+    h = CustomHTML2Text()
+    h.ignore_links = True
+    markdown = h.handle(cleaned_html)
+    markdown = markdown.replace('    ```', '```')
+
+    try:
+        meta = extract_metadata(html, soup)
+    except Exception as e:
+        print('Error extracting metadata:', str(e))
+        meta = {}
+
+    return {
+        'markdown': markdown,
+        'cleaned_html': cleaned_html,
+        'success': True,
+        'media': media,
+        'links': links,
+        'metadata': meta
+    }
+
-def extract_metadata(html):
+def extract_metadata(html, soup = None):
     metadata = {}

     if not html:
         return metadata

     # Parse HTML content with BeautifulSoup
-    soup = BeautifulSoup(html, 'html.parser')
+    if not soup:
+        soup = BeautifulSoup(html, 'html.parser')

     # Title
     title_tag = soup.find('title')
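Editor's note: a hedged usage sketch of the new single-pass function. The URL and HTML are placeholders, and only_text is the only **kwargs key the body consumes:

from crawl4ai.utils import get_content_of_website_optimized

html = """<html><body><article>
<p>A paragraph long enough to clear the default word count threshold.</p>
<a href="https://other.example.org/page">a link classified as external</a>
</article></body></html>"""

result = get_content_of_website_optimized(
    "https://example.com/page",  # used only to split links into internal/external
    html,
    word_count_threshold=5,
    only_text=False,
)
print(sorted(result.keys()))
# ['cleaned_html', 'links', 'markdown', 'media', 'metadata', 'success']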

View File

@@ -46,7 +46,8 @@ class WebCrawler:
             word_count_threshold=5,
             extraction_strategy= NoExtractionStrategy(),
             bypass_cache=False,
-            verbose = False
+            verbose = False,
+            warmup=True
         )
         self.ready = True
         print("[LOG] 🌞 WebCrawler is ready to crawl")
@@ -145,6 +146,9 @@ class WebCrawler:
         if not bypass_cache and not self.always_by_pass_cache:
             cached = get_cached_url(url)

+        if kwargs.get("warmup", True) and not self.ready:
+            return None
+
         if cached:
             html = cached[1]
             extracted_content = cached[4]
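Editor's note: combined with the warmup=True added to the internal warmup call above, this guard makes a plain run() short-circuit to None until self.ready is set. A sketch of the caller-side effect; the warmup() method follows the library's documented usage, but the full readiness flow is only partially visible in this diff:

from crawl4ai.web_crawler import WebCrawler

crawler = WebCrawler()
result = crawler.run(url="https://example.com")
if result is None:  # the new guard fired: crawler not warmed up yet
    crawler.warmup()  # runs the readiness routine shown earlier and sets self.ready
    result = crawler.run(url="https://example.com")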
@@ -180,8 +184,13 @@ class WebCrawler:
         t = time.time()
         # Extract content from HTML
         try:
-            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
-            metadata = extract_metadata(html)
+            # t1 = time.time()
+            # result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
+            # print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
+            t1 = time.time()
+            result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
+            print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
             if result is None:
                 raise ValueError(f"Failed to extract content from the website: {url}")
         except InvalidCSSSelectorError as e:
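Editor's note: the old call path is kept as comments so the two implementations can be A/B timed. A standalone comparison in the same spirit, as a sketch; it assumes requests is available and that both functions remain importable from crawl4ai.utils:

import time

import requests
from crawl4ai.utils import get_content_of_website, get_content_of_website_optimized

url = "https://example.com"
html = requests.get(url).text

for fn in (get_content_of_website, get_content_of_website_optimized):
    t0 = time.time()
    fn(url, html, 5, css_selector=None, only_text=False)
    print(f"{fn.__name__}: {time.time() - t0:.4f}s")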
@@ -191,6 +200,7 @@ class WebCrawler:
         markdown = result.get("markdown", "")
         media = result.get("media", [])
         links = result.get("links", [])
+        metadata = result.get("metadata", {})
         if verbose:
             print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")

View File

@@ -1,5 +1,9 @@
 # Changelog

+## [0.2.7] - 2024-06-27
+### Fixed
+- Roughly doubled the speed of the extraction function.
+
 ## [0.2.6] - 2024-06-22
 ### Fixed
 - Fix issue #19: Update Dockerfile to ensure compatibility across multiple platforms.

View File

@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.6
+# Crawl4AI v0.2.7
 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.

View File

@@ -33,7 +33,7 @@ class CustomInstallCommand(install):
 setup(
     name="Crawl4AI",
-    version="0.2.6",
+    version="0.2.7",
     description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",