Merge branch 'main' of https://github.com/unclecode/crawl4ai
.gitignore (vendored)
@@ -185,4 +185,6 @@ local/
 
 a.txt
 .lambda_function.py
 ec2*
+
+update_changelog.sh
CHANGELOG.md
@@ -1,5 +1,16 @@
 # Changelog
 
+## [0.2.71] 2024-06-26
+• Refactored `crawler_strategy.py` to handle exceptions and improve error messages
+• Improved `get_content_of_website_optimized` function in `utils.py` for better performance
+• Updated `utils.py` with latest changes
+• Migrated to `ChromeDriverManager` for resolving Chrome driver download issues
+
+## [0.2.71] - 2024-06-25
+### Fixed
+- Speed up twice the extraction function.
+
+
 ## [0.2.6] - 2024-06-22
 ### Fixed
 - Fix issue #19: Update Dockerfile to ensure compatibility across multiple platforms.
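The `ChromeDriverManager` migration called out in the changelog corresponds to the driver-setup change in `crawler_strategy.py` further down. As a hedged, standalone sketch of that pattern (not the library's own code; the URL and options are illustrative), webdriver-manager resolves or downloads a matching chromedriver and hands its path to Selenium through a `Service`:

```python
# Sketch of the webdriver-manager pattern this release migrates to.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless")  # illustrative option

chromedriver_path = ChromeDriverManager().install()  # downloads/reuses a matching driver, returns its path
driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)
driver.get("https://example.com")
print(driver.title)
driver.quit()
```

This replaces the manual chromedriver download that the "Chrome driver download issues" entry refers to.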
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.7 🕷️🤖
+# Crawl4AI v0.2.71 🕷️🤖
 
 [](https://github.com/unclecode/crawl4ai/stargazers)
 [](https://github.com/unclecode/crawl4ai/network/members)
@@ -5,7 +5,10 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
-from selenium.common.exceptions import InvalidArgumentException
+from selenium.common.exceptions import InvalidArgumentException, WebDriverException
+from selenium.webdriver.chrome.service import Service as ChromeService
+from webdriver_manager.chrome import ChromeDriverManager
+
 import logging
 import base64
 from PIL import Image, ImageDraw, ImageFont

@@ -118,10 +121,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         }
 
         # chromedriver_autoinstaller.install()
-        import chromedriver_autoinstaller
-        crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
-        chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver(crawl4ai_folder, False)
+        # import chromedriver_autoinstaller
+        # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
+        # chromedriver_path = chromedriver_autoinstaller.install()
+        # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
         # self.service = Service(chromedriver_autoinstaller.install())
+
+
+        chromedriver_path = ChromeDriverManager().install()
         self.service = Service(chromedriver_path)
         self.service.log_path = "NUL"
         self.driver = webdriver.Chrome(service=self.service, options=self.options)

@@ -212,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
 
             return html
         except InvalidArgumentException:
-            raise InvalidArgumentException(f"Invalid URL {url}")
+            if not hasattr(e, 'msg'):
+                e.msg = str(e)
+            raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
+        except WebDriverException as e:
+            # If e does nlt have msg attribute create it and set it to str(e)
+            if not hasattr(e, 'msg'):
+                e.msg = str(e)
+            raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
         except Exception as e:
-            raise Exception(f"Failed to crawl {url}: {str(e)}")
+            if not hasattr(e, 'msg'):
+                e.msg = str(e)
+            raise Exception(f"Failed to crawl {url}: {e.msg}")
 
     def take_screenshot(self) -> str:
         try:
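Each new `except` block applies the same normalization: make sure the exception exposes a `.msg` attribute before re-raising it with a uniform "Failed to crawl" message. A distilled sketch of that pattern (the `normalize_msg` helper is illustrative, not part of the library):

```python
def normalize_msg(exc: BaseException) -> str:
    # Mirror of the pattern above: guarantee a .msg attribute on any exception.
    if not hasattr(exc, "msg"):
        exc.msg = str(exc)
    return exc.msg

try:
    raise KeyError("chromedriver path missing")  # KeyError has no .msg attribute
except Exception as e:
    print(f"Failed to crawl https://example.com: {normalize_msg(e)}")
```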
@@ -438,18 +438,17 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
     links = {'internal': [], 'external': []}
     media = {'images': [], 'videos': [], 'audios': []}
 
-    def process_element(element: element.PageElement) -> None:
+    def process_element(element: element.PageElement) -> bool:
         if isinstance(element, NavigableString):
             if isinstance(element, Comment):
                 element.extract()
-            return
+            return False
 
-        # if not isinstance(element, element.Tag):
-        #     return
-
         if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
             element.decompose()
-            return
+            return False
 
+        keep_element = False
+
         if element.name == 'a' and element.get('href'):
             href = element['href']

@@ -459,6 +458,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 links['external'].append(link_data)
             else:
                 links['internal'].append(link_data)
+            keep_element = True
 
         elif element.name == 'img':
             media['images'].append({

@@ -466,12 +466,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 'alt': element.get('alt'),
                 'type': 'image'
             })
-            alt_text = element.get('alt')
-            if alt_text:
-                element.replace_with(soup.new_string(alt_text))
-            else:
-                element.decompose()
-            return
+            return True  # Always keep image elements
 
         elif element.name in ['video', 'audio']:
             media[f"{element.name}s"].append({

@@ -479,6 +474,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 'alt': element.get('alt'),
                 'type': element.name
             })
+            return True  # Always keep video and audio elements
 
         if element.name != 'pre':
             if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:

@@ -489,17 +485,26 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
             elif element.name != 'img':
                 element.attrs = {}
 
-        word_count = len(element.get_text(strip=True).split())
-        if word_count < word_count_threshold:
-            element.decompose()
-            return
-
+        # Process children
         for child in list(element.children):
-            process_element(child)
+            if isinstance(child, NavigableString) and not isinstance(child, Comment):
+                if len(child.strip()) > 0:
+                    keep_element = True
+            else:
+                if process_element(child):
+                    keep_element = True
+
 
-        if not element.contents and not element.get_text(strip=True):
+        # Check word count
+        if not keep_element:
+            word_count = len(element.get_text(strip=True).split())
+            keep_element = word_count >= word_count_threshold
+
+        if not keep_element:
             element.decompose()
 
+        return keep_element
+
     process_element(body)
 
     def flatten_nested_elements(node):

@@ -770,4 +775,6 @@ def wrap_text(draw, text, font, max_width):
 
 def format_html(html_string):
     soup = BeautifulSoup(html_string, 'html.parser')
     return soup.prettify()
+
+
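The refactored `process_element` turns cleanup into a bottom-up keep/discard decision: an element survives if it records a link or media entry, has non-empty direct text, has any surviving descendant, or meets the word-count threshold; everything else is decomposed in a single pass. A simplified, self-contained sketch of that recursion (the `prune` helper and its threshold are illustrative, not the library's exact code):

```python
# Simplified illustration of the bool-returning pruning recursion used in
# get_content_of_website_optimized after this commit.
from bs4 import BeautifulSoup, Comment, NavigableString

def prune(element, word_count_threshold=3):
    """Return True if `element` is kept; remove it from the tree otherwise."""
    if isinstance(element, NavigableString):
        if isinstance(element, Comment):
            element.extract()
        return False

    if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
        element.decompose()
        return False

    # Links and media are always kept (a simplification of the real checks).
    keep = element.name in ('a', 'img', 'video', 'audio')

    for child in list(element.children):
        if isinstance(child, NavigableString) and not isinstance(child, Comment):
            if child.strip():
                keep = True
        elif prune(child, word_count_threshold):
            keep = True

    # Fall back to the word-count threshold only if nothing else kept the node.
    if not keep:
        keep = len(element.get_text(strip=True).split()) >= word_count_threshold

    if not keep:
        element.decompose()
    return keep

html = ("<body><div><span>   </span></div>"
        "<p>a longer paragraph that easily clears the threshold</p>"
        "<script>track()</script><!-- comment --></body>")
soup = BeautifulSoup(html, 'html.parser')
prune(soup.body)
print(soup.body)  # only the paragraph survives
```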
@@ -129,47 +129,57 @@ class WebCrawler:
         verbose=True,
         **kwargs,
     ) -> CrawlResult:
+        try:
             extraction_strategy = extraction_strategy or NoExtractionStrategy()
             extraction_strategy.verbose = verbose
             if not isinstance(extraction_strategy, ExtractionStrategy):
                 raise ValueError("Unsupported extraction strategy")
             if not isinstance(chunking_strategy, ChunkingStrategy):
                 raise ValueError("Unsupported chunking strategy")
 
-            if word_count_threshold < MIN_WORD_THRESHOLD:
-                word_count_threshold = MIN_WORD_THRESHOLD
+            # if word_count_threshold < MIN_WORD_THRESHOLD:
+            #     word_count_threshold = MIN_WORD_THRESHOLD
+
+            word_count_threshold = max(word_count_threshold, 0)
 
             # Check cache first
             cached = None
             screenshot_data = None
             extracted_content = None
             if not bypass_cache and not self.always_by_pass_cache:
                 cached = get_cached_url(url)
 
             if kwargs.get("warmup", True) and not self.ready:
                 return None
 
             if cached:
                 html = cached[1]
                 extracted_content = cached[4]
                 if screenshot:
                     screenshot_data = cached[9]
                     if not screenshot_data:
                         cached = None
 
             if not cached or not html:
                 if user_agent:
                     self.crawler_strategy.update_user_agent(user_agent)
                 t1 = time.time()
                 html = self.crawler_strategy.crawl(url)
                 t2 = time.time()
                 if verbose:
                     print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
                 if screenshot:
                     screenshot_data = self.crawler_strategy.take_screenshot()
 
 
-            return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
+            crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
+            crawl_result.success = bool(html)
+            return crawl_result
+        except Exception as e:
+            if not hasattr(e, "msg"):
+                e.msg = str(e)
+            print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
+            return CrawlResult(url=url, html="", success=False, error_message=e.msg)
 
     def process_html(
         self,
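With `run()` now wrapped in a try/except, a failed crawl is reported through the returned `CrawlResult` (`success=False` plus an `error_message`) rather than by propagating the exception. A hedged usage sketch — the import path and the keyword call are assumptions; the `success`, `html`, and `error_message` fields come from the diff above:

```python
# Assumed import path; the CrawlResult fields used below are taken from the diff.
from crawl4ai.web_crawler import WebCrawler

crawler = WebCrawler(verbose=True)
crawler.warmup()

result = crawler.run(url="https://example.com")
if result.success:
    print(f"Crawled {len(result.html)} characters of HTML")
else:
    print(f"Crawl failed: {result.error_message}")
```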
@@ -1,6 +1,12 @@
 # Changelog
 
-## [0.2.7] - 2024-06-27
+## [0.2.71] 2024-06-26
+• Refactored `crawler_strategy.py` to handle exceptions and improve error messages
+• Improved `get_content_of_website_optimized` function in `utils.py` for better performance
+• Updated `utils.py` with latest changes
+• Migrated to `ChromeDriverManager` for resolving Chrome driver download issues
+
+## [0.2.71] - 2024-06-25
 ### Fixed
 - Speed up twice the extraction function.
 
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.7
+# Crawl4AI v0.2.71
 
 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
 
main.py
@@ -49,7 +49,9 @@ templates = Jinja2Templates(directory=__location__ + "/pages")
 @lru_cache()
 def get_crawler():
     # Initialize and return a WebCrawler instance
-    return WebCrawler(verbose = True)
+    crawler = WebCrawler(verbose = True)
+    crawler.warmup()
+    return crawler
 
 class CrawlRequest(BaseModel):
     urls: List[str]
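`@lru_cache()` is what makes the warmed-up crawler a per-process singleton: the first request builds and warms it, and every later request gets the same cached instance back. A minimal, generic sketch of that behaviour (not the app's code):

```python
from functools import lru_cache

@lru_cache()
def get_resource():
    print("building and warming up once...")
    return object()

a = get_resource()  # prints the message, builds the resource
b = get_resource()  # served from the cache, no rebuild
assert a is b
```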
@@ -20,3 +20,4 @@ torch==2.3.1
 onnxruntime==1.18.0
 tokenizers==0.19.1
 pillow==10.3.0
+webdriver-manager==4.0.1
setup.py
@@ -33,7 +33,7 @@ class CustomInstallCommand(install):
 
 setup(
     name="Crawl4AI",
-    version="0.2.7",
+    version="0.2.71",
     description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",