Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d11a83c232 | ||
|
|
3255c7a3fa | ||
|
|
4756d0a532 | ||
|
|
7ba2142363 | ||
|
|
96d1eb0d0d | ||
|
|
144cfa0eda | ||
|
|
a0dff192ae | ||
|
|
1fffeeedd2 | ||
|
|
f51b078042 | ||
|
|
b6023a51fb |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -185,4 +185,6 @@ local/
|
||||
|
||||
a.txt
|
||||
.lambda_function.py
|
||||
ec2*
|
||||
ec2*
|
||||
|
||||
update_changelog.sh
|
||||
11
CHANGELOG.md
11
CHANGELOG.md
@@ -1,5 +1,16 @@
|
||||
# Changelog
|
||||
|
||||
## [0.2.71] - 2024-06-26
|
||||
• Refactored `crawler_strategy.py` to handle exceptions and improve error messages
|
||||
• Improved `get_content_of_website_optimized` function in `utils.py` for better performance
|
||||
• Updated `utils.py` with latest changes
|
||||
• Migrated to `ChromeDriverManager` for resolving Chrome driver download issues
|
||||
|
||||
## [0.2.71] - 2024-06-25
|
||||
### Fixed
|
||||
- Made the extraction function twice as fast.
|
||||
|
||||
|
||||
## [0.2.6] - 2024-06-22
|
||||
### Fixed
|
||||
- Fix issue #19: Update Dockerfile to ensure compatibility across multiple platforms.
|
||||
|
||||
58
README.md
58
README.md
@@ -1,4 +1,4 @@
|
||||
# Crawl4AI v0.2.7 🕷️🤖
|
||||
# Crawl4AI v0.2.71 🕷️🤖
|
||||
|
||||
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||
@@ -52,6 +52,33 @@ result = crawler.run(url="https://www.nbcnews.com/business")
|
||||
print(result.markdown)
|
||||
```
|
||||
|
||||
### Speed-First Design 🚀
|
||||
|
||||
Perhaps the most important design principle for this library is speed. We need to ensure it can handle many links and resources in parallel as quickly as possible. By combining this speed with fast LLMs like Groq, the results will be truly amazing.
|
||||
|
||||
```python
|
||||
import time
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
crawler = WebCrawler()
|
||||
crawler.warmup()
|
||||
|
||||
start = time.time()
|
||||
url = r"https://www.nbcnews.com/business"
|
||||
result = crawler.run( url, word_count_threshold=10, bypass_cache=True)
|
||||
end = time.time()
|
||||
print(f"Time taken: {end - start}")
|
||||
```
|
||||
|
||||
Let's take a look at the calculated time for the above code snippet:
|
||||
|
||||
```bash
|
||||
[LOG] 🚀 Crawling done, success: True, time taken: 1.3623387813568115 seconds
|
||||
[LOG] 🚀 Content extracted, success: True, time taken: 0.05715131759643555 seconds
|
||||
[LOG] 🚀 Extraction, time taken: 0.05750393867492676 seconds.
|
||||
Time taken: 1.439958095550537
|
||||
```
|
||||
Fetching the content from the page took 1.3623 seconds, and extracting the content took 0.0575 seconds. 🚀
|
||||
|
||||
### Extract Structured Data from Web Pages 📊
|
||||
|
||||
Crawl all OpenAI models and their fees from the official page.
|
||||
@@ -60,19 +87,30 @@ Crawl all OpenAI models and their fees from the official page.
|
||||
import os
|
||||
from crawl4ai import WebCrawler
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
class OpenAIModelFee(BaseModel):
|
||||
model_name: str = Field(..., description="Name of the OpenAI model.")
|
||||
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
|
||||
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
|
||||
|
||||
url = 'https://openai.com/api/pricing/'
|
||||
crawler = WebCrawler()
|
||||
crawler.warmup()
|
||||
|
||||
result = crawler.run(
|
||||
url=url,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="Extract all model names and their fees for input and output tokens."
|
||||
),
|
||||
)
|
||||
url=url,
|
||||
word_count_threshold=1,
|
||||
extraction_strategy= LLMExtractionStrategy(
|
||||
provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
|
||||
schema=OpenAIModelFee.schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
Do not miss any models in the entire content. One extracted model JSON format should look like this:
|
||||
{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}."""
|
||||
),
|
||||
bypass_cache=True,
|
||||
)
|
||||
|
||||
print(result.extracted_content)
|
||||
```
|
||||
@@ -119,3 +157,7 @@ For questions, suggestions, or feedback, feel free to reach out:
|
||||
- Website: [crawl4ai.com](https://crawl4ai.com)
|
||||
|
||||
Happy Crawling! 🕸️🚀
|
||||
|
||||
## Star History
|
||||
|
||||
[](https://star-history.com/#unclecode/crawl4ai&Date)
|
||||
@@ -5,7 +5,10 @@ from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.common.exceptions import InvalidArgumentException
|
||||
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
|
||||
import logging
|
||||
import base64
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
@@ -118,10 +121,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
}
|
||||
|
||||
# chromedriver_autoinstaller.install()
|
||||
import chromedriver_autoinstaller
|
||||
crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver(crawl4ai_folder, False)
|
||||
# import chromedriver_autoinstaller
|
||||
# crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
|
||||
# chromedriver_path = chromedriver_autoinstaller.install()
|
||||
# chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
|
||||
# self.service = Service(chromedriver_autoinstaller.install())
|
||||
|
||||
|
||||
chromedriver_path = ChromeDriverManager().install()
|
||||
self.service = Service(chromedriver_path)
|
||||
self.service.log_path = "NUL"
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
@@ -212,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
|
||||
return html
|
||||
except InvalidArgumentException:
|
||||
raise InvalidArgumentException(f"Invalid URL {url}")
|
||||
if not hasattr(e, 'msg'):
|
||||
e.msg = str(e)
|
||||
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
||||
except WebDriverException as e:
|
||||
# If e does not have a msg attribute, create it and set it to str(e)
|
||||
if not hasattr(e, 'msg'):
|
||||
e.msg = str(e)
|
||||
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
|
||||
except Exception as e:
|
||||
raise Exception(f"Failed to crawl {url}: {str(e)}")
|
||||
if not hasattr(e, 'msg'):
|
||||
e.msg = str(e)
|
||||
raise Exception(f"Failed to crawl {url}: {e.msg}")
|
||||
|
||||
def take_screenshot(self) -> str:
|
||||
try:
|
||||
|
||||
@@ -438,18 +438,17 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
links = {'internal': [], 'external': []}
|
||||
media = {'images': [], 'videos': [], 'audios': []}
|
||||
|
||||
def process_element(element: element.PageElement) -> None:
|
||||
def process_element(element: element.PageElement) -> bool:
|
||||
if isinstance(element, NavigableString):
|
||||
if isinstance(element, Comment):
|
||||
element.extract()
|
||||
return
|
||||
|
||||
# if not isinstance(element, element.Tag):
|
||||
# return
|
||||
return False
|
||||
|
||||
if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
|
||||
element.decompose()
|
||||
return
|
||||
return False
|
||||
|
||||
keep_element = False
|
||||
|
||||
if element.name == 'a' and element.get('href'):
|
||||
href = element['href']
|
||||
@@ -459,6 +458,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
links['external'].append(link_data)
|
||||
else:
|
||||
links['internal'].append(link_data)
|
||||
keep_element = True
|
||||
|
||||
elif element.name == 'img':
|
||||
media['images'].append({
|
||||
@@ -466,12 +466,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
'alt': element.get('alt'),
|
||||
'type': 'image'
|
||||
})
|
||||
alt_text = element.get('alt')
|
||||
if alt_text:
|
||||
element.replace_with(soup.new_string(alt_text))
|
||||
else:
|
||||
element.decompose()
|
||||
return
|
||||
return True # Always keep image elements
|
||||
|
||||
elif element.name in ['video', 'audio']:
|
||||
media[f"{element.name}s"].append({
|
||||
@@ -479,6 +474,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
'alt': element.get('alt'),
|
||||
'type': element.name
|
||||
})
|
||||
return True # Always keep video and audio elements
|
||||
|
||||
if element.name != 'pre':
|
||||
if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
|
||||
@@ -489,17 +485,26 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
elif element.name != 'img':
|
||||
element.attrs = {}
|
||||
|
||||
word_count = len(element.get_text(strip=True).split())
|
||||
if word_count < word_count_threshold:
|
||||
element.decompose()
|
||||
return
|
||||
|
||||
# Process children
|
||||
for child in list(element.children):
|
||||
process_element(child)
|
||||
if isinstance(child, NavigableString) and not isinstance(child, Comment):
|
||||
if len(child.strip()) > 0:
|
||||
keep_element = True
|
||||
else:
|
||||
if process_element(child):
|
||||
keep_element = True
|
||||
|
||||
|
||||
if not element.contents and not element.get_text(strip=True):
|
||||
# Check word count
|
||||
if not keep_element:
|
||||
word_count = len(element.get_text(strip=True).split())
|
||||
keep_element = word_count >= word_count_threshold
|
||||
|
||||
if not keep_element:
|
||||
element.decompose()
|
||||
|
||||
return keep_element
|
||||
|
||||
process_element(body)
|
||||
|
||||
def flatten_nested_elements(node):
|
||||
@@ -770,4 +775,6 @@ def wrap_text(draw, text, font, max_width):
|
||||
|
||||
def format_html(html_string):
|
||||
soup = BeautifulSoup(html_string, 'html.parser')
|
||||
return soup.prettify()
|
||||
return soup.prettify()
|
||||
|
||||
|
||||
|
||||
@@ -47,7 +47,7 @@ class WebCrawler:
|
||||
extraction_strategy= NoExtractionStrategy(),
|
||||
bypass_cache=False,
|
||||
verbose = False,
|
||||
warmup=True
|
||||
# warmup=True
|
||||
)
|
||||
self.ready = True
|
||||
print("[LOG] 🌞 WebCrawler is ready to crawl")
|
||||
@@ -129,43 +129,57 @@ class WebCrawler:
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||
raise ValueError("Unsupported chunking strategy")
|
||||
|
||||
if word_count_threshold < MIN_WORD_THRESHOLD:
|
||||
word_count_threshold = MIN_WORD_THRESHOLD
|
||||
try:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||
raise ValueError("Unsupported chunking strategy")
|
||||
|
||||
# if word_count_threshold < MIN_WORD_THRESHOLD:
|
||||
# word_count_threshold = MIN_WORD_THRESHOLD
|
||||
|
||||
word_count_threshold = max(word_count_threshold, 0)
|
||||
|
||||
# Check cache first
|
||||
cached = None
|
||||
screenshot_data = None
|
||||
extracted_content = None
|
||||
if not bypass_cache and not self.always_by_pass_cache:
|
||||
cached = get_cached_url(url)
|
||||
|
||||
if kwargs.get("warmup", True) and not self.ready:
|
||||
return None
|
||||
|
||||
if cached:
|
||||
html = cached[1]
|
||||
extracted_content = cached[4]
|
||||
if screenshot:
|
||||
screenshot_data = cached[9]
|
||||
if not screenshot_data:
|
||||
cached = None
|
||||
|
||||
if not cached or not html:
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
html = self.crawler_strategy.crawl(url)
|
||||
if screenshot:
|
||||
screenshot_data = self.crawler_strategy.take_screenshot()
|
||||
# Check cache first
|
||||
cached = None
|
||||
screenshot_data = None
|
||||
extracted_content = None
|
||||
if not bypass_cache and not self.always_by_pass_cache:
|
||||
cached = get_cached_url(url)
|
||||
|
||||
if kwargs.get("warmup", True) and not self.ready:
|
||||
return None
|
||||
|
||||
if cached:
|
||||
html = cached[1]
|
||||
extracted_content = cached[4]
|
||||
if screenshot:
|
||||
screenshot_data = cached[9]
|
||||
if not screenshot_data:
|
||||
cached = None
|
||||
|
||||
if not cached or not html:
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
t1 = time.time()
|
||||
html = self.crawler_strategy.crawl(url)
|
||||
t2 = time.time()
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
||||
if screenshot:
|
||||
screenshot_data = self.crawler_strategy.take_screenshot()
|
||||
|
||||
|
||||
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
|
||||
|
||||
crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
|
||||
crawl_result.success = bool(html)
|
||||
return crawl_result
|
||||
except Exception as e:
|
||||
if not hasattr(e, "msg"):
|
||||
e.msg = str(e)
|
||||
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
|
||||
return CrawlResult(url=url, html="", success=False, error_message=e.msg)
|
||||
|
||||
def process_html(
|
||||
self,
|
||||
@@ -189,7 +203,8 @@ class WebCrawler:
|
||||
# print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
|
||||
t1 = time.time()
|
||||
result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
|
||||
print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1} seconds")
|
||||
|
||||
if result is None:
|
||||
raise ValueError(f"Failed to extract content from the website: {url}")
|
||||
@@ -201,9 +216,6 @@ class WebCrawler:
|
||||
media = result.get("media", [])
|
||||
links = result.get("links", [])
|
||||
metadata = result.get("metadata", {})
|
||||
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")
|
||||
|
||||
if extracted_content is None:
|
||||
if verbose:
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
# Changelog
|
||||
|
||||
## [0.2.7] - 2024-06-27
|
||||
## [0.2.71] - 2024-06-26
|
||||
• Refactored `crawler_strategy.py` to handle exceptions and improve error messages
|
||||
• Improved `get_content_of_website_optimized` function in `utils.py` for better performance
|
||||
• Updated `utils.py` with latest changes
|
||||
• Migrated to `ChromeDriverManager` for resolving Chrome driver download issues
|
||||
|
||||
## [0.2.71] - 2024-06-25
|
||||
### Fixed
|
||||
- Made the extraction function twice as fast.
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Crawl4AI v0.2.7
|
||||
# Crawl4AI v0.2.71
|
||||
|
||||
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
|
||||
|
||||
|
||||
4
main.py
4
main.py
@@ -49,7 +49,9 @@ templates = Jinja2Templates(directory=__location__ + "/pages")
|
||||
@lru_cache()
|
||||
def get_crawler():
|
||||
# Initialize and return a WebCrawler instance
|
||||
return WebCrawler(verbose = True)
|
||||
crawler = WebCrawler(verbose = True)
|
||||
crawler.warmup()
|
||||
return crawler
|
||||
|
||||
class CrawlRequest(BaseModel):
|
||||
urls: List[str]
|
||||
|
||||
@@ -20,3 +20,4 @@ torch==2.3.1
|
||||
onnxruntime==1.18.0
|
||||
tokenizers==0.19.1
|
||||
pillow==10.3.0
|
||||
webdriver-manager==4.0.1
|
||||
2
setup.py
2
setup.py
@@ -33,7 +33,7 @@ class CustomInstallCommand(install):
|
||||
|
||||
setup(
|
||||
name="Crawl4AI",
|
||||
version="0.2.7",
|
||||
version="0.2.71",
|
||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||
long_description=open("README.md").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
|
||||
Reference in New Issue
Block a user