Compare commits
11 Commits
v0.2.73
...
image-desc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cf6c835e18 | ||
|
|
e5ecf291f3 | ||
|
|
9d0cafcfa6 | ||
|
|
7715623430 | ||
|
|
f5a4e80e2c | ||
|
|
8463aabedf | ||
|
|
7f30144ef2 | ||
|
|
fa5516aad6 | ||
|
|
ca0336af9e | ||
|
|
65ed1aeade | ||
|
|
4d283ab386 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -165,6 +165,8 @@ Crawl4AI.egg-info/
|
||||
Crawl4AI.egg-info/*
|
||||
crawler_data.db
|
||||
.vscode/
|
||||
.tests/
|
||||
.test_pads/
|
||||
test_pad.py
|
||||
test_pad*.py
|
||||
.data/
|
||||
|
||||
18
CHANGELOG.md
18
CHANGELOG.md
@@ -1,5 +1,23 @@
|
||||
# Changelog
|
||||
|
||||
## [v0.2.75] - 2024-07-19
|
||||
|
||||
Minor improvements for a more maintainable codebase:
|
||||
|
||||
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
|
||||
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
|
||||
|
||||
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
|
||||
|
||||
## [v0.2.74] - 2024-07-08
|
||||
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
||||
|
||||
- 💻 **UTF encoding fix**: Resolved the Windows \"charmap\" error by adding UTF encoding.
|
||||
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
|
||||
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
|
||||
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
|
||||
|
||||
|
||||
## [v0.2.73] - 2024-07-03
|
||||
|
||||
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Crawl4AI v0.2.73 🕷️🤖
|
||||
# Crawl4AI v0.2.75 🕷️🤖
|
||||
|
||||
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||
|
||||
@@ -3,6 +3,7 @@ import re
|
||||
from collections import Counter
|
||||
import string
|
||||
from .model_loader import load_nltk_punkt
|
||||
from .utils import *
|
||||
|
||||
# Define the abstract base class for chunking strategies
|
||||
class ChunkingStrategy(ABC):
|
||||
@@ -54,7 +55,7 @@ class TopicSegmentationChunking(ChunkingStrategy):
|
||||
|
||||
def __init__(self, num_keywords=3, **kwargs):
|
||||
import nltk as nl
|
||||
self.tokenizer = nl.toknize.TextTilingTokenizer()
|
||||
self.tokenizer = nl.tokenize.TextTilingTokenizer()
|
||||
self.num_keywords = num_keywords
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
|
||||
@@ -27,3 +27,13 @@ WORD_TOKEN_RATE = 1.3
|
||||
|
||||
# Threshold for the minimum number of word in a HTML tag to be considered
|
||||
MIN_WORD_THRESHOLD = 1
|
||||
|
||||
# Threshold for the Image extraction - Range is 1 to 6
|
||||
# Images are scored based on point based system, to filter based on usefulness. Points are assigned
|
||||
# to each image based on the following aspects.
|
||||
# If either height or width exceeds 150px
|
||||
# If image size is greater than 10Kb
|
||||
# If alt property is set
|
||||
# If image format is in jpg, png or webp
|
||||
# If image is in the first half of the total images extracted from the page
|
||||
IMAGE_SCORE_THRESHOLD = 2
|
||||
|
||||
@@ -8,6 +8,7 @@ from selenium.webdriver.chrome.options import Options
|
||||
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from urllib3.exceptions import MaxRetryError
|
||||
|
||||
from .config import *
|
||||
import logging, time
|
||||
@@ -18,7 +19,7 @@ from typing import List, Callable
|
||||
import requests
|
||||
import os
|
||||
from pathlib import Path
|
||||
from .utils import wrap_text
|
||||
from .utils import *
|
||||
|
||||
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
|
||||
logger.setLevel(logging.WARNING)
|
||||
@@ -73,7 +74,7 @@ class CloudCrawlerStrategy(CrawlerStrategy):
|
||||
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
|
||||
response = response.json()
|
||||
html = response["results"][0]["html"]
|
||||
return html
|
||||
return sanitize_input_encode(html)
|
||||
|
||||
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
||||
@@ -200,7 +201,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||
if os.path.exists(cache_file_path):
|
||||
with open(cache_file_path, "r") as f:
|
||||
return f.read()
|
||||
return sanitize_input_encode(f.read())
|
||||
|
||||
try:
|
||||
self.driver = self.execute_hook('before_get_url', self.driver)
|
||||
@@ -214,11 +215,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
WebDriverWait(self.driver, 10).until(
|
||||
EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
|
||||
)
|
||||
|
||||
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
||||
html = self._ensure_page_load() # self.driver.page_source
|
||||
|
||||
self.driver = self.execute_hook('after_get_url', self.driver)
|
||||
html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source
|
||||
can_not_be_done_headless = False # Look at my creativity for naming variables
|
||||
# TODO: Very ugly way for now but it works
|
||||
if not kwargs.get('bypass_headless', False) and html == "<html><head></head><body></body></html>":
|
||||
|
||||
# TODO: Very ugly approach, but promise to change it!
|
||||
if kwargs.get('bypass_headless', False) or html == "<html><head></head><body></body></html>":
|
||||
print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
|
||||
can_not_be_done_headless = True
|
||||
options = Options()
|
||||
@@ -227,11 +232,10 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
options.add_argument("--window-size=5,5")
|
||||
driver = webdriver.Chrome(service=self.service, options=options)
|
||||
driver.get(url)
|
||||
html = driver.page_source
|
||||
self.driver = self.execute_hook('after_get_url', driver)
|
||||
html = sanitize_input_encode(driver.page_source)
|
||||
driver.quit()
|
||||
|
||||
self.driver = self.execute_hook('after_get_url', self.driver)
|
||||
|
||||
# Execute JS code if provided
|
||||
if self.js_code and type(self.js_code) == str:
|
||||
self.driver.execute_script(self.js_code)
|
||||
@@ -247,12 +251,12 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
)
|
||||
|
||||
if not can_not_be_done_headless:
|
||||
html = self.driver.page_source
|
||||
html = sanitize_input_encode(self.driver.page_source)
|
||||
self.driver = self.execute_hook('before_return_html', self.driver, html)
|
||||
|
||||
# Store in cache
|
||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||
with open(cache_file_path, "w") as f:
|
||||
with open(cache_file_path, "w", encoding="utf-8") as f:
|
||||
f.write(html)
|
||||
|
||||
if self.verbose:
|
||||
@@ -261,16 +265,16 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
return html
|
||||
except InvalidArgumentException:
|
||||
if not hasattr(e, 'msg'):
|
||||
e.msg = str(e)
|
||||
e.msg = sanitize_input_encode(str(e))
|
||||
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
||||
except WebDriverException as e:
|
||||
# If e does nlt have msg attribute create it and set it to str(e)
|
||||
if not hasattr(e, 'msg'):
|
||||
e.msg = str(e)
|
||||
e.msg = sanitize_input_encode(str(e))
|
||||
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
|
||||
except Exception as e:
|
||||
if not hasattr(e, 'msg'):
|
||||
e.msg = str(e)
|
||||
e.msg = sanitize_input_encode(str(e))
|
||||
raise Exception(f"Failed to crawl {url}: {e.msg}")
|
||||
|
||||
def take_screenshot(self) -> str:
|
||||
@@ -288,18 +292,25 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
# Open the screenshot with PIL
|
||||
image = Image.open(BytesIO(screenshot))
|
||||
|
||||
# Convert image to RGB mode
|
||||
rgb_image = image.convert('RGB')
|
||||
|
||||
# Convert to JPEG and compress
|
||||
buffered = BytesIO()
|
||||
image.save(buffered, format="JPEG", quality=85)
|
||||
rgb_image.save(buffered, format="JPEG", quality=85)
|
||||
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] 📸 Screenshot taken and converted to base64")
|
||||
|
||||
return img_base64
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print(f"[ERROR] Failed to take screenshot: {str(e)}")
|
||||
return ""
|
||||
|
||||
except Exception as e:
|
||||
error_message = f"Failed to take screenshot: {str(e)}"
|
||||
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
|
||||
print(error_message)
|
||||
|
||||
# Generate an image with black background
|
||||
|
||||
@@ -20,7 +20,7 @@ def init_db():
|
||||
extracted_content TEXT,
|
||||
success BOOLEAN,
|
||||
media TEXT DEFAULT "{}",
|
||||
link TEXT DEFAULT "{}",
|
||||
links TEXT DEFAULT "{}",
|
||||
metadata TEXT DEFAULT "{}",
|
||||
screenshot TEXT DEFAULT ""
|
||||
)
|
||||
@@ -127,6 +127,9 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
|
||||
print(f"Error updating existing records: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
init_db() # Initialize the database if not already initialized
|
||||
alter_db_add_screenshot("metadata") # Add the new column to the table
|
||||
update_existing_records("metadata") # Update existing records to set the new column to an empty string
|
||||
# Delete the existing database file
|
||||
if os.path.exists(DB_PATH):
|
||||
os.remove(DB_PATH)
|
||||
init_db()
|
||||
# alter_db_add_screenshot("COL_NAME")
|
||||
|
||||
|
||||
@@ -116,7 +116,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
for block in blocks:
|
||||
block['error'] = False
|
||||
except Exception as e:
|
||||
print("Error extracting blocks:", str(e))
|
||||
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
||||
blocks = parsed
|
||||
if unparsed:
|
||||
@@ -192,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
# Sequential processing with a delay
|
||||
for ix, section in enumerate(merged_sections):
|
||||
extract_func = partial(self.extract, url)
|
||||
extracted_content.extend(extract_func(ix, section))
|
||||
extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
|
||||
time.sleep(0.5) # 500 ms delay between each processing
|
||||
else:
|
||||
# Parallel processing using ThreadPoolExecutor
|
||||
@@ -202,10 +201,21 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
|
||||
with ThreadPoolExecutor(max_workers=4) as executor:
|
||||
extract_func = partial(self.extract, url)
|
||||
futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)]
|
||||
futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
|
||||
|
||||
for future in as_completed(futures):
|
||||
extracted_content.extend(future.result())
|
||||
try:
|
||||
extracted_content.extend(future.result())
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print(f"Error in thread execution: {e}")
|
||||
# Add error information to extracted_content
|
||||
extracted_content.append({
|
||||
"index": 0,
|
||||
"error": True,
|
||||
"tags": ["error"],
|
||||
"content": str(e)
|
||||
})
|
||||
|
||||
|
||||
return extracted_content
|
||||
|
||||
@@ -3,7 +3,7 @@ from pathlib import Path
|
||||
import subprocess, os
|
||||
import shutil
|
||||
import tarfile
|
||||
from crawl4ai.config import MODEL_REPO_BRANCH
|
||||
from .model_loader import *
|
||||
import argparse
|
||||
import urllib.request
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
@@ -11,6 +11,9 @@ from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||
from .config import *
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from urllib.parse import urljoin
|
||||
import requests
|
||||
from requests.exceptions import InvalidSchema
|
||||
|
||||
class InvalidCSSSelectorError(Exception):
|
||||
pass
|
||||
@@ -96,6 +99,16 @@ def sanitize_html(html):
|
||||
|
||||
return sanitized_html
|
||||
|
||||
def sanitize_input_encode(text: str) -> str:
|
||||
"""Sanitize input to handle potential encoding issues."""
|
||||
try:
|
||||
# Attempt to encode and decode as UTF-8 to handle potential encoding issues
|
||||
return text.encode('utf-8', errors='ignore').decode('utf-8')
|
||||
except UnicodeEncodeError as e:
|
||||
print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
|
||||
# Fall back to ASCII if UTF-8 fails
|
||||
return text.encode('ascii', errors='ignore').decode('ascii')
|
||||
|
||||
def escape_json_string(s):
|
||||
"""
|
||||
Escapes characters in a string to be JSON safe.
|
||||
@@ -437,6 +450,101 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
links = {'internal': [], 'external': []}
|
||||
media = {'images': [], 'videos': [], 'audios': []}
|
||||
|
||||
def process_image(img, url, index, total_images):
|
||||
#Check if an image has valid display and inside undesired html elements
|
||||
def is_valid_image(img, parent, parent_classes):
|
||||
style = img.get('style', '')
|
||||
src = img.get('src', '')
|
||||
classes_to_check = ['button', 'icon', 'logo']
|
||||
tags_to_check = ['button', 'input']
|
||||
return all([
|
||||
'display:none' not in style,
|
||||
src,
|
||||
not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
|
||||
parent.name not in tags_to_check
|
||||
])
|
||||
|
||||
#Score an image for it's usefulness
|
||||
def score_image_for_usefulness(img, base_url, index, images_count):
|
||||
# Function to parse image height/width value and units
|
||||
def parse_dimension(dimension):
|
||||
if dimension:
|
||||
match = re.match(r"(\d+)(\D*)", dimension)
|
||||
if match:
|
||||
number = int(match.group(1))
|
||||
unit = match.group(2) or 'px' # Default unit is 'px' if not specified
|
||||
return number, unit
|
||||
return None, None
|
||||
|
||||
# Fetch image file metadata to extract size and extension
|
||||
def fetch_image_file_size(img, base_url):
|
||||
#If src is relative path construct full URL, if not it may be CDN URL
|
||||
img_url = urljoin(base_url,img.get('src'))
|
||||
try:
|
||||
response = requests.head(img_url)
|
||||
if response.status_code == 200:
|
||||
return response.headers.get('Content-Length',None)
|
||||
else:
|
||||
print(f"Failed to retrieve file size for {img_url}")
|
||||
return None
|
||||
except InvalidSchema as e:
|
||||
return None
|
||||
finally:
|
||||
return
|
||||
|
||||
image_height = img.get('height')
|
||||
height_value, height_unit = parse_dimension(image_height)
|
||||
image_width = img.get('width')
|
||||
width_value, width_unit = parse_dimension(image_width)
|
||||
image_size = int(fetch_image_file_size(img,base_url) or 0)
|
||||
image_format = os.path.splitext(img.get('src',''))[1].lower()
|
||||
score = 0
|
||||
if height_value:
|
||||
if height_unit == 'px' and height_value > 150:
|
||||
score += 1
|
||||
if height_unit in ['%','vh','vmin','vmax'] and height_value >30:
|
||||
score += 1
|
||||
if width_value:
|
||||
if width_unit == 'px' and width_value > 150:
|
||||
score += 1
|
||||
if width_unit in ['%','vh','vmin','vmax'] and width_value >30:
|
||||
score += 1
|
||||
if image_size > 10000:
|
||||
score += 1
|
||||
if img.get('alt') != '':
|
||||
score+=1
|
||||
if any(image_format==format for format in ['jpg','png','webp']):
|
||||
score+=1
|
||||
if index/images_count<0.5:
|
||||
score+=1
|
||||
return score
|
||||
|
||||
# Extract meaningful text for images from closest parent
|
||||
def find_closest_parent_with_useful_text(tag):
|
||||
current_tag = tag
|
||||
while current_tag:
|
||||
current_tag = current_tag.parent
|
||||
# Get the text content of the parent tag
|
||||
if current_tag:
|
||||
text_content = current_tag.get_text(separator=' ',strip=True)
|
||||
# Check if the text content has at least word_count_threshold
|
||||
if len(text_content.split()) >= word_count_threshold:
|
||||
return text_content
|
||||
return None
|
||||
|
||||
if not is_valid_image(img, img.parent, img.parent.get('class', [])):
|
||||
return None
|
||||
score = score_image_for_usefulness(img, url, index, total_images)
|
||||
if score <= IMAGE_SCORE_THRESHOLD:
|
||||
return None
|
||||
return {
|
||||
'src': img.get('src', ''),
|
||||
'alt': img.get('alt', ''),
|
||||
'desc': find_closest_parent_with_useful_text(img),
|
||||
'score': score,
|
||||
'type': 'image'
|
||||
}
|
||||
|
||||
def process_element(element: element.PageElement) -> bool:
|
||||
try:
|
||||
if isinstance(element, NavigableString):
|
||||
@@ -461,11 +569,6 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
keep_element = True
|
||||
|
||||
elif element.name == 'img':
|
||||
media['images'].append({
|
||||
'src': element.get('src'),
|
||||
'alt': element.get('alt'),
|
||||
'type': 'image'
|
||||
})
|
||||
return True # Always keep image elements
|
||||
|
||||
elif element.name in ['video', 'audio']:
|
||||
@@ -508,6 +611,14 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
print('Error processing element:', str(e))
|
||||
return False
|
||||
|
||||
#process images by filtering and extracting contextual text from the page
|
||||
imgs = body.find_all('img')
|
||||
media['images'] = [
|
||||
result for result in
|
||||
(process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs))
|
||||
if result is not None
|
||||
]
|
||||
|
||||
process_element(body)
|
||||
|
||||
def flatten_nested_elements(node):
|
||||
@@ -664,7 +775,6 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
|
||||
for block in blocks:
|
||||
block['error'] = False
|
||||
except Exception as e:
|
||||
print("Error extracting blocks:", str(e))
|
||||
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
||||
blocks = parsed
|
||||
# Append all unparsed segments as onr error block and content is list of unparsed segments
|
||||
@@ -710,7 +820,6 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
|
||||
blocks = json.loads(blocks)
|
||||
|
||||
except Exception as e:
|
||||
print("Error extracting blocks:", str(e))
|
||||
blocks = [{
|
||||
"index": 0,
|
||||
"tags": ["error"],
|
||||
|
||||
@@ -155,8 +155,8 @@ class WebCrawler:
|
||||
return None
|
||||
|
||||
if cached:
|
||||
html = cached[1]
|
||||
extracted_content = cached[4]
|
||||
html = sanitize_input_encode(cached[1])
|
||||
extracted_content = sanitize_input_encode(cached[4])
|
||||
if screenshot:
|
||||
screenshot_data = cached[9]
|
||||
if not screenshot_data:
|
||||
@@ -166,7 +166,7 @@ class WebCrawler:
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
t1 = time.time()
|
||||
html = self.crawler_strategy.crawl(url, **kwargs)
|
||||
html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
|
||||
t2 = time.time()
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
||||
@@ -213,8 +213,8 @@ class WebCrawler:
|
||||
except InvalidCSSSelectorError as e:
|
||||
raise ValueError(str(e))
|
||||
|
||||
cleaned_html = result.get("cleaned_html", "")
|
||||
markdown = result.get("markdown", "")
|
||||
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||
markdown = sanitize_input_encode(result.get("markdown", ""))
|
||||
media = result.get("media", [])
|
||||
links = result.get("links", [])
|
||||
metadata = result.get("metadata", {})
|
||||
|
||||
@@ -36,5 +36,5 @@ model_fees = json.loads(result.extracted_content)
|
||||
|
||||
print(len(model_fees))
|
||||
|
||||
with open(".data/data.json", "w") as f:
|
||||
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||
f.write(result.extracted_content)
|
||||
@@ -249,15 +249,40 @@ def using_crawler_hooks(crawler):
|
||||
|
||||
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||
|
||||
crawler.set_hook('on_driver_created', on_driver_created)
|
||||
crawler.set_hook('before_get_url', before_get_url)
|
||||
crawler.set_hook('after_get_url', after_get_url)
|
||||
crawler.set_hook('before_return_html', before_return_html)
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||
crawler_strategy.set_hook('on_driver_created', on_driver_created)
|
||||
crawler_strategy.set_hook('before_get_url', before_get_url)
|
||||
crawler_strategy.set_hook('after_get_url', after_get_url)
|
||||
crawler_strategy.set_hook('before_return_html', before_return_html)
|
||||
|
||||
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||
crawler.warmup()
|
||||
result = crawler.run(url="https://example.com")
|
||||
|
||||
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||
print_result(result= result)
|
||||
|
||||
def using_crawler_hooks_dleay_example(crawler):
|
||||
def delay(driver):
|
||||
print("Delaying for 5 seconds...")
|
||||
time.sleep(5)
|
||||
print("Resuming...")
|
||||
|
||||
def create_crawler():
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||
crawler_strategy.set_hook('after_get_url', delay)
|
||||
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||
crawler.warmup()
|
||||
return crawler
|
||||
|
||||
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")
|
||||
crawler = create_crawler()
|
||||
result = crawler.run(url="https://google.com", bypass_cache=True)
|
||||
|
||||
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
||||
|
||||
@@ -42,5 +42,5 @@ page_summary = json.loads(result.extracted_content)
|
||||
|
||||
print(page_summary)
|
||||
|
||||
with open(".data/page_summary.json", "w") as f:
|
||||
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
|
||||
f.write(result.extracted_content)
|
||||
|
||||
@@ -1,5 +1,23 @@
|
||||
# Changelog
|
||||
|
||||
## [v0.2.75] - 2024-07-19
|
||||
|
||||
Minor improvements for a more maintainable codebase:
|
||||
|
||||
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
|
||||
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
|
||||
|
||||
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
|
||||
|
||||
|
||||
## v0.2.74 - 2024-07-08
|
||||
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
||||
|
||||
- 💻 **UTF encoding fix**: Resolved the Windows \"charmap\" error by adding UTF encoding.
|
||||
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
|
||||
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
|
||||
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
|
||||
|
||||
## [v0.2.73] - 2024-07-03
|
||||
|
||||
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
<div class="form-group">
|
||||
<button class="btn btn-default" type="submit">Submit</button>
|
||||
</div>
|
||||
|
||||
</fieldset>
|
||||
</form>
|
||||
|
||||
@@ -93,6 +94,10 @@
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<div id="error" class="error-message" style="display: none; margin-top:1em;">
|
||||
<div class="terminal-alert terminal-alert-error"></div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
function showTab(tabId) {
|
||||
const tabs = document.querySelectorAll('.tab-content');
|
||||
@@ -162,7 +167,17 @@
|
||||
},
|
||||
body: JSON.stringify(data)
|
||||
})
|
||||
.then(response => response.json())
|
||||
.then(response => {
|
||||
if (!response.ok) {
|
||||
if (response.status === 429) {
|
||||
return response.json().then(err => {
|
||||
throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
|
||||
});
|
||||
}
|
||||
throw new Error('Network response was not ok');
|
||||
}
|
||||
return response.json();
|
||||
})
|
||||
.then(data => {
|
||||
data = data.results[0]; // Only one URL is requested
|
||||
document.getElementById('loading').style.display = 'none';
|
||||
@@ -187,11 +202,29 @@ result = crawler.run(
|
||||
print(result)
|
||||
`;
|
||||
redo(document.getElementById('pythonCode'), pythonCode);
|
||||
document.getElementById('error').style.display = 'none';
|
||||
})
|
||||
.catch(error => {
|
||||
document.getElementById('loading').style.display = 'none';
|
||||
document.getElementById('response').style.display = 'block';
|
||||
document.getElementById('markdownContent').textContent = 'Error: ' + error;
|
||||
document.getElementById('error').style.display = 'block';
|
||||
let errorMessage = 'An unexpected error occurred. Please try again later.';
|
||||
|
||||
if (error.status === 429) {
|
||||
const details = error.details;
|
||||
if (details.retry_after) {
|
||||
errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
|
||||
} else if (details.reset_at) {
|
||||
const resetTime = new Date(details.reset_at);
|
||||
const waitTime = Math.ceil((resetTime - new Date()) / 1000);
|
||||
errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
|
||||
} else {
|
||||
errorMessage = `Rate limit exceeded. Please try again later.`;
|
||||
}
|
||||
} else if (error.message) {
|
||||
errorMessage = error.message;
|
||||
}
|
||||
|
||||
document.querySelector('#error .terminal-alert').textContent = errorMessage;
|
||||
});
|
||||
});
|
||||
</script>
|
||||
|
||||
@@ -14,6 +14,9 @@ Let's see how we can customize the crawler using hooks! In this example, we'll:
|
||||
### Hook Definitions
|
||||
|
||||
```python
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.crawler_strategy import *
|
||||
|
||||
def on_driver_created(driver):
|
||||
print("[HOOK] on_driver_created")
|
||||
# Example customization: maximize the window
|
||||
@@ -66,12 +69,13 @@ def before_return_html(driver, html):
|
||||
|
||||
```python
|
||||
print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||
crawler = WebCrawler(verbose=True)
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||
crawler_strategy.set_hook('on_driver_created', on_driver_created)
|
||||
crawler_strategy.set_hook('before_get_url', before_get_url)
|
||||
crawler_strategy.set_hook('after_get_url', after_get_url)
|
||||
crawler_strategy.set_hook('before_return_html', before_return_html)
|
||||
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||
crawler.warmup()
|
||||
crawler.set_hook('on_driver_created', on_driver_created)
|
||||
crawler.set_hook('before_get_url', before_get_url)
|
||||
crawler.set_hook('after_get_url', after_get_url)
|
||||
crawler.set_hook('before_return_html', before_return_html)
|
||||
|
||||
result = crawler.run(url="https://example.com")
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ model_fees = json.loads(result.extracted_content)
|
||||
|
||||
print(len(model_fees))
|
||||
|
||||
with open(".data/data.json", "w") as f:
|
||||
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||
f.write(result.extracted_content)
|
||||
```
|
||||
|
||||
@@ -71,7 +71,7 @@ model_fees = json.loads(result.extracted_content)
|
||||
|
||||
print(len(model_fees))
|
||||
|
||||
with open(".data/data.json", "w") as f:
|
||||
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||
f.write(result.extracted_content)
|
||||
```
|
||||
|
||||
|
||||
@@ -91,7 +91,7 @@ This example demonstrates how to use `Crawl4AI` to extract a summary from a web
|
||||
Save the extracted data to a file for further use.
|
||||
|
||||
```python
|
||||
with open(".data/page_summary.json", "w") as f:
|
||||
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
|
||||
f.write(result.extracted_content)
|
||||
```
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Crawl4AI v0.2.73
|
||||
# Crawl4AI v0.2.75
|
||||
|
||||
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
|
||||
|
||||
|
||||
@@ -176,41 +176,29 @@ print(f"JavaScript Code (Load More button) result: {result}")
|
||||
Let's see how we can customize the crawler using hooks!
|
||||
|
||||
```python
|
||||
def on_driver_created(driver):
|
||||
print("[HOOK] on_driver_created")
|
||||
driver.maximize_window()
|
||||
driver.get('https://example.com/login')
|
||||
driver.find_element(By.NAME, 'username').send_keys('testuser')
|
||||
driver.find_element(By.NAME, 'password').send_keys('password123')
|
||||
driver.find_element(By.NAME, 'login').click()
|
||||
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
|
||||
return driver
|
||||
import time
|
||||
|
||||
def before_get_url(driver):
|
||||
print("[HOOK] before_get_url")
|
||||
driver.execute_cdp_cmd('Network.enable', {})
|
||||
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
|
||||
return driver
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.crawler_strategy import *
|
||||
|
||||
def after_get_url(driver):
|
||||
print("[HOOK] after_get_url")
|
||||
print(driver.current_url)
|
||||
return driver
|
||||
def delay(driver):
|
||||
print("Delaying for 5 seconds...")
|
||||
time.sleep(5)
|
||||
print("Resuming...")
|
||||
|
||||
def create_crawler():
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||
crawler_strategy.set_hook('after_get_url', delay)
|
||||
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||
crawler.warmup()
|
||||
return crawler
|
||||
|
||||
def before_return_html(driver, html):
|
||||
print("[HOOK] before_return_html")
|
||||
print(len(html))
|
||||
return driver
|
||||
|
||||
crawler.set_hook('on_driver_created', on_driver_created)
|
||||
crawler.set_hook('before_get_url', before_get_url)
|
||||
crawler.set_hook('after_get_url', after_get_url)
|
||||
crawler.set_hook('before_return_html', before_return_html)
|
||||
|
||||
result = crawler.run(url="https://example.com")
|
||||
print(f"Crawler Hooks result: {result}")
|
||||
crawler = create_crawler()
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
|
||||
```
|
||||
|
||||
check [Hooks](examples/hooks_auth.md) for more examples.
|
||||
|
||||
## Congratulations! 🎉
|
||||
|
||||
You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️
|
||||
|
||||
83
main.py
83
main.py
@@ -22,6 +22,15 @@ from typing import List, Optional
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.database import get_total_count, clear_db
|
||||
|
||||
import time
|
||||
from slowapi import Limiter, _rate_limit_exceeded_handler
|
||||
from slowapi.util import get_remote_address
|
||||
from slowapi.errors import RateLimitExceeded
|
||||
|
||||
# load .env file
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
# Configuration
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
MAX_CONCURRENT_REQUESTS = 10 # Adjust this to change the maximum concurrent requests
|
||||
@@ -30,6 +39,78 @@ lock = asyncio.Lock()
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
# Initialize rate limiter
|
||||
def rate_limit_key_func(request: Request):
|
||||
access_token = request.headers.get("access-token")
|
||||
if access_token == os.environ.get('ACCESS_TOKEN'):
|
||||
return None
|
||||
return get_remote_address(request)
|
||||
|
||||
limiter = Limiter(key_func=rate_limit_key_func)
|
||||
app.state.limiter = limiter
|
||||
|
||||
# Dictionary to store last request times for each client
|
||||
last_request_times = {}
|
||||
last_rate_limit = {}
|
||||
|
||||
|
||||
def get_rate_limit():
|
||||
limit = os.environ.get('ACCESS_PER_MIN', "5")
|
||||
return f"{limit}/minute"
|
||||
|
||||
# Custom rate limit exceeded handler
|
||||
async def custom_rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded) -> JSONResponse:
|
||||
if request.client.host not in last_rate_limit or time.time() - last_rate_limit[request.client.host] > 60:
|
||||
last_rate_limit[request.client.host] = time.time()
|
||||
retry_after = 60 - (time.time() - last_rate_limit[request.client.host])
|
||||
reset_at = time.time() + retry_after
|
||||
return JSONResponse(
|
||||
status_code=429,
|
||||
content={
|
||||
"detail": "Rate limit exceeded",
|
||||
"limit": str(exc.limit.limit),
|
||||
"retry_after": retry_after,
|
||||
'reset_at': reset_at,
|
||||
"message": f"You have exceeded the rate limit of {exc.limit.limit}."
|
||||
}
|
||||
)
|
||||
|
||||
app.add_exception_handler(RateLimitExceeded, custom_rate_limit_exceeded_handler)
|
||||
|
||||
|
||||
# Middleware for token-based bypass and per-request limit
|
||||
class RateLimitMiddleware(BaseHTTPMiddleware):
|
||||
async def dispatch(self, request: Request, call_next):
|
||||
SPAN = int(os.environ.get('ACCESS_TIME_SPAN', 10))
|
||||
access_token = request.headers.get("access-token")
|
||||
if access_token == os.environ.get('ACCESS_TOKEN'):
|
||||
return await call_next(request)
|
||||
|
||||
path = request.url.path
|
||||
if path in ["/crawl", "/old"]:
|
||||
client_ip = request.client.host
|
||||
current_time = time.time()
|
||||
|
||||
# Check time since last request
|
||||
if client_ip in last_request_times:
|
||||
time_since_last_request = current_time - last_request_times[client_ip]
|
||||
if time_since_last_request < SPAN:
|
||||
return JSONResponse(
|
||||
status_code=429,
|
||||
content={
|
||||
"detail": "Too many requests",
|
||||
"message": "Rate limit exceeded. Please wait 10 seconds between requests.",
|
||||
"retry_after": max(0, SPAN - time_since_last_request),
|
||||
"reset_at": current_time + max(0, SPAN - time_since_last_request),
|
||||
}
|
||||
)
|
||||
|
||||
last_request_times[client_ip] = current_time
|
||||
|
||||
return await call_next(request)
|
||||
|
||||
app.add_middleware(RateLimitMiddleware)
|
||||
|
||||
# CORS configuration
|
||||
origins = ["*"] # Allow all origins
|
||||
app.add_middleware(
|
||||
@@ -73,6 +154,7 @@ def read_root():
|
||||
return RedirectResponse(url="/mkdocs")
|
||||
|
||||
@app.get("/old", response_class=HTMLResponse)
|
||||
@limiter.limit(get_rate_limit())
|
||||
async def read_index(request: Request):
|
||||
partials_dir = os.path.join(__location__, "pages", "partial")
|
||||
partials = {}
|
||||
@@ -107,6 +189,7 @@ def import_strategy(module_name: str, class_name: str, *args, **kwargs):
|
||||
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
|
||||
|
||||
@app.post("/crawl")
|
||||
@limiter.limit(get_rate_limit())
|
||||
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
||||
logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
|
||||
global current_requests
|
||||
|
||||
0
middlewares.py
Normal file
0
middlewares.py
Normal file
17
setup.py
17
setup.py
@@ -1,13 +1,18 @@
|
||||
from setuptools import setup, find_packages
|
||||
import os
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
from setuptools.command.install import install
|
||||
import shutil
|
||||
|
||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||
crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(crawl4ai_folder, exist_ok=True)
|
||||
os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True)
|
||||
# If the folder already exists, remove the cache folder
|
||||
crawl4ai_folder = Path.home() / ".crawl4ai"
|
||||
cache_folder = crawl4ai_folder / "cache"
|
||||
|
||||
if cache_folder.exists():
|
||||
shutil.rmtree(cache_folder)
|
||||
|
||||
crawl4ai_folder.mkdir(exist_ok=True)
|
||||
cache_folder.mkdir(exist_ok=True)
|
||||
|
||||
# Read the requirements from requirements.txt
|
||||
with open("requirements.txt") as f:
|
||||
@@ -20,7 +25,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran
|
||||
|
||||
setup(
|
||||
name="Crawl4AI",
|
||||
version="0.2.73",
|
||||
version="0.2.74",
|
||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||
long_description=open("README.md", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
|
||||
Reference in New Issue
Block a user