Compare commits
11 Commits
v0.2.73
...
image-desc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cf6c835e18 | ||
|
|
e5ecf291f3 | ||
|
|
9d0cafcfa6 | ||
|
|
7715623430 | ||
|
|
f5a4e80e2c | ||
|
|
8463aabedf | ||
|
|
7f30144ef2 | ||
|
|
fa5516aad6 | ||
|
|
ca0336af9e | ||
|
|
65ed1aeade | ||
|
|
4d283ab386 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -165,6 +165,8 @@ Crawl4AI.egg-info/
|
|||||||
Crawl4AI.egg-info/*
|
Crawl4AI.egg-info/*
|
||||||
crawler_data.db
|
crawler_data.db
|
||||||
.vscode/
|
.vscode/
|
||||||
|
.tests/
|
||||||
|
.test_pads/
|
||||||
test_pad.py
|
test_pad.py
|
||||||
test_pad*.py
|
test_pad*.py
|
||||||
.data/
|
.data/
|
||||||
|
|||||||
18
CHANGELOG.md
18
CHANGELOG.md
@@ -1,5 +1,23 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [v0.2.75] - 2024-07-19
|
||||||
|
|
||||||
|
Minor improvements for a more maintainable codebase:
|
||||||
|
|
||||||
|
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
|
||||||
|
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
|
||||||
|
|
||||||
|
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
|
||||||
|
|
||||||
|
## [v0.2.74] - 2024-07-08
|
||||||
|
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
||||||
|
|
||||||
|
- 💻 **UTF encoding fix**: Resolved the Windows \"charmap\" error by adding UTF encoding.
|
||||||
|
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
|
||||||
|
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
|
||||||
|
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
|
||||||
|
|
||||||
|
|
||||||
## [v0.2.73] - 2024-07-03
|
## [v0.2.73] - 2024-07-03
|
||||||
|
|
||||||
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# Crawl4AI v0.2.73 🕷️🤖
|
# Crawl4AI v0.2.75 🕷️🤖
|
||||||
|
|
||||||
[](https://github.com/unclecode/crawl4ai/stargazers)
|
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||||
[](https://github.com/unclecode/crawl4ai/network/members)
|
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import re
|
|||||||
from collections import Counter
|
from collections import Counter
|
||||||
import string
|
import string
|
||||||
from .model_loader import load_nltk_punkt
|
from .model_loader import load_nltk_punkt
|
||||||
|
from .utils import *
|
||||||
|
|
||||||
# Define the abstract base class for chunking strategies
|
# Define the abstract base class for chunking strategies
|
||||||
class ChunkingStrategy(ABC):
|
class ChunkingStrategy(ABC):
|
||||||
@@ -54,7 +55,7 @@ class TopicSegmentationChunking(ChunkingStrategy):
|
|||||||
|
|
||||||
def __init__(self, num_keywords=3, **kwargs):
|
def __init__(self, num_keywords=3, **kwargs):
|
||||||
import nltk as nl
|
import nltk as nl
|
||||||
self.tokenizer = nl.toknize.TextTilingTokenizer()
|
self.tokenizer = nl.tokenize.TextTilingTokenizer()
|
||||||
self.num_keywords = num_keywords
|
self.num_keywords = num_keywords
|
||||||
|
|
||||||
def chunk(self, text: str) -> list:
|
def chunk(self, text: str) -> list:
|
||||||
|
|||||||
@@ -27,3 +27,13 @@ WORD_TOKEN_RATE = 1.3
|
|||||||
|
|
||||||
# Threshold for the minimum number of word in a HTML tag to be considered
|
# Threshold for the minimum number of word in a HTML tag to be considered
|
||||||
MIN_WORD_THRESHOLD = 1
|
MIN_WORD_THRESHOLD = 1
|
||||||
|
|
||||||
|
# Threshold for the Image extraction - Range is 1 to 6
|
||||||
|
# Images are scored based on point based system, to filter based on usefulness. Points are assigned
|
||||||
|
# to each image based on the following aspects.
|
||||||
|
# If either height or width exceeds 150px
|
||||||
|
# If image size is greater than 10Kb
|
||||||
|
# If alt property is set
|
||||||
|
# If image format is in jpg, png or webp
|
||||||
|
# If image is in the first half of the total images extracted from the page
|
||||||
|
IMAGE_SCORE_THRESHOLD = 2
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from selenium.webdriver.chrome.options import Options
|
|||||||
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
||||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
from urllib3.exceptions import MaxRetryError
|
||||||
|
|
||||||
from .config import *
|
from .config import *
|
||||||
import logging, time
|
import logging, time
|
||||||
@@ -18,7 +19,7 @@ from typing import List, Callable
|
|||||||
import requests
|
import requests
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from .utils import wrap_text
|
from .utils import *
|
||||||
|
|
||||||
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
|
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
|
||||||
logger.setLevel(logging.WARNING)
|
logger.setLevel(logging.WARNING)
|
||||||
@@ -73,7 +74,7 @@ class CloudCrawlerStrategy(CrawlerStrategy):
|
|||||||
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
|
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
|
||||||
response = response.json()
|
response = response.json()
|
||||||
html = response["results"][0]["html"]
|
html = response["results"][0]["html"]
|
||||||
return html
|
return sanitize_input_encode(html)
|
||||||
|
|
||||||
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||||
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
||||||
@@ -200,7 +201,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||||
if os.path.exists(cache_file_path):
|
if os.path.exists(cache_file_path):
|
||||||
with open(cache_file_path, "r") as f:
|
with open(cache_file_path, "r") as f:
|
||||||
return f.read()
|
return sanitize_input_encode(f.read())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.driver = self.execute_hook('before_get_url', self.driver)
|
self.driver = self.execute_hook('before_get_url', self.driver)
|
||||||
@@ -214,11 +215,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
WebDriverWait(self.driver, 10).until(
|
WebDriverWait(self.driver, 10).until(
|
||||||
EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
|
EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
|
||||||
)
|
)
|
||||||
|
|
||||||
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
||||||
html = self._ensure_page_load() # self.driver.page_source
|
|
||||||
|
self.driver = self.execute_hook('after_get_url', self.driver)
|
||||||
|
html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source
|
||||||
can_not_be_done_headless = False # Look at my creativity for naming variables
|
can_not_be_done_headless = False # Look at my creativity for naming variables
|
||||||
# TODO: Very ugly way for now but it works
|
|
||||||
if not kwargs.get('bypass_headless', False) and html == "<html><head></head><body></body></html>":
|
# TODO: Very ugly approach, but promise to change it!
|
||||||
|
if kwargs.get('bypass_headless', False) or html == "<html><head></head><body></body></html>":
|
||||||
print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
|
print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
|
||||||
can_not_be_done_headless = True
|
can_not_be_done_headless = True
|
||||||
options = Options()
|
options = Options()
|
||||||
@@ -227,11 +232,10 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
options.add_argument("--window-size=5,5")
|
options.add_argument("--window-size=5,5")
|
||||||
driver = webdriver.Chrome(service=self.service, options=options)
|
driver = webdriver.Chrome(service=self.service, options=options)
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
html = driver.page_source
|
self.driver = self.execute_hook('after_get_url', driver)
|
||||||
|
html = sanitize_input_encode(driver.page_source)
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
||||||
self.driver = self.execute_hook('after_get_url', self.driver)
|
|
||||||
|
|
||||||
# Execute JS code if provided
|
# Execute JS code if provided
|
||||||
if self.js_code and type(self.js_code) == str:
|
if self.js_code and type(self.js_code) == str:
|
||||||
self.driver.execute_script(self.js_code)
|
self.driver.execute_script(self.js_code)
|
||||||
@@ -247,12 +251,12 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if not can_not_be_done_headless:
|
if not can_not_be_done_headless:
|
||||||
html = self.driver.page_source
|
html = sanitize_input_encode(self.driver.page_source)
|
||||||
self.driver = self.execute_hook('before_return_html', self.driver, html)
|
self.driver = self.execute_hook('before_return_html', self.driver, html)
|
||||||
|
|
||||||
# Store in cache
|
# Store in cache
|
||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
|
||||||
with open(cache_file_path, "w") as f:
|
with open(cache_file_path, "w", encoding="utf-8") as f:
|
||||||
f.write(html)
|
f.write(html)
|
||||||
|
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
@@ -261,16 +265,16 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
return html
|
return html
|
||||||
except InvalidArgumentException:
|
except InvalidArgumentException:
|
||||||
if not hasattr(e, 'msg'):
|
if not hasattr(e, 'msg'):
|
||||||
e.msg = str(e)
|
e.msg = sanitize_input_encode(str(e))
|
||||||
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
||||||
except WebDriverException as e:
|
except WebDriverException as e:
|
||||||
# If e does nlt have msg attribute create it and set it to str(e)
|
# If e does nlt have msg attribute create it and set it to str(e)
|
||||||
if not hasattr(e, 'msg'):
|
if not hasattr(e, 'msg'):
|
||||||
e.msg = str(e)
|
e.msg = sanitize_input_encode(str(e))
|
||||||
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
|
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if not hasattr(e, 'msg'):
|
if not hasattr(e, 'msg'):
|
||||||
e.msg = str(e)
|
e.msg = sanitize_input_encode(str(e))
|
||||||
raise Exception(f"Failed to crawl {url}: {e.msg}")
|
raise Exception(f"Failed to crawl {url}: {e.msg}")
|
||||||
|
|
||||||
def take_screenshot(self) -> str:
|
def take_screenshot(self) -> str:
|
||||||
@@ -288,18 +292,25 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
# Open the screenshot with PIL
|
# Open the screenshot with PIL
|
||||||
image = Image.open(BytesIO(screenshot))
|
image = Image.open(BytesIO(screenshot))
|
||||||
|
|
||||||
|
# Convert image to RGB mode
|
||||||
|
rgb_image = image.convert('RGB')
|
||||||
|
|
||||||
# Convert to JPEG and compress
|
# Convert to JPEG and compress
|
||||||
buffered = BytesIO()
|
buffered = BytesIO()
|
||||||
image.save(buffered, format="JPEG", quality=85)
|
rgb_image.save(buffered, format="JPEG", quality=85)
|
||||||
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||||
|
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"[LOG] 📸 Screenshot taken and converted to base64")
|
print(f"[LOG] 📸 Screenshot taken and converted to base64")
|
||||||
|
|
||||||
return img_base64
|
return img_base64
|
||||||
|
except Exception as e:
|
||||||
|
if self.verbose:
|
||||||
|
print(f"[ERROR] Failed to take screenshot: {str(e)}")
|
||||||
|
return ""
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_message = f"Failed to take screenshot: {str(e)}"
|
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
|
||||||
print(error_message)
|
print(error_message)
|
||||||
|
|
||||||
# Generate an image with black background
|
# Generate an image with black background
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ def init_db():
|
|||||||
extracted_content TEXT,
|
extracted_content TEXT,
|
||||||
success BOOLEAN,
|
success BOOLEAN,
|
||||||
media TEXT DEFAULT "{}",
|
media TEXT DEFAULT "{}",
|
||||||
link TEXT DEFAULT "{}",
|
links TEXT DEFAULT "{}",
|
||||||
metadata TEXT DEFAULT "{}",
|
metadata TEXT DEFAULT "{}",
|
||||||
screenshot TEXT DEFAULT ""
|
screenshot TEXT DEFAULT ""
|
||||||
)
|
)
|
||||||
@@ -127,6 +127,9 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
|
|||||||
print(f"Error updating existing records: {e}")
|
print(f"Error updating existing records: {e}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
init_db() # Initialize the database if not already initialized
|
# Delete the existing database file
|
||||||
alter_db_add_screenshot("metadata") # Add the new column to the table
|
if os.path.exists(DB_PATH):
|
||||||
update_existing_records("metadata") # Update existing records to set the new column to an empty string
|
os.remove(DB_PATH)
|
||||||
|
init_db()
|
||||||
|
# alter_db_add_screenshot("COL_NAME")
|
||||||
|
|
||||||
|
|||||||
@@ -116,7 +116,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
for block in blocks:
|
for block in blocks:
|
||||||
block['error'] = False
|
block['error'] = False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Error extracting blocks:", str(e))
|
|
||||||
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
||||||
blocks = parsed
|
blocks = parsed
|
||||||
if unparsed:
|
if unparsed:
|
||||||
@@ -192,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
# Sequential processing with a delay
|
# Sequential processing with a delay
|
||||||
for ix, section in enumerate(merged_sections):
|
for ix, section in enumerate(merged_sections):
|
||||||
extract_func = partial(self.extract, url)
|
extract_func = partial(self.extract, url)
|
||||||
extracted_content.extend(extract_func(ix, section))
|
extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
|
||||||
time.sleep(0.5) # 500 ms delay between each processing
|
time.sleep(0.5) # 500 ms delay between each processing
|
||||||
else:
|
else:
|
||||||
# Parallel processing using ThreadPoolExecutor
|
# Parallel processing using ThreadPoolExecutor
|
||||||
@@ -202,10 +201,21 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=4) as executor:
|
with ThreadPoolExecutor(max_workers=4) as executor:
|
||||||
extract_func = partial(self.extract, url)
|
extract_func = partial(self.extract, url)
|
||||||
futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)]
|
futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
|
||||||
|
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
extracted_content.extend(future.result())
|
try:
|
||||||
|
extracted_content.extend(future.result())
|
||||||
|
except Exception as e:
|
||||||
|
if self.verbose:
|
||||||
|
print(f"Error in thread execution: {e}")
|
||||||
|
# Add error information to extracted_content
|
||||||
|
extracted_content.append({
|
||||||
|
"index": 0,
|
||||||
|
"error": True,
|
||||||
|
"tags": ["error"],
|
||||||
|
"content": str(e)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
return extracted_content
|
return extracted_content
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ from pathlib import Path
|
|||||||
import subprocess, os
|
import subprocess, os
|
||||||
import shutil
|
import shutil
|
||||||
import tarfile
|
import tarfile
|
||||||
from crawl4ai.config import MODEL_REPO_BRANCH
|
from .model_loader import *
|
||||||
import argparse
|
import argparse
|
||||||
import urllib.request
|
import urllib.request
|
||||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
|
|||||||
@@ -11,6 +11,9 @@ from .prompts import PROMPT_EXTRACT_BLOCKS
|
|||||||
from .config import *
|
from .config import *
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Any
|
from typing import Dict, Any
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
import requests
|
||||||
|
from requests.exceptions import InvalidSchema
|
||||||
|
|
||||||
class InvalidCSSSelectorError(Exception):
|
class InvalidCSSSelectorError(Exception):
|
||||||
pass
|
pass
|
||||||
@@ -96,6 +99,16 @@ def sanitize_html(html):
|
|||||||
|
|
||||||
return sanitized_html
|
return sanitized_html
|
||||||
|
|
||||||
|
def sanitize_input_encode(text: str) -> str:
|
||||||
|
"""Sanitize input to handle potential encoding issues."""
|
||||||
|
try:
|
||||||
|
# Attempt to encode and decode as UTF-8 to handle potential encoding issues
|
||||||
|
return text.encode('utf-8', errors='ignore').decode('utf-8')
|
||||||
|
except UnicodeEncodeError as e:
|
||||||
|
print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
|
||||||
|
# Fall back to ASCII if UTF-8 fails
|
||||||
|
return text.encode('ascii', errors='ignore').decode('ascii')
|
||||||
|
|
||||||
def escape_json_string(s):
|
def escape_json_string(s):
|
||||||
"""
|
"""
|
||||||
Escapes characters in a string to be JSON safe.
|
Escapes characters in a string to be JSON safe.
|
||||||
@@ -437,6 +450,101 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
links = {'internal': [], 'external': []}
|
links = {'internal': [], 'external': []}
|
||||||
media = {'images': [], 'videos': [], 'audios': []}
|
media = {'images': [], 'videos': [], 'audios': []}
|
||||||
|
|
||||||
|
def process_image(img, url, index, total_images):
|
||||||
|
#Check if an image has valid display and inside undesired html elements
|
||||||
|
def is_valid_image(img, parent, parent_classes):
|
||||||
|
style = img.get('style', '')
|
||||||
|
src = img.get('src', '')
|
||||||
|
classes_to_check = ['button', 'icon', 'logo']
|
||||||
|
tags_to_check = ['button', 'input']
|
||||||
|
return all([
|
||||||
|
'display:none' not in style,
|
||||||
|
src,
|
||||||
|
not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
|
||||||
|
parent.name not in tags_to_check
|
||||||
|
])
|
||||||
|
|
||||||
|
#Score an image for it's usefulness
|
||||||
|
def score_image_for_usefulness(img, base_url, index, images_count):
|
||||||
|
# Function to parse image height/width value and units
|
||||||
|
def parse_dimension(dimension):
|
||||||
|
if dimension:
|
||||||
|
match = re.match(r"(\d+)(\D*)", dimension)
|
||||||
|
if match:
|
||||||
|
number = int(match.group(1))
|
||||||
|
unit = match.group(2) or 'px' # Default unit is 'px' if not specified
|
||||||
|
return number, unit
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# Fetch image file metadata to extract size and extension
|
||||||
|
def fetch_image_file_size(img, base_url):
|
||||||
|
#If src is relative path construct full URL, if not it may be CDN URL
|
||||||
|
img_url = urljoin(base_url,img.get('src'))
|
||||||
|
try:
|
||||||
|
response = requests.head(img_url)
|
||||||
|
if response.status_code == 200:
|
||||||
|
return response.headers.get('Content-Length',None)
|
||||||
|
else:
|
||||||
|
print(f"Failed to retrieve file size for {img_url}")
|
||||||
|
return None
|
||||||
|
except InvalidSchema as e:
|
||||||
|
return None
|
||||||
|
finally:
|
||||||
|
return
|
||||||
|
|
||||||
|
image_height = img.get('height')
|
||||||
|
height_value, height_unit = parse_dimension(image_height)
|
||||||
|
image_width = img.get('width')
|
||||||
|
width_value, width_unit = parse_dimension(image_width)
|
||||||
|
image_size = int(fetch_image_file_size(img,base_url) or 0)
|
||||||
|
image_format = os.path.splitext(img.get('src',''))[1].lower()
|
||||||
|
score = 0
|
||||||
|
if height_value:
|
||||||
|
if height_unit == 'px' and height_value > 150:
|
||||||
|
score += 1
|
||||||
|
if height_unit in ['%','vh','vmin','vmax'] and height_value >30:
|
||||||
|
score += 1
|
||||||
|
if width_value:
|
||||||
|
if width_unit == 'px' and width_value > 150:
|
||||||
|
score += 1
|
||||||
|
if width_unit in ['%','vh','vmin','vmax'] and width_value >30:
|
||||||
|
score += 1
|
||||||
|
if image_size > 10000:
|
||||||
|
score += 1
|
||||||
|
if img.get('alt') != '':
|
||||||
|
score+=1
|
||||||
|
if any(image_format==format for format in ['jpg','png','webp']):
|
||||||
|
score+=1
|
||||||
|
if index/images_count<0.5:
|
||||||
|
score+=1
|
||||||
|
return score
|
||||||
|
|
||||||
|
# Extract meaningful text for images from closest parent
|
||||||
|
def find_closest_parent_with_useful_text(tag):
|
||||||
|
current_tag = tag
|
||||||
|
while current_tag:
|
||||||
|
current_tag = current_tag.parent
|
||||||
|
# Get the text content of the parent tag
|
||||||
|
if current_tag:
|
||||||
|
text_content = current_tag.get_text(separator=' ',strip=True)
|
||||||
|
# Check if the text content has at least word_count_threshold
|
||||||
|
if len(text_content.split()) >= word_count_threshold:
|
||||||
|
return text_content
|
||||||
|
return None
|
||||||
|
|
||||||
|
if not is_valid_image(img, img.parent, img.parent.get('class', [])):
|
||||||
|
return None
|
||||||
|
score = score_image_for_usefulness(img, url, index, total_images)
|
||||||
|
if score <= IMAGE_SCORE_THRESHOLD:
|
||||||
|
return None
|
||||||
|
return {
|
||||||
|
'src': img.get('src', ''),
|
||||||
|
'alt': img.get('alt', ''),
|
||||||
|
'desc': find_closest_parent_with_useful_text(img),
|
||||||
|
'score': score,
|
||||||
|
'type': 'image'
|
||||||
|
}
|
||||||
|
|
||||||
def process_element(element: element.PageElement) -> bool:
|
def process_element(element: element.PageElement) -> bool:
|
||||||
try:
|
try:
|
||||||
if isinstance(element, NavigableString):
|
if isinstance(element, NavigableString):
|
||||||
@@ -461,11 +569,6 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
keep_element = True
|
keep_element = True
|
||||||
|
|
||||||
elif element.name == 'img':
|
elif element.name == 'img':
|
||||||
media['images'].append({
|
|
||||||
'src': element.get('src'),
|
|
||||||
'alt': element.get('alt'),
|
|
||||||
'type': 'image'
|
|
||||||
})
|
|
||||||
return True # Always keep image elements
|
return True # Always keep image elements
|
||||||
|
|
||||||
elif element.name in ['video', 'audio']:
|
elif element.name in ['video', 'audio']:
|
||||||
@@ -508,6 +611,14 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
print('Error processing element:', str(e))
|
print('Error processing element:', str(e))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
#process images by filtering and extracting contextual text from the page
|
||||||
|
imgs = body.find_all('img')
|
||||||
|
media['images'] = [
|
||||||
|
result for result in
|
||||||
|
(process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs))
|
||||||
|
if result is not None
|
||||||
|
]
|
||||||
|
|
||||||
process_element(body)
|
process_element(body)
|
||||||
|
|
||||||
def flatten_nested_elements(node):
|
def flatten_nested_elements(node):
|
||||||
@@ -664,7 +775,6 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
|
|||||||
for block in blocks:
|
for block in blocks:
|
||||||
block['error'] = False
|
block['error'] = False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Error extracting blocks:", str(e))
|
|
||||||
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
|
||||||
blocks = parsed
|
blocks = parsed
|
||||||
# Append all unparsed segments as onr error block and content is list of unparsed segments
|
# Append all unparsed segments as onr error block and content is list of unparsed segments
|
||||||
@@ -710,7 +820,6 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
|
|||||||
blocks = json.loads(blocks)
|
blocks = json.loads(blocks)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Error extracting blocks:", str(e))
|
|
||||||
blocks = [{
|
blocks = [{
|
||||||
"index": 0,
|
"index": 0,
|
||||||
"tags": ["error"],
|
"tags": ["error"],
|
||||||
|
|||||||
@@ -155,8 +155,8 @@ class WebCrawler:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
if cached:
|
if cached:
|
||||||
html = cached[1]
|
html = sanitize_input_encode(cached[1])
|
||||||
extracted_content = cached[4]
|
extracted_content = sanitize_input_encode(cached[4])
|
||||||
if screenshot:
|
if screenshot:
|
||||||
screenshot_data = cached[9]
|
screenshot_data = cached[9]
|
||||||
if not screenshot_data:
|
if not screenshot_data:
|
||||||
@@ -166,7 +166,7 @@ class WebCrawler:
|
|||||||
if user_agent:
|
if user_agent:
|
||||||
self.crawler_strategy.update_user_agent(user_agent)
|
self.crawler_strategy.update_user_agent(user_agent)
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
html = self.crawler_strategy.crawl(url, **kwargs)
|
html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
|
||||||
t2 = time.time()
|
t2 = time.time()
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
||||||
@@ -213,8 +213,8 @@ class WebCrawler:
|
|||||||
except InvalidCSSSelectorError as e:
|
except InvalidCSSSelectorError as e:
|
||||||
raise ValueError(str(e))
|
raise ValueError(str(e))
|
||||||
|
|
||||||
cleaned_html = result.get("cleaned_html", "")
|
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||||
markdown = result.get("markdown", "")
|
markdown = sanitize_input_encode(result.get("markdown", ""))
|
||||||
media = result.get("media", [])
|
media = result.get("media", [])
|
||||||
links = result.get("links", [])
|
links = result.get("links", [])
|
||||||
metadata = result.get("metadata", {})
|
metadata = result.get("metadata", {})
|
||||||
|
|||||||
@@ -36,5 +36,5 @@ model_fees = json.loads(result.extracted_content)
|
|||||||
|
|
||||||
print(len(model_fees))
|
print(len(model_fees))
|
||||||
|
|
||||||
with open(".data/data.json", "w") as f:
|
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
@@ -249,15 +249,40 @@ def using_crawler_hooks(crawler):
|
|||||||
|
|
||||||
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||||
|
|
||||||
crawler.set_hook('on_driver_created', on_driver_created)
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
crawler.set_hook('before_get_url', before_get_url)
|
crawler_strategy.set_hook('on_driver_created', on_driver_created)
|
||||||
crawler.set_hook('after_get_url', after_get_url)
|
crawler_strategy.set_hook('before_get_url', before_get_url)
|
||||||
crawler.set_hook('before_return_html', before_return_html)
|
crawler_strategy.set_hook('after_get_url', after_get_url)
|
||||||
|
crawler_strategy.set_hook('before_return_html', before_return_html)
|
||||||
|
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
|
crawler.warmup()
|
||||||
result = crawler.run(url="https://example.com")
|
result = crawler.run(url="https://example.com")
|
||||||
|
|
||||||
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||||
print_result(result= result)
|
print_result(result= result)
|
||||||
|
|
||||||
|
def using_crawler_hooks_dleay_example(crawler):
|
||||||
|
def delay(driver):
|
||||||
|
print("Delaying for 5 seconds...")
|
||||||
|
time.sleep(5)
|
||||||
|
print("Resuming...")
|
||||||
|
|
||||||
|
def create_crawler():
|
||||||
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
|
crawler_strategy.set_hook('after_get_url', delay)
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
|
crawler.warmup()
|
||||||
|
return crawler
|
||||||
|
|
||||||
|
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")
|
||||||
|
crawler = create_crawler()
|
||||||
|
result = crawler.run(url="https://google.com", bypass_cache=True)
|
||||||
|
|
||||||
|
cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
|
||||||
|
print_result(result)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
||||||
|
|||||||
@@ -42,5 +42,5 @@ page_summary = json.loads(result.extracted_content)
|
|||||||
|
|
||||||
print(page_summary)
|
print(page_summary)
|
||||||
|
|
||||||
with open(".data/page_summary.json", "w") as f:
|
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
|
|||||||
@@ -1,5 +1,23 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [v0.2.75] - 2024-07-19
|
||||||
|
|
||||||
|
Minor improvements for a more maintainable codebase:
|
||||||
|
|
||||||
|
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
|
||||||
|
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
|
||||||
|
|
||||||
|
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
|
||||||
|
|
||||||
|
|
||||||
|
## v0.2.74 - 2024-07-08
|
||||||
|
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
||||||
|
|
||||||
|
- 💻 **UTF encoding fix**: Resolved the Windows \"charmap\" error by adding UTF encoding.
|
||||||
|
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
|
||||||
|
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
|
||||||
|
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
|
||||||
|
|
||||||
## [v0.2.73] - 2024-07-03
|
## [v0.2.73] - 2024-07-03
|
||||||
|
|
||||||
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
|
||||||
|
|||||||
@@ -14,6 +14,7 @@
|
|||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<button class="btn btn-default" type="submit">Submit</button>
|
<button class="btn btn-default" type="submit">Submit</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
</fieldset>
|
</fieldset>
|
||||||
</form>
|
</form>
|
||||||
|
|
||||||
@@ -93,6 +94,10 @@
|
|||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
<div id="error" class="error-message" style="display: none; margin-top:1em;">
|
||||||
|
<div class="terminal-alert terminal-alert-error"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
function showTab(tabId) {
|
function showTab(tabId) {
|
||||||
const tabs = document.querySelectorAll('.tab-content');
|
const tabs = document.querySelectorAll('.tab-content');
|
||||||
@@ -162,7 +167,17 @@
|
|||||||
},
|
},
|
||||||
body: JSON.stringify(data)
|
body: JSON.stringify(data)
|
||||||
})
|
})
|
||||||
.then(response => response.json())
|
.then(response => {
|
||||||
|
if (!response.ok) {
|
||||||
|
if (response.status === 429) {
|
||||||
|
return response.json().then(err => {
|
||||||
|
throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
|
||||||
|
});
|
||||||
|
}
|
||||||
|
throw new Error('Network response was not ok');
|
||||||
|
}
|
||||||
|
return response.json();
|
||||||
|
})
|
||||||
.then(data => {
|
.then(data => {
|
||||||
data = data.results[0]; // Only one URL is requested
|
data = data.results[0]; // Only one URL is requested
|
||||||
document.getElementById('loading').style.display = 'none';
|
document.getElementById('loading').style.display = 'none';
|
||||||
@@ -187,11 +202,29 @@ result = crawler.run(
|
|||||||
print(result)
|
print(result)
|
||||||
`;
|
`;
|
||||||
redo(document.getElementById('pythonCode'), pythonCode);
|
redo(document.getElementById('pythonCode'), pythonCode);
|
||||||
|
document.getElementById('error').style.display = 'none';
|
||||||
})
|
})
|
||||||
.catch(error => {
|
.catch(error => {
|
||||||
document.getElementById('loading').style.display = 'none';
|
document.getElementById('loading').style.display = 'none';
|
||||||
document.getElementById('response').style.display = 'block';
|
document.getElementById('error').style.display = 'block';
|
||||||
document.getElementById('markdownContent').textContent = 'Error: ' + error;
|
let errorMessage = 'An unexpected error occurred. Please try again later.';
|
||||||
|
|
||||||
|
if (error.status === 429) {
|
||||||
|
const details = error.details;
|
||||||
|
if (details.retry_after) {
|
||||||
|
errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
|
||||||
|
} else if (details.reset_at) {
|
||||||
|
const resetTime = new Date(details.reset_at);
|
||||||
|
const waitTime = Math.ceil((resetTime - new Date()) / 1000);
|
||||||
|
errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
|
||||||
|
} else {
|
||||||
|
errorMessage = `Rate limit exceeded. Please try again later.`;
|
||||||
|
}
|
||||||
|
} else if (error.message) {
|
||||||
|
errorMessage = error.message;
|
||||||
|
}
|
||||||
|
|
||||||
|
document.querySelector('#error .terminal-alert').textContent = errorMessage;
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
</script>
|
</script>
|
||||||
|
|||||||
@@ -14,6 +14,9 @@ Let's see how we can customize the crawler using hooks! In this example, we'll:
|
|||||||
### Hook Definitions
|
### Hook Definitions
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
|
from crawl4ai.crawler_strategy import *
|
||||||
|
|
||||||
def on_driver_created(driver):
|
def on_driver_created(driver):
|
||||||
print("[HOOK] on_driver_created")
|
print("[HOOK] on_driver_created")
|
||||||
# Example customization: maximize the window
|
# Example customization: maximize the window
|
||||||
@@ -66,12 +69,13 @@ def before_return_html(driver, html):
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
|
||||||
crawler = WebCrawler(verbose=True)
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
|
crawler_strategy.set_hook('on_driver_created', on_driver_created)
|
||||||
|
crawler_strategy.set_hook('before_get_url', before_get_url)
|
||||||
|
crawler_strategy.set_hook('after_get_url', after_get_url)
|
||||||
|
crawler_strategy.set_hook('before_return_html', before_return_html)
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
crawler.warmup()
|
crawler.warmup()
|
||||||
crawler.set_hook('on_driver_created', on_driver_created)
|
|
||||||
crawler.set_hook('before_get_url', before_get_url)
|
|
||||||
crawler.set_hook('after_get_url', after_get_url)
|
|
||||||
crawler.set_hook('before_return_html', before_return_html)
|
|
||||||
|
|
||||||
result = crawler.run(url="https://example.com")
|
result = crawler.run(url="https://example.com")
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ model_fees = json.loads(result.extracted_content)
|
|||||||
|
|
||||||
print(len(model_fees))
|
print(len(model_fees))
|
||||||
|
|
||||||
with open(".data/data.json", "w") as f:
|
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -71,7 +71,7 @@ model_fees = json.loads(result.extracted_content)
|
|||||||
|
|
||||||
print(len(model_fees))
|
print(len(model_fees))
|
||||||
|
|
||||||
with open(".data/data.json", "w") as f:
|
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -91,7 +91,7 @@ This example demonstrates how to use `Crawl4AI` to extract a summary from a web
|
|||||||
Save the extracted data to a file for further use.
|
Save the extracted data to a file for further use.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
with open(".data/page_summary.json", "w") as f:
|
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# Crawl4AI v0.2.73
|
# Crawl4AI v0.2.75
|
||||||
|
|
||||||
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
|
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
|
||||||
|
|
||||||
|
|||||||
@@ -176,41 +176,29 @@ print(f"JavaScript Code (Load More button) result: {result}")
|
|||||||
Let's see how we can customize the crawler using hooks!
|
Let's see how we can customize the crawler using hooks!
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def on_driver_created(driver):
|
import time
|
||||||
print("[HOOK] on_driver_created")
|
|
||||||
driver.maximize_window()
|
|
||||||
driver.get('https://example.com/login')
|
|
||||||
driver.find_element(By.NAME, 'username').send_keys('testuser')
|
|
||||||
driver.find_element(By.NAME, 'password').send_keys('password123')
|
|
||||||
driver.find_element(By.NAME, 'login').click()
|
|
||||||
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
|
|
||||||
return driver
|
|
||||||
|
|
||||||
def before_get_url(driver):
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
print("[HOOK] before_get_url")
|
from crawl4ai.crawler_strategy import *
|
||||||
driver.execute_cdp_cmd('Network.enable', {})
|
|
||||||
driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
|
|
||||||
return driver
|
|
||||||
|
|
||||||
def after_get_url(driver):
|
def delay(driver):
|
||||||
print("[HOOK] after_get_url")
|
print("Delaying for 5 seconds...")
|
||||||
print(driver.current_url)
|
time.sleep(5)
|
||||||
return driver
|
print("Resuming...")
|
||||||
|
|
||||||
|
def create_crawler():
|
||||||
|
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
|
||||||
|
crawler_strategy.set_hook('after_get_url', delay)
|
||||||
|
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
|
||||||
|
crawler.warmup()
|
||||||
|
return crawler
|
||||||
|
|
||||||
def before_return_html(driver, html):
|
crawler = create_crawler()
|
||||||
print("[HOOK] before_return_html")
|
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
|
||||||
print(len(html))
|
|
||||||
return driver
|
|
||||||
|
|
||||||
crawler.set_hook('on_driver_created', on_driver_created)
|
|
||||||
crawler.set_hook('before_get_url', before_get_url)
|
|
||||||
crawler.set_hook('after_get_url', after_get_url)
|
|
||||||
crawler.set_hook('before_return_html', before_return_html)
|
|
||||||
|
|
||||||
result = crawler.run(url="https://example.com")
|
|
||||||
print(f"Crawler Hooks result: {result}")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
check [Hooks](examples/hooks_auth.md) for more examples.
|
||||||
|
|
||||||
## Congratulations! 🎉
|
## Congratulations! 🎉
|
||||||
|
|
||||||
You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️
|
You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️
|
||||||
|
|||||||
83
main.py
83
main.py
@@ -22,6 +22,15 @@ from typing import List, Optional
|
|||||||
from crawl4ai.web_crawler import WebCrawler
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
from crawl4ai.database import get_total_count, clear_db
|
from crawl4ai.database import get_total_count, clear_db
|
||||||
|
|
||||||
|
import time
|
||||||
|
from slowapi import Limiter, _rate_limit_exceeded_handler
|
||||||
|
from slowapi.util import get_remote_address
|
||||||
|
from slowapi.errors import RateLimitExceeded
|
||||||
|
|
||||||
|
# load .env file
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
MAX_CONCURRENT_REQUESTS = 10 # Adjust this to change the maximum concurrent requests
|
MAX_CONCURRENT_REQUESTS = 10 # Adjust this to change the maximum concurrent requests
|
||||||
@@ -30,6 +39,78 @@ lock = asyncio.Lock()
|
|||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
|
# Initialize rate limiter
|
||||||
|
def rate_limit_key_func(request: Request):
|
||||||
|
access_token = request.headers.get("access-token")
|
||||||
|
if access_token == os.environ.get('ACCESS_TOKEN'):
|
||||||
|
return None
|
||||||
|
return get_remote_address(request)
|
||||||
|
|
||||||
|
limiter = Limiter(key_func=rate_limit_key_func)
|
||||||
|
app.state.limiter = limiter
|
||||||
|
|
||||||
|
# Dictionary to store last request times for each client
|
||||||
|
last_request_times = {}
|
||||||
|
last_rate_limit = {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_rate_limit():
|
||||||
|
limit = os.environ.get('ACCESS_PER_MIN', "5")
|
||||||
|
return f"{limit}/minute"
|
||||||
|
|
||||||
|
# Custom rate limit exceeded handler
|
||||||
|
async def custom_rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded) -> JSONResponse:
|
||||||
|
if request.client.host not in last_rate_limit or time.time() - last_rate_limit[request.client.host] > 60:
|
||||||
|
last_rate_limit[request.client.host] = time.time()
|
||||||
|
retry_after = 60 - (time.time() - last_rate_limit[request.client.host])
|
||||||
|
reset_at = time.time() + retry_after
|
||||||
|
return JSONResponse(
|
||||||
|
status_code=429,
|
||||||
|
content={
|
||||||
|
"detail": "Rate limit exceeded",
|
||||||
|
"limit": str(exc.limit.limit),
|
||||||
|
"retry_after": retry_after,
|
||||||
|
'reset_at': reset_at,
|
||||||
|
"message": f"You have exceeded the rate limit of {exc.limit.limit}."
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
app.add_exception_handler(RateLimitExceeded, custom_rate_limit_exceeded_handler)
|
||||||
|
|
||||||
|
|
||||||
|
# Middleware for token-based bypass and per-request limit
|
||||||
|
class RateLimitMiddleware(BaseHTTPMiddleware):
|
||||||
|
async def dispatch(self, request: Request, call_next):
|
||||||
|
SPAN = int(os.environ.get('ACCESS_TIME_SPAN', 10))
|
||||||
|
access_token = request.headers.get("access-token")
|
||||||
|
if access_token == os.environ.get('ACCESS_TOKEN'):
|
||||||
|
return await call_next(request)
|
||||||
|
|
||||||
|
path = request.url.path
|
||||||
|
if path in ["/crawl", "/old"]:
|
||||||
|
client_ip = request.client.host
|
||||||
|
current_time = time.time()
|
||||||
|
|
||||||
|
# Check time since last request
|
||||||
|
if client_ip in last_request_times:
|
||||||
|
time_since_last_request = current_time - last_request_times[client_ip]
|
||||||
|
if time_since_last_request < SPAN:
|
||||||
|
return JSONResponse(
|
||||||
|
status_code=429,
|
||||||
|
content={
|
||||||
|
"detail": "Too many requests",
|
||||||
|
"message": "Rate limit exceeded. Please wait 10 seconds between requests.",
|
||||||
|
"retry_after": max(0, SPAN - time_since_last_request),
|
||||||
|
"reset_at": current_time + max(0, SPAN - time_since_last_request),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
last_request_times[client_ip] = current_time
|
||||||
|
|
||||||
|
return await call_next(request)
|
||||||
|
|
||||||
|
app.add_middleware(RateLimitMiddleware)
|
||||||
|
|
||||||
# CORS configuration
|
# CORS configuration
|
||||||
origins = ["*"] # Allow all origins
|
origins = ["*"] # Allow all origins
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
@@ -73,6 +154,7 @@ def read_root():
|
|||||||
return RedirectResponse(url="/mkdocs")
|
return RedirectResponse(url="/mkdocs")
|
||||||
|
|
||||||
@app.get("/old", response_class=HTMLResponse)
|
@app.get("/old", response_class=HTMLResponse)
|
||||||
|
@limiter.limit(get_rate_limit())
|
||||||
async def read_index(request: Request):
|
async def read_index(request: Request):
|
||||||
partials_dir = os.path.join(__location__, "pages", "partial")
|
partials_dir = os.path.join(__location__, "pages", "partial")
|
||||||
partials = {}
|
partials = {}
|
||||||
@@ -107,6 +189,7 @@ def import_strategy(module_name: str, class_name: str, *args, **kwargs):
|
|||||||
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
|
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
|
||||||
|
|
||||||
@app.post("/crawl")
|
@app.post("/crawl")
|
||||||
|
@limiter.limit(get_rate_limit())
|
||||||
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
||||||
logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
|
logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
|
||||||
global current_requests
|
global current_requests
|
||||||
|
|||||||
0
middlewares.py
Normal file
0
middlewares.py
Normal file
17
setup.py
17
setup.py
@@ -1,13 +1,18 @@
|
|||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import subprocess
|
import shutil
|
||||||
from setuptools.command.install import install
|
|
||||||
|
|
||||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||||
crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
# If the folder already exists, remove the cache folder
|
||||||
os.makedirs(crawl4ai_folder, exist_ok=True)
|
crawl4ai_folder = Path.home() / ".crawl4ai"
|
||||||
os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True)
|
cache_folder = crawl4ai_folder / "cache"
|
||||||
|
|
||||||
|
if cache_folder.exists():
|
||||||
|
shutil.rmtree(cache_folder)
|
||||||
|
|
||||||
|
crawl4ai_folder.mkdir(exist_ok=True)
|
||||||
|
cache_folder.mkdir(exist_ok=True)
|
||||||
|
|
||||||
# Read the requirements from requirements.txt
|
# Read the requirements from requirements.txt
|
||||||
with open("requirements.txt") as f:
|
with open("requirements.txt") as f:
|
||||||
@@ -20,7 +25,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="Crawl4AI",
|
name="Crawl4AI",
|
||||||
version="0.2.73",
|
version="0.2.74",
|
||||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||||
long_description=open("README.md", encoding="utf-8").read(),
|
long_description=open("README.md", encoding="utf-8").read(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
|
|||||||
Reference in New Issue
Block a user