Compare commits

..

8 Commits

Author SHA1 Message Date
unclecode
7715623430 chore: Fix typos and update .gitignore
These changes fix typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability. Additionally, the `.test_pads/` directory is removed from the `.gitignore` file to keep the repository clean and organized.
2024-07-19 17:42:39 +08:00
unclecode
f5a4e80e2c chore: Fix typo in chunking_strategy.py and crawler_strategy.py
The commit fixes a typo in the `chunking_strategy.py` file where `nl.toknize.TextTilingTokenizer()` was corrected to `nl.tokenize.TextTilingTokenizer()`. Additionally, in the `crawler_strategy.py` file, the commit converts the screenshot image to RGB mode before saving it as a JPEG. This ensures consistent image quality and compression.
2024-07-19 17:40:31 +08:00
unclecode
8463aabedf chore: Remove .test_pads/ directory from .gitignore 2024-07-19 17:09:29 +08:00
unclecode
7f30144ef2 chore: Remove .tests/ directory from .gitignore 2024-07-09 15:10:18 +08:00
unclecode
fa5516aad6 chore: Refactor setup.py to use pathlib and shutil for folder creation and removal, to remove cache folder in cross platform manner. 2024-07-09 13:25:00 +08:00
unclecode
ca0336af9e feat: Add error handling for rate limit exceeded in form submission
This commit adds error handling for rate limit exceeded in the form submission process. If the server returns a 429 status code, the client will display an error message indicating the rate limit has been exceeded and provide information on when the user can try again. This improves the user experience by providing clear feedback and guidance when rate limits are reached.
2024-07-08 20:24:00 +08:00
unclecode
65ed1aeade feat: Add rate limiting functionality with custom handlers 2024-07-08 20:02:12 +08:00
unclecode
4d283ab386 ## [v0.2.74] - 2024-07-08
A slew of exciting updates to improve the crawler's stability and robustness! 🎉

- 💻 **UTF encoding fix**: Resolved the Windows \"charmap\" error by adding UTF encoding.
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
2024-07-08 16:33:25 +08:00
11 changed files with 158 additions and 14 deletions

2
.gitignore vendored
View File

@@ -165,6 +165,8 @@ Crawl4AI.egg-info/
Crawl4AI.egg-info/*
crawler_data.db
.vscode/
.tests/
.test_pads/
test_pad.py
test_pad*.py
.data/

View File

@@ -1,5 +1,14 @@
# Changelog
## [v0.2.75] - 2024-07-19
Minor improvements for a more maintainable codebase:
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
## [v0.2.74] - 2024-07-08
A slew of exciting updates to improve the crawler's stability and robustness! 🎉

View File

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.74 🕷️🤖
# Crawl4AI v0.2.75 🕷️🤖
[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)

View File

@@ -55,7 +55,7 @@ class TopicSegmentationChunking(ChunkingStrategy):
def __init__(self, num_keywords=3, **kwargs):
import nltk as nl
self.tokenizer = nl.toknize.TextTilingTokenizer()
self.tokenizer = nl.tokenize.TextTilingTokenizer()
self.num_keywords = num_keywords
def chunk(self, text: str) -> list:

View File

@@ -292,15 +292,22 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
# Open the screenshot with PIL
image = Image.open(BytesIO(screenshot))
# Convert image to RGB mode
rgb_image = image.convert('RGB')
# Convert to JPEG and compress
buffered = BytesIO()
image.save(buffered, format="JPEG", quality=85)
rgb_image.save(buffered, format="JPEG", quality=85)
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
if self.verbose:
print(f"[LOG] 📸 Screenshot taken and converted to base64")
return img_base64
except Exception as e:
if self.verbose:
print(f"[ERROR] Failed to take screenshot: {str(e)}")
return ""
except Exception as e:
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")

View File

@@ -1,5 +1,15 @@
# Changelog
## [v0.2.75] - 2024-07-19
Minor improvements for a more maintainable codebase:
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
## v0.2.74 - 2024-07-08
A slew of exciting updates to improve the crawler's stability and robustness! 🎉

View File

@@ -14,6 +14,7 @@
<div class="form-group">
<button class="btn btn-default" type="submit">Submit</button>
</div>
</fieldset>
</form>
@@ -93,6 +94,10 @@
</div>
</section>
<div id="error" class="error-message" style="display: none; margin-top:1em;">
<div class="terminal-alert terminal-alert-error"></div>
</div>
<script>
function showTab(tabId) {
const tabs = document.querySelectorAll('.tab-content');
@@ -162,7 +167,17 @@
},
body: JSON.stringify(data)
})
.then(response => response.json())
.then(response => {
if (!response.ok) {
if (response.status === 429) {
return response.json().then(err => {
throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
});
}
throw new Error('Network response was not ok');
}
return response.json();
})
.then(data => {
data = data.results[0]; // Only one URL is requested
document.getElementById('loading').style.display = 'none';
@@ -187,11 +202,29 @@ result = crawler.run(
print(result)
`;
redo(document.getElementById('pythonCode'), pythonCode);
document.getElementById('error').style.display = 'none';
})
.catch(error => {
document.getElementById('loading').style.display = 'none';
document.getElementById('response').style.display = 'block';
document.getElementById('markdownContent').textContent = 'Error: ' + error;
document.getElementById('error').style.display = 'block';
let errorMessage = 'An unexpected error occurred. Please try again later.';
if (error.status === 429) {
const details = error.details;
if (details.retry_after) {
errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
} else if (details.reset_at) {
const resetTime = new Date(details.reset_at);
const waitTime = Math.ceil((resetTime - new Date()) / 1000);
errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
} else {
errorMessage = `Rate limit exceeded. Please try again later.`;
}
} else if (error.message) {
errorMessage = error.message;
}
document.querySelector('#error .terminal-alert').textContent = errorMessage;
});
});
</script>

View File

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.74
# Crawl4AI v0.2.75
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.

83
main.py
View File

@@ -22,6 +22,15 @@ from typing import List, Optional
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.database import get_total_count, clear_db
import time
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
# load .env file
from dotenv import load_dotenv
load_dotenv()
# Configuration
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
MAX_CONCURRENT_REQUESTS = 10 # Adjust this to change the maximum concurrent requests
@@ -30,6 +39,78 @@ lock = asyncio.Lock()
app = FastAPI()
# Initialize rate limiter
def rate_limit_key_func(request: Request):
access_token = request.headers.get("access-token")
if access_token == os.environ.get('ACCESS_TOKEN'):
return None
return get_remote_address(request)
limiter = Limiter(key_func=rate_limit_key_func)
app.state.limiter = limiter
# Dictionary to store last request times for each client
last_request_times = {}
last_rate_limit = {}
def get_rate_limit():
limit = os.environ.get('ACCESS_PER_MIN', "5")
return f"{limit}/minute"
# Custom rate limit exceeded handler
async def custom_rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded) -> JSONResponse:
if request.client.host not in last_rate_limit or time.time() - last_rate_limit[request.client.host] > 60:
last_rate_limit[request.client.host] = time.time()
retry_after = 60 - (time.time() - last_rate_limit[request.client.host])
reset_at = time.time() + retry_after
return JSONResponse(
status_code=429,
content={
"detail": "Rate limit exceeded",
"limit": str(exc.limit.limit),
"retry_after": retry_after,
'reset_at': reset_at,
"message": f"You have exceeded the rate limit of {exc.limit.limit}."
}
)
app.add_exception_handler(RateLimitExceeded, custom_rate_limit_exceeded_handler)
# Middleware for token-based bypass and per-request limit
class RateLimitMiddleware(BaseHTTPMiddleware):
async def dispatch(self, request: Request, call_next):
SPAN = int(os.environ.get('ACCESS_TIME_SPAN', 10))
access_token = request.headers.get("access-token")
if access_token == os.environ.get('ACCESS_TOKEN'):
return await call_next(request)
path = request.url.path
if path in ["/crawl", "/old"]:
client_ip = request.client.host
current_time = time.time()
# Check time since last request
if client_ip in last_request_times:
time_since_last_request = current_time - last_request_times[client_ip]
if time_since_last_request < SPAN:
return JSONResponse(
status_code=429,
content={
"detail": "Too many requests",
"message": "Rate limit exceeded. Please wait 10 seconds between requests.",
"retry_after": max(0, SPAN - time_since_last_request),
"reset_at": current_time + max(0, SPAN - time_since_last_request),
}
)
last_request_times[client_ip] = current_time
return await call_next(request)
app.add_middleware(RateLimitMiddleware)
# CORS configuration
origins = ["*"] # Allow all origins
app.add_middleware(
@@ -73,6 +154,7 @@ def read_root():
return RedirectResponse(url="/mkdocs")
@app.get("/old", response_class=HTMLResponse)
@limiter.limit(get_rate_limit())
async def read_index(request: Request):
partials_dir = os.path.join(__location__, "pages", "partial")
partials = {}
@@ -107,6 +189,7 @@ def import_strategy(module_name: str, class_name: str, *args, **kwargs):
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
@app.post("/crawl")
@limiter.limit(get_rate_limit())
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
global current_requests

0
middlewares.py Normal file
View File

View File

@@ -1,18 +1,18 @@
from setuptools import setup, find_packages
import os
from pathlib import Path
import subprocess
from setuptools.command.install import install
import shutil
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
# If the folder already exists, remove the cache folder
crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
if os.path.exists(f"{crawl4ai_folder}/cache"):
subprocess.run(["rm", "-rf", f"{crawl4ai_folder}/cache"])
os.makedirs(crawl4ai_folder, exist_ok=True)
os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True)
crawl4ai_folder = Path.home() / ".crawl4ai"
cache_folder = crawl4ai_folder / "cache"
if cache_folder.exists():
shutil.rmtree(cache_folder)
crawl4ai_folder.mkdir(exist_ok=True)
cache_folder.mkdir(exist_ok=True)
# Read the requirements from requirements.txt
with open("requirements.txt") as f: