Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2101540819 | ||
|
|
9d98393606 | ||
|
|
6f99368744 | ||
|
|
ea2f83ac10 | ||
|
|
7f41ff4a74 | ||
|
|
236bdb4035 | ||
|
|
1368248254 | ||
|
|
b0ec54b9e9 | ||
|
|
fb6ed5f000 | ||
|
|
597fe8bdb7 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -165,8 +165,6 @@ Crawl4AI.egg-info/
|
|||||||
Crawl4AI.egg-info/*
|
Crawl4AI.egg-info/*
|
||||||
crawler_data.db
|
crawler_data.db
|
||||||
.vscode/
|
.vscode/
|
||||||
.tests/
|
|
||||||
.test_pads/
|
|
||||||
test_pad.py
|
test_pad.py
|
||||||
test_pad*.py
|
test_pad*.py
|
||||||
.data/
|
.data/
|
||||||
|
|||||||
@@ -1,14 +1,5 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
## [v0.2.75] - 2024-07-19
|
|
||||||
|
|
||||||
Minor improvements for a more maintainable codebase:
|
|
||||||
|
|
||||||
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
|
|
||||||
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
|
|
||||||
|
|
||||||
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
|
|
||||||
|
|
||||||
## [v0.2.74] - 2024-07-08
|
## [v0.2.74] - 2024-07-08
|
||||||
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# Crawl4AI v0.2.75 🕷️🤖
|
# Crawl4AI v0.2.74 🕷️🤖
|
||||||
|
|
||||||
[](https://github.com/unclecode/crawl4ai/stargazers)
|
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||||
[](https://github.com/unclecode/crawl4ai/network/members)
|
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ class TopicSegmentationChunking(ChunkingStrategy):
|
|||||||
|
|
||||||
def __init__(self, num_keywords=3, **kwargs):
|
def __init__(self, num_keywords=3, **kwargs):
|
||||||
import nltk as nl
|
import nltk as nl
|
||||||
self.tokenizer = nl.tokenize.TextTilingTokenizer()
|
self.tokenizer = nl.toknize.TextTilingTokenizer()
|
||||||
self.num_keywords = num_keywords
|
self.num_keywords = num_keywords
|
||||||
|
|
||||||
def chunk(self, text: str) -> list:
|
def chunk(self, text: str) -> list:
|
||||||
|
|||||||
@@ -292,22 +292,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
# Open the screenshot with PIL
|
# Open the screenshot with PIL
|
||||||
image = Image.open(BytesIO(screenshot))
|
image = Image.open(BytesIO(screenshot))
|
||||||
|
|
||||||
# Convert image to RGB mode
|
|
||||||
rgb_image = image.convert('RGB')
|
|
||||||
|
|
||||||
# Convert to JPEG and compress
|
# Convert to JPEG and compress
|
||||||
buffered = BytesIO()
|
buffered = BytesIO()
|
||||||
rgb_image.save(buffered, format="JPEG", quality=85)
|
image.save(buffered, format="JPEG", quality=85)
|
||||||
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||||
|
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"[LOG] 📸 Screenshot taken and converted to base64")
|
print(f"[LOG] 📸 Screenshot taken and converted to base64")
|
||||||
|
|
||||||
return img_base64
|
return img_base64
|
||||||
except Exception as e:
|
|
||||||
if self.verbose:
|
|
||||||
print(f"[ERROR] Failed to take screenshot: {str(e)}")
|
|
||||||
return ""
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
|
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
|
||||||
|
|||||||
@@ -1,15 +1,5 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
## [v0.2.75] - 2024-07-19
|
|
||||||
|
|
||||||
Minor improvements for a more maintainable codebase:
|
|
||||||
|
|
||||||
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
|
|
||||||
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
|
|
||||||
|
|
||||||
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
|
|
||||||
|
|
||||||
|
|
||||||
## v0.2.74 - 2024-07-08
|
## v0.2.74 - 2024-07-08
|
||||||
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,6 @@
|
|||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<button class="btn btn-default" type="submit">Submit</button>
|
<button class="btn btn-default" type="submit">Submit</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
</fieldset>
|
</fieldset>
|
||||||
</form>
|
</form>
|
||||||
|
|
||||||
@@ -94,10 +93,6 @@
|
|||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
<div id="error" class="error-message" style="display: none; margin-top:1em;">
|
|
||||||
<div class="terminal-alert terminal-alert-error"></div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
function showTab(tabId) {
|
function showTab(tabId) {
|
||||||
const tabs = document.querySelectorAll('.tab-content');
|
const tabs = document.querySelectorAll('.tab-content');
|
||||||
@@ -167,17 +162,7 @@
|
|||||||
},
|
},
|
||||||
body: JSON.stringify(data)
|
body: JSON.stringify(data)
|
||||||
})
|
})
|
||||||
.then(response => {
|
.then(response => response.json())
|
||||||
if (!response.ok) {
|
|
||||||
if (response.status === 429) {
|
|
||||||
return response.json().then(err => {
|
|
||||||
throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
|
|
||||||
});
|
|
||||||
}
|
|
||||||
throw new Error('Network response was not ok');
|
|
||||||
}
|
|
||||||
return response.json();
|
|
||||||
})
|
|
||||||
.then(data => {
|
.then(data => {
|
||||||
data = data.results[0]; // Only one URL is requested
|
data = data.results[0]; // Only one URL is requested
|
||||||
document.getElementById('loading').style.display = 'none';
|
document.getElementById('loading').style.display = 'none';
|
||||||
@@ -202,29 +187,11 @@ result = crawler.run(
|
|||||||
print(result)
|
print(result)
|
||||||
`;
|
`;
|
||||||
redo(document.getElementById('pythonCode'), pythonCode);
|
redo(document.getElementById('pythonCode'), pythonCode);
|
||||||
document.getElementById('error').style.display = 'none';
|
|
||||||
})
|
})
|
||||||
.catch(error => {
|
.catch(error => {
|
||||||
document.getElementById('loading').style.display = 'none';
|
document.getElementById('loading').style.display = 'none';
|
||||||
document.getElementById('error').style.display = 'block';
|
document.getElementById('response').style.display = 'block';
|
||||||
let errorMessage = 'An unexpected error occurred. Please try again later.';
|
document.getElementById('markdownContent').textContent = 'Error: ' + error;
|
||||||
|
|
||||||
if (error.status === 429) {
|
|
||||||
const details = error.details;
|
|
||||||
if (details.retry_after) {
|
|
||||||
errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
|
|
||||||
} else if (details.reset_at) {
|
|
||||||
const resetTime = new Date(details.reset_at);
|
|
||||||
const waitTime = Math.ceil((resetTime - new Date()) / 1000);
|
|
||||||
errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
|
|
||||||
} else {
|
|
||||||
errorMessage = `Rate limit exceeded. Please try again later.`;
|
|
||||||
}
|
|
||||||
} else if (error.message) {
|
|
||||||
errorMessage = error.message;
|
|
||||||
}
|
|
||||||
|
|
||||||
document.querySelector('#error .terminal-alert').textContent = errorMessage;
|
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
</script>
|
</script>
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# Crawl4AI v0.2.75
|
# Crawl4AI v0.2.74
|
||||||
|
|
||||||
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
|
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
|
||||||
|
|
||||||
|
|||||||
83
main.py
83
main.py
@@ -22,15 +22,6 @@ from typing import List, Optional
|
|||||||
from crawl4ai.web_crawler import WebCrawler
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
from crawl4ai.database import get_total_count, clear_db
|
from crawl4ai.database import get_total_count, clear_db
|
||||||
|
|
||||||
import time
|
|
||||||
from slowapi import Limiter, _rate_limit_exceeded_handler
|
|
||||||
from slowapi.util import get_remote_address
|
|
||||||
from slowapi.errors import RateLimitExceeded
|
|
||||||
|
|
||||||
# load .env file
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
load_dotenv()
|
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
MAX_CONCURRENT_REQUESTS = 10 # Adjust this to change the maximum concurrent requests
|
MAX_CONCURRENT_REQUESTS = 10 # Adjust this to change the maximum concurrent requests
|
||||||
@@ -39,78 +30,6 @@ lock = asyncio.Lock()
|
|||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
# Initialize rate limiter
|
|
||||||
def rate_limit_key_func(request: Request):
|
|
||||||
access_token = request.headers.get("access-token")
|
|
||||||
if access_token == os.environ.get('ACCESS_TOKEN'):
|
|
||||||
return None
|
|
||||||
return get_remote_address(request)
|
|
||||||
|
|
||||||
limiter = Limiter(key_func=rate_limit_key_func)
|
|
||||||
app.state.limiter = limiter
|
|
||||||
|
|
||||||
# Dictionary to store last request times for each client
|
|
||||||
last_request_times = {}
|
|
||||||
last_rate_limit = {}
|
|
||||||
|
|
||||||
|
|
||||||
def get_rate_limit():
|
|
||||||
limit = os.environ.get('ACCESS_PER_MIN', "5")
|
|
||||||
return f"{limit}/minute"
|
|
||||||
|
|
||||||
# Custom rate limit exceeded handler
|
|
||||||
async def custom_rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded) -> JSONResponse:
|
|
||||||
if request.client.host not in last_rate_limit or time.time() - last_rate_limit[request.client.host] > 60:
|
|
||||||
last_rate_limit[request.client.host] = time.time()
|
|
||||||
retry_after = 60 - (time.time() - last_rate_limit[request.client.host])
|
|
||||||
reset_at = time.time() + retry_after
|
|
||||||
return JSONResponse(
|
|
||||||
status_code=429,
|
|
||||||
content={
|
|
||||||
"detail": "Rate limit exceeded",
|
|
||||||
"limit": str(exc.limit.limit),
|
|
||||||
"retry_after": retry_after,
|
|
||||||
'reset_at': reset_at,
|
|
||||||
"message": f"You have exceeded the rate limit of {exc.limit.limit}."
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
app.add_exception_handler(RateLimitExceeded, custom_rate_limit_exceeded_handler)
|
|
||||||
|
|
||||||
|
|
||||||
# Middleware for token-based bypass and per-request limit
|
|
||||||
class RateLimitMiddleware(BaseHTTPMiddleware):
|
|
||||||
async def dispatch(self, request: Request, call_next):
|
|
||||||
SPAN = int(os.environ.get('ACCESS_TIME_SPAN', 10))
|
|
||||||
access_token = request.headers.get("access-token")
|
|
||||||
if access_token == os.environ.get('ACCESS_TOKEN'):
|
|
||||||
return await call_next(request)
|
|
||||||
|
|
||||||
path = request.url.path
|
|
||||||
if path in ["/crawl", "/old"]:
|
|
||||||
client_ip = request.client.host
|
|
||||||
current_time = time.time()
|
|
||||||
|
|
||||||
# Check time since last request
|
|
||||||
if client_ip in last_request_times:
|
|
||||||
time_since_last_request = current_time - last_request_times[client_ip]
|
|
||||||
if time_since_last_request < SPAN:
|
|
||||||
return JSONResponse(
|
|
||||||
status_code=429,
|
|
||||||
content={
|
|
||||||
"detail": "Too many requests",
|
|
||||||
"message": "Rate limit exceeded. Please wait 10 seconds between requests.",
|
|
||||||
"retry_after": max(0, SPAN - time_since_last_request),
|
|
||||||
"reset_at": current_time + max(0, SPAN - time_since_last_request),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
last_request_times[client_ip] = current_time
|
|
||||||
|
|
||||||
return await call_next(request)
|
|
||||||
|
|
||||||
app.add_middleware(RateLimitMiddleware)
|
|
||||||
|
|
||||||
# CORS configuration
|
# CORS configuration
|
||||||
origins = ["*"] # Allow all origins
|
origins = ["*"] # Allow all origins
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
@@ -154,7 +73,6 @@ def read_root():
|
|||||||
return RedirectResponse(url="/mkdocs")
|
return RedirectResponse(url="/mkdocs")
|
||||||
|
|
||||||
@app.get("/old", response_class=HTMLResponse)
|
@app.get("/old", response_class=HTMLResponse)
|
||||||
@limiter.limit(get_rate_limit())
|
|
||||||
async def read_index(request: Request):
|
async def read_index(request: Request):
|
||||||
partials_dir = os.path.join(__location__, "pages", "partial")
|
partials_dir = os.path.join(__location__, "pages", "partial")
|
||||||
partials = {}
|
partials = {}
|
||||||
@@ -189,7 +107,6 @@ def import_strategy(module_name: str, class_name: str, *args, **kwargs):
|
|||||||
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
|
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
|
||||||
|
|
||||||
@app.post("/crawl")
|
@app.post("/crawl")
|
||||||
@limiter.limit(get_rate_limit())
|
|
||||||
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
||||||
logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
|
logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
|
||||||
global current_requests
|
global current_requests
|
||||||
|
|||||||
14
setup.py
14
setup.py
@@ -1,18 +1,18 @@
|
|||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import shutil
|
import subprocess
|
||||||
|
from setuptools.command.install import install
|
||||||
|
|
||||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||||
# If the folder already exists, remove the cache folder
|
# If the folder already exists, remove the cache folder
|
||||||
crawl4ai_folder = Path.home() / ".crawl4ai"
|
crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||||
cache_folder = crawl4ai_folder / "cache"
|
if os.path.exists(f"{crawl4ai_folder}/cache"):
|
||||||
|
subprocess.run(["rm", "-rf", f"{crawl4ai_folder}/cache"])
|
||||||
|
os.makedirs(crawl4ai_folder, exist_ok=True)
|
||||||
|
os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True)
|
||||||
|
|
||||||
if cache_folder.exists():
|
|
||||||
shutil.rmtree(cache_folder)
|
|
||||||
|
|
||||||
crawl4ai_folder.mkdir(exist_ok=True)
|
|
||||||
cache_folder.mkdir(exist_ok=True)
|
|
||||||
|
|
||||||
# Read the requirements from requirements.txt
|
# Read the requirements from requirements.txt
|
||||||
with open("requirements.txt") as f:
|
with open("requirements.txt") as f:
|
||||||
|
|||||||
Reference in New Issue
Block a user