chore: Remove .test_pads/ directory from .gitignore

chore: Remove .tests/ directory from .gitignore
chore: Refactor setup.py to use pathlib and shutil for folder creation and removal, to remove cache folder in cross platform manner.
2024-07-19 17:09:29 +08:00 · 2024-07-09 15:10:18 +08:00 · 2024-07-09 13:25:00 +08:00 · 2024-07-08 20:24:00 +08:00 · 2024-07-08 20:02:12 +08:00 · 2024-07-08 16:33:25 +08:00
5 changed files with 128 additions and 10 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -165,6 +165,8 @@ Crawl4AI.egg-info/
 Crawl4AI.egg-info/*
 crawler_data.db
 .vscode/
+.tests/
+.test_pads/
 test_pad.py
 test_pad*.py
 .data/
--- a/docs/md/demo.md
+++ b/docs/md/demo.md
@@ -14,6 +14,7 @@
            <div class="form-group">
                <button class="btn btn-default" type="submit">Submit</button>
            </div>
+
        </fieldset>
    </form>

@@ -93,6 +94,10 @@
        </div>
    </section>

+    <div id="error" class="error-message" style="display: none; margin-top:1em;">
+        <div class="terminal-alert terminal-alert-error"></div>
+    </div>
+
    <script>
        function showTab(tabId) {
            const tabs = document.querySelectorAll('.tab-content');
@@ -162,7 +167,17 @@
                },
                body: JSON.stringify(data)
            })
-            .then(response => response.json())
+            .then(response => {
+                if (!response.ok) {
+                    if (response.status === 429) {
+                        return response.json().then(err => { 
+                            throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
+                        });
+                    }
+                    throw new Error('Network response was not ok');
+                }
+                return response.json();
+            })
            .then(data => {
                data = data.results[0]; // Only one URL is requested
                document.getElementById('loading').style.display = 'none';
@@ -187,11 +202,29 @@ result = crawler.run(
 print(result)
                `;
                redo(document.getElementById('pythonCode'), pythonCode);
+                document.getElementById('error').style.display = 'none';
            })
            .catch(error => {
                document.getElementById('loading').style.display = 'none';
-                document.getElementById('response').style.display = 'block';
-                document.getElementById('markdownContent').textContent = 'Error: ' + error;
+                document.getElementById('error').style.display = 'block';
+                let errorMessage = 'An unexpected error occurred. Please try again later.';
+                
+                if (error.status === 429) {
+                    const details = error.details;
+                    if (details.retry_after) {
+                        errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
+                    } else if (details.reset_at) {
+                        const resetTime = new Date(details.reset_at);
+                        const waitTime = Math.ceil((resetTime - new Date()) / 1000);
+                        errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
+                    } else {
+                        errorMessage = `Rate limit exceeded. Please try again later.`;
+                    }
+                } else if (error.message) {
+                    errorMessage = error.message;
+                }
+                
+                document.querySelector('#error .terminal-alert').textContent = errorMessage;
            });
        });
    </script>
--- a/main.py
+++ b/main.py
@@ -22,6 +22,15 @@ from typing import List, Optional
 from crawl4ai.web_crawler import WebCrawler
 from crawl4ai.database import get_total_count, clear_db

+import time
+from slowapi import Limiter, _rate_limit_exceeded_handler
+from slowapi.util import get_remote_address
+from slowapi.errors import RateLimitExceeded
+
+# load .env file
+from dotenv import load_dotenv
+load_dotenv()
+
 # Configuration
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 MAX_CONCURRENT_REQUESTS = 10  # Adjust this to change the maximum concurrent requests
@@ -30,6 +39,78 @@ lock = asyncio.Lock()

 app = FastAPI()

+# Initialize rate limiter
+def rate_limit_key_func(request: Request):
+    access_token = request.headers.get("access-token")
+    if access_token == os.environ.get('ACCESS_TOKEN'):
+        return None
+    return get_remote_address(request)
+
+limiter = Limiter(key_func=rate_limit_key_func)
+app.state.limiter = limiter
+
+# Dictionary to store last request times for each client
+last_request_times = {}
+last_rate_limit = {}
+
+
+def get_rate_limit():
+    limit = os.environ.get('ACCESS_PER_MIN', "5")
+    return f"{limit}/minute"
+
+# Custom rate limit exceeded handler
+async def custom_rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded) -> JSONResponse:
+    if request.client.host not in last_rate_limit or time.time() - last_rate_limit[request.client.host] > 60:
+        last_rate_limit[request.client.host] = time.time()
+    retry_after = 60 - (time.time() - last_rate_limit[request.client.host])
+    reset_at = time.time() + retry_after
+    return JSONResponse(
+        status_code=429,
+        content={
+            "detail": "Rate limit exceeded",
+            "limit": str(exc.limit.limit),
+            "retry_after": retry_after,
+            'reset_at': reset_at,
+            "message": f"You have exceeded the rate limit of {exc.limit.limit}."
+        }
+    )
+    
+app.add_exception_handler(RateLimitExceeded, custom_rate_limit_exceeded_handler)
+
+
+# Middleware for token-based bypass and per-request limit
+class RateLimitMiddleware(BaseHTTPMiddleware):
+    async def dispatch(self, request: Request, call_next):
+        SPAN = int(os.environ.get('ACCESS_TIME_SPAN', 10))
+        access_token = request.headers.get("access-token")
+        if access_token == os.environ.get('ACCESS_TOKEN'):
+            return await call_next(request)
+        
+        path = request.url.path
+        if path in ["/crawl", "/old"]:
+            client_ip = request.client.host
+            current_time = time.time()
+            
+            # Check time since last request
+            if client_ip in last_request_times:
+                time_since_last_request = current_time - last_request_times[client_ip]
+                if time_since_last_request < SPAN:
+                    return JSONResponse(
+                        status_code=429,
+                        content={
+                            "detail": "Too many requests",
+                            "message": "Rate limit exceeded. Please wait 10 seconds between requests.",
+                            "retry_after": max(0, SPAN - time_since_last_request),
+                            "reset_at": current_time + max(0, SPAN - time_since_last_request),
+                        }
+                    )
+            
+            last_request_times[client_ip] = current_time
+
+        return await call_next(request)
+
+app.add_middleware(RateLimitMiddleware)
+
 # CORS configuration
 origins = ["*"]  # Allow all origins
 app.add_middleware(
@@ -73,6 +154,7 @@ def read_root():
    return RedirectResponse(url="/mkdocs")

@app.get("/old", response_class=HTMLResponse)
+@limiter.limit(get_rate_limit())
 async def read_index(request: Request):
    partials_dir = os.path.join(__location__, "pages", "partial")
    partials = {}
@@ -107,6 +189,7 @@ def import_strategy(module_name: str, class_name: str, *args, **kwargs):
        raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")

@app.post("/crawl")
+@limiter.limit(get_rate_limit())
 async def crawl_urls(crawl_request: CrawlRequest, request: Request):
    logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
    global current_requests
--- a/middlewares.py
+++ b/middlewares.py
--- a/setup.py
+++ b/setup.py
@@ -1,18 +1,18 @@
 from setuptools import setup, find_packages
 import os
 from pathlib import Path
-import subprocess
-from setuptools.command.install import install
+import shutil

 # Create the .crawl4ai folder in the user's home directory if it doesn't exist
 # If the folder already exists, remove the cache folder
-crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
-if os.path.exists(f"{crawl4ai_folder}/cache"):
-    subprocess.run(["rm", "-rf", f"{crawl4ai_folder}/cache"])
-os.makedirs(crawl4ai_folder, exist_ok=True)
-os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True)
+crawl4ai_folder = Path.home() / ".crawl4ai"
+cache_folder = crawl4ai_folder / "cache"

+if cache_folder.exists():
+    shutil.rmtree(cache_folder)

+crawl4ai_folder.mkdir(exist_ok=True)
+cache_folder.mkdir(exist_ok=True)

 # Read the requirements from requirements.txt
 with open("requirements.txt") as f:
Author	SHA1	Message	Date
unclecode	8463aabedf	chore: Remove .test_pads/ directory from .gitignore	2024-07-19 17:09:29 +08:00
unclecode	7f30144ef2	chore: Remove .tests/ directory from .gitignore	2024-07-09 15:10:18 +08:00
unclecode	fa5516aad6	chore: Refactor setup.py to use pathlib and shutil for folder creation and removal, to remove cache folder in cross platform manner.	2024-07-09 13:25:00 +08:00
unclecode	ca0336af9e	feat: Add error handling for rate limit exceeded in form submission. This commit adds error handling for rate limit exceeded in the form submission process. If the server returns a 429 status code, the client will display an error message indicating the rate limit has been exceeded and provide information on when the user can try again. This improves the user experience by providing clear feedback and guidance when rate limits are reached.	2024-07-08 20:24:00 +08:00
unclecode	65ed1aeade	feat: Add rate limiting functionality with custom handlers	2024-07-08 20:02:12 +08:00
unclecode	4d283ab386	## [v0.2.74] - 2024-07-08 A slew of exciting updates to improve the crawler's stability and robustness! 🎉 - 💻 UTF encoding fix: Resolved the Windows "charmap" error by adding UTF encoding. - 🛡️ Error handling: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy. - 🧹 Input sanitization: Improved input sanitization and handled encoding issues in LLMExtractionStrategy. - 🚮 Database cleanup: Removed existing database file and initialized a new one.	2024-07-08 16:33:25 +08:00