feat(robots): add robots.txt compliance support

Add support for checking and respecting robots.txt rules before crawling websites:
- Implement RobotsParser class with SQLite caching
- Add check_robots_txt parameter to CrawlerRunConfig
- Integrate robots.txt checking in AsyncWebCrawler
- Update documentation with robots.txt compliance examples
- Add tests for robot parser functionality

The cache uses WAL mode for better concurrency and has a default TTL of 7 days.
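Example usage (a minimal sketch: it assumes the check_robots_txt flag and the AsyncWebCrawler / CrawlerRunConfig entry points named above, and that a disallowed URL comes back as an unsuccessful result; exact result fields may differ):

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    # Opt in to robots.txt checking for this crawl
    config = CrawlerRunConfig(check_robots_txt=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        if not result.success:
            # Disallowed by robots.txt (or the fetch failed for another reason)
            print("Skipped:", result.error_message)

asyncio.run(main())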
UncleCode
2025-01-21 17:54:13 +08:00
parent 9247877037
commit d09c611d15
11 changed files with 482 additions and 12 deletions


@@ -23,6 +23,138 @@ import pstats
from functools import wraps
import asyncio
import sqlite3
import hashlib
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import aiohttp


class RobotsParser:
    # Default 7 days cache TTL
    CACHE_TTL = 7 * 24 * 60 * 60

    def __init__(self, cache_dir=None, cache_ttl=None):
        self.cache_dir = cache_dir or os.path.join(get_home_folder(), ".crawl4ai", "robots")
        self.cache_ttl = cache_ttl or self.CACHE_TTL
        os.makedirs(self.cache_dir, exist_ok=True)
        self.db_path = os.path.join(self.cache_dir, "robots_cache.db")
        self._init_db()

    def _init_db(self):
        # Use WAL mode for better concurrency and performance
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("""
                CREATE TABLE IF NOT EXISTS robots_cache (
                    domain TEXT PRIMARY KEY,
                    rules TEXT NOT NULL,
                    fetch_time INTEGER NOT NULL,
                    hash TEXT NOT NULL
                )
            """)
            conn.execute("CREATE INDEX IF NOT EXISTS idx_domain ON robots_cache(domain)")

    def _get_cached_rules(self, domain: str) -> tuple[str, bool]:
        """Get cached rules. Returns (rules, is_fresh)"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "SELECT rules, fetch_time, hash FROM robots_cache WHERE domain = ?",
                (domain,)
            )
            result = cursor.fetchone()

            if not result:
                return None, False

            rules, fetch_time, _ = result
            # Check if cache is still fresh based on TTL
            return rules, (time.time() - fetch_time) < self.cache_ttl

    def _cache_rules(self, domain: str, content: str):
        """Cache robots.txt content with hash for change detection"""
        hash_val = hashlib.md5(content.encode()).hexdigest()

        with sqlite3.connect(self.db_path) as conn:
            # Check if content actually changed
            cursor = conn.execute(
                "SELECT hash FROM robots_cache WHERE domain = ?",
                (domain,)
            )
            result = cursor.fetchone()

            # Only update if hash changed or no previous entry
            if not result or result[0] != hash_val:
                conn.execute(
                    """INSERT OR REPLACE INTO robots_cache
                       (domain, rules, fetch_time, hash)
                       VALUES (?, ?, ?, ?)""",
                    (domain, content, int(time.time()), hash_val)
                )

    async def can_fetch(self, url: str, user_agent: str = "*") -> bool:
        """
        Check if URL can be fetched according to robots.txt rules.

        Args:
            url: The URL to check
            user_agent: User agent string to check against (default: "*")

        Returns:
            bool: True if allowed, False if disallowed by robots.txt
        """
        # Handle empty/invalid URLs
        try:
            parsed = urlparse(url)
            domain = parsed.netloc
            if not domain:
                return True
        except:
            return True

        # Fast path - check cache first
        rules, is_fresh = self._get_cached_rules(domain)

        # If rules not found or stale, fetch new ones
        if not is_fresh:
            try:
                # Ensure we use the same scheme as the input URL
                scheme = parsed.scheme or 'http'
                robots_url = f"{scheme}://{domain}/robots.txt"
                async with aiohttp.ClientSession() as session:
                    async with session.get(robots_url, timeout=2) as response:
                        if response.status == 200:
                            rules = await response.text()
                            self._cache_rules(domain, rules)
                        else:
                            return True
            except:
                # On any error (timeout, connection failed, etc), allow access
                return True

        if not rules:
            return True

        # Create parser for this check
        parser = RobotFileParser()
        parser.parse(rules.splitlines())

        # If parser can't read rules, allow access
        if not parser.mtime():
            return True

        return parser.can_fetch(user_agent, url)

    def clear_cache(self):
        """Clear all cached robots.txt entries"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("DELETE FROM robots_cache")

    def clear_expired(self):
        """Remove only expired entries from cache"""
        with sqlite3.connect(self.db_path) as conn:
            expire_time = int(time.time()) - self.cache_ttl
            conn.execute("DELETE FROM robots_cache WHERE fetch_time < ?", (expire_time,))


class InvalidCSSSelectorError(Exception):
    pass
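
For reference, the new RobotsParser can also be driven on its own; a rough sketch using only the methods shown above (the crawl4ai.utils import path is an assumption based on the diff context):

import asyncio
from crawl4ai.utils import RobotsParser  # module path assumed, not confirmed by this diff

async def main():
    parser = RobotsParser(cache_ttl=24 * 60 * 60)  # override the 7-day default TTL
    allowed = await parser.can_fetch("https://example.com/page", user_agent="MyBot")
    print("allowed:", allowed)
    parser.clear_expired()  # prune stale rows from the SQLite cache

asyncio.run(main())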