feat(robots): add robots.txt compliance support

Add support for checking and respecting robots.txt rules before crawling websites:
- Implement RobotsParser class with SQLite caching
- Add check_robots_txt parameter to CrawlerRunConfig
- Integrate robots.txt checking in AsyncWebCrawler
- Update documentation with robots.txt compliance examples
- Add tests for robot parser functionality

The cache uses WAL mode for better concurrency and has a default TTL of 7 days.
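Example usage (a minimal sketch: it assumes the check_robots_txt flag and the AsyncWebCrawler / CrawlerRunConfig entry points named above, and that a disallowed URL comes back as an unsuccessful result; exact result fields may differ):

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    # Opt in to robots.txt checking for this crawl
    config = CrawlerRunConfig(check_robots_txt=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        if not result.success:
            # Disallowed by robots.txt (or the fetch failed for another reason)
            print("Skipped:", result.error_message)

asyncio.run(main())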
UncleCode
2025-01-21 17:54:13 +08:00
parent 9247877037
commit d09c611d15
11 changed files with 482 additions and 12 deletions


@@ -23,6 +23,138 @@ import pstats
from functools import wraps
import asyncio
import sqlite3
import hashlib
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import aiohttp


class RobotsParser:
    # Default 7 days cache TTL
    CACHE_TTL = 7 * 24 * 60 * 60

    def __init__(self, cache_dir=None, cache_ttl=None):
        self.cache_dir = cache_dir or os.path.join(get_home_folder(), ".crawl4ai", "robots")
        self.cache_ttl = cache_ttl or self.CACHE_TTL
        os.makedirs(self.cache_dir, exist_ok=True)
        self.db_path = os.path.join(self.cache_dir, "robots_cache.db")
        self._init_db()

    def _init_db(self):
        # Use WAL mode for better concurrency and performance
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("""
                CREATE TABLE IF NOT EXISTS robots_cache (
                    domain TEXT PRIMARY KEY,
                    rules TEXT NOT NULL,
                    fetch_time INTEGER NOT NULL,
                    hash TEXT NOT NULL
                )
            """)
            conn.execute("CREATE INDEX IF NOT EXISTS idx_domain ON robots_cache(domain)")

    def _get_cached_rules(self, domain: str) -> tuple[str, bool]:
        """Get cached rules. Returns (rules, is_fresh)"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "SELECT rules, fetch_time, hash FROM robots_cache WHERE domain = ?",
                (domain,)
            )
            result = cursor.fetchone()

            if not result:
                return None, False

            rules, fetch_time, _ = result
            # Check if cache is still fresh based on TTL
            return rules, (time.time() - fetch_time) < self.cache_ttl

    def _cache_rules(self, domain: str, content: str):
        """Cache robots.txt content with hash for change detection"""
        hash_val = hashlib.md5(content.encode()).hexdigest()

        with sqlite3.connect(self.db_path) as conn:
            # Check if content actually changed
            cursor = conn.execute(
                "SELECT hash FROM robots_cache WHERE domain = ?",
                (domain,)
            )
            result = cursor.fetchone()

            # Only update if hash changed or no previous entry
            if not result or result[0] != hash_val:
                conn.execute(
                    """INSERT OR REPLACE INTO robots_cache
                       (domain, rules, fetch_time, hash)
                       VALUES (?, ?, ?, ?)""",
                    (domain, content, int(time.time()), hash_val)
                )

    async def can_fetch(self, url: str, user_agent: str = "*") -> bool:
        """
        Check if URL can be fetched according to robots.txt rules.

        Args:
            url: The URL to check
            user_agent: User agent string to check against (default: "*")

        Returns:
            bool: True if allowed, False if disallowed by robots.txt
        """
        # Handle empty/invalid URLs
        try:
            parsed = urlparse(url)
            domain = parsed.netloc
            if not domain:
                return True
        except:
            return True

        # Fast path - check cache first
        rules, is_fresh = self._get_cached_rules(domain)

        # If rules not found or stale, fetch new ones
        if not is_fresh:
            try:
                # Ensure we use the same scheme as the input URL
                scheme = parsed.scheme or 'http'
                robots_url = f"{scheme}://{domain}/robots.txt"
                async with aiohttp.ClientSession() as session:
                    async with session.get(robots_url, timeout=2) as response:
                        if response.status == 200:
                            rules = await response.text()
                            self._cache_rules(domain, rules)
                        else:
                            return True
            except:
                # On any error (timeout, connection failed, etc), allow access
                return True

        if not rules:
            return True

        # Create parser for this check
        parser = RobotFileParser()
        parser.parse(rules.splitlines())

        # If parser can't read rules, allow access
        if not parser.mtime():
            return True

        return parser.can_fetch(user_agent, url)

    def clear_cache(self):
        """Clear all cached robots.txt entries"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("DELETE FROM robots_cache")

    def clear_expired(self):
        """Remove only expired entries from cache"""
        with sqlite3.connect(self.db_path) as conn:
            expire_time = int(time.time()) - self.cache_ttl
            conn.execute("DELETE FROM robots_cache WHERE fetch_time < ?", (expire_time,))


class InvalidCSSSelectorError(Exception):
    pass
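
For reference, the new RobotsParser can also be driven on its own; a rough sketch using only the methods shown above (the crawl4ai.utils import path is an assumption based on the diff context):

import asyncio
from crawl4ai.utils import RobotsParser  # module path assumed, not confirmed by this diff

async def main():
    parser = RobotsParser(cache_ttl=24 * 60 * 60)  # override the 7-day default TTL
    allowed = await parser.can_fetch("https://example.com/page", user_agent="MyBot")
    print("allowed:", allowed)
    parser.clear_expired()  # prune stale rows from the SQLite cache

asyncio.run(main())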