feat(robots): add robots.txt compliance support

Add support for checking and respecting robots.txt rules before crawling websites:

- Implement RobotsParser class with SQLite caching
- Add check_robots_txt parameter to CrawlerRunConfig
- Integrate robots.txt checking in AsyncWebCrawler
- Update documentation with robots.txt compliance examples
- Add tests for robot parser functionality

The cache uses WAL mode for better concurrency and has a default TTL of 7 days.
2025-01-21 17:54:13 +08:00
parent 9247877037
commit d09c611d15
11 changed files with 482 additions and 12 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -228,3 +228,6 @@ tree.md
 /plans
 .codeiumignore
 todo/
+
+# windsurf rules
+.windsurfrules
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+### [Added] 2025-01-21
+- Added robots.txt compliance support with efficient SQLite-based caching
+- New `check_robots_txt` parameter in CrawlerRunConfig to enable robots.txt checking
+- Documentation updates for robots.txt compliance features and examples
+- Automated robots.txt checking integrated into AsyncWebCrawler with 403 status codes for blocked URLs
+
 ### [Added] 2025-01-20
 - Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request
 - Updated documentation with examples for using proxy configuration in crawl operations
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -372,6 +372,7 @@ class CrawlerRunConfig:
        # Optional Parameters
        stream (bool): If True, stream the page content as it is being loaded.
        url: str = None  # This is not a compulsory parameter
+        check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
    """

    def __init__(
@@ -442,6 +443,7 @@ class CrawlerRunConfig:
        # Streaming Parameters
        stream: bool = False,
        url: str = None,
+        check_robots_txt: bool = False,
    ):
        self.url = url

@@ -521,6 +523,9 @@ class CrawlerRunConfig:
        # Streaming Parameters
        self.stream = stream

+        # Robots.txt Handling Parameters
+        self.check_robots_txt = check_robots_txt
+
        # Validate type of extraction strategy and chunking strategy if they are provided
        if self.extraction_strategy is not None and not isinstance(
            self.extraction_strategy, ExtractionStrategy
@@ -617,6 +622,7 @@ class CrawlerRunConfig:
            # Streaming Parameters
            stream=kwargs.get("stream", False),
            url=kwargs.get("url"),
+            check_robots_txt=kwargs.get("check_robots_txt", False),
        )

    # Create a funciton returns dict of the object
@@ -679,6 +685,7 @@ class CrawlerRunConfig:
            "log_console": self.log_console,
            "stream": self.stream,
            "url": self.url,
+            "check_robots_txt": self.check_robots_txt,
        }

    def clone(self, **kwargs):
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -40,6 +40,7 @@ from .utils import (
    fast_format_html,
    create_box_message,
    get_error_context,
+    RobotsParser,
 )

 from typing import Union, AsyncGenerator, List, TypeVar
@@ -203,6 +204,9 @@ class AsyncWebCrawler:
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)

+        # Initialize robots parser
+        self.robots_parser = RobotsParser()
+
        self.ready = False

    async def start(self):
@@ -414,6 +418,18 @@ class AsyncWebCrawler:
                    if user_agent:
                        self.crawler_strategy.update_user_agent(user_agent)

+                    # Check robots.txt if enabled
+                    if config and config.check_robots_txt:
+                        if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
+                            return CrawlResult(
+                                url=url,
+                                html="",
+                                success=False,
+                                status_code=403,
+                                error_message="Access denied by robots.txt",
+                                response_headers={"X-Robots-Status": "Blocked by robots.txt"}
+                            )
+
                    # Pass config to crawl method
                    async_response = await self.crawler_strategy.crawl(
                        url,
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -23,6 +23,138 @@ import pstats
 from functools import wraps
 import asyncio

+import sqlite3
+import hashlib
+from urllib.parse import urljoin, urlparse
+from urllib.robotparser import RobotFileParser
+import aiohttp
+
+class RobotsParser:
+    # Default 7 days cache TTL
+    CACHE_TTL = 7 * 24 * 60 * 60
+
+    def __init__(self, cache_dir=None, cache_ttl=None):
+        self.cache_dir = cache_dir or os.path.join(get_home_folder(), ".crawl4ai", "robots")
+        self.cache_ttl = cache_ttl or self.CACHE_TTL
+        os.makedirs(self.cache_dir, exist_ok=True)
+        self.db_path = os.path.join(self.cache_dir, "robots_cache.db")
+        self._init_db()
+
+    def _init_db(self):
+        # Use WAL mode for better concurrency and performance
+        with sqlite3.connect(self.db_path) as conn:
+            conn.execute("PRAGMA journal_mode=WAL")
+            conn.execute("""
+                CREATE TABLE IF NOT EXISTS robots_cache (
+                    domain TEXT PRIMARY KEY,
+                    rules TEXT NOT NULL,
+                    fetch_time INTEGER NOT NULL,
+                    hash TEXT NOT NULL
+                )
+            """)
+            conn.execute("CREATE INDEX IF NOT EXISTS idx_domain ON robots_cache(domain)")
+
+    def _get_cached_rules(self, domain: str) -> tuple[str, bool]:
+        """Get cached rules. Returns (rules, is_fresh)"""
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.execute(
+                "SELECT rules, fetch_time, hash FROM robots_cache WHERE domain = ?", 
+                (domain,)
+            )
+            result = cursor.fetchone()
+            
+            if not result:
+                return None, False
+                
+            rules, fetch_time, _ = result
+            # Check if cache is still fresh based on TTL
+            return rules, (time.time() - fetch_time) < self.cache_ttl
+
+    def _cache_rules(self, domain: str, content: str):
+        """Cache robots.txt content with hash for change detection"""
+        hash_val = hashlib.md5(content.encode()).hexdigest()
+        with sqlite3.connect(self.db_path) as conn:
+            # Check if content actually changed
+            cursor = conn.execute(
+                "SELECT hash FROM robots_cache WHERE domain = ?", 
+                (domain,)
+            )
+            result = cursor.fetchone()
+            
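+            # Only update if hash changed or no previous entry. NOTE: when the hash is unchanged, fetch_time is not refreshed either, so an expired entry stays expired.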
+            if not result or result[0] != hash_val:
+                conn.execute(
+                    """INSERT OR REPLACE INTO robots_cache 
+                       (domain, rules, fetch_time, hash) 
+                       VALUES (?, ?, ?, ?)""",
+                    (domain, content, int(time.time()), hash_val)
+                )
+
+    async def can_fetch(self, url: str, user_agent: str = "*") -> bool:
+        """
+        Check if URL can be fetched according to robots.txt rules.
+        
+        Args:
+            url: The URL to check
+            user_agent: User agent string to check against (default: "*")
+            
+        Returns:
+            bool: True if allowed, False if disallowed by robots.txt
+        """
+        # Handle empty/invalid URLs
+        try:
+            parsed = urlparse(url)
+            domain = parsed.netloc
+            if not domain:
+                return True
+        except:
+            return True
+
+        # Fast path - check cache first
+        rules, is_fresh = self._get_cached_rules(domain)
+        
+        # If rules not found or stale, fetch new ones
+        if not is_fresh:
+            try:
+                # Ensure we use the same scheme as the input URL
+                scheme = parsed.scheme or 'http'
+                robots_url = f"{scheme}://{domain}/robots.txt"
+                
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(robots_url, timeout=2) as response:
+                        if response.status == 200:
+                            rules = await response.text()
+                            self._cache_rules(domain, rules)
+                        else:
+                            return True
+            except:
+                # On any error (timeout, connection failed, etc), allow access
+                return True
+
+        if not rules:
+            return True
+
+        # Create parser for this check
+        parser = RobotFileParser() 
+        parser.parse(rules.splitlines())
+        
+        # If parser can't read rules, allow access
+        if not parser.mtime():
+            return True
+            
+        return parser.can_fetch(user_agent, url)
+
+    def clear_cache(self):
+        """Clear all cached robots.txt entries"""
+        with sqlite3.connect(self.db_path) as conn:
+            conn.execute("DELETE FROM robots_cache")
+
+    def clear_expired(self):
+        """Remove only expired entries from cache"""
+        with sqlite3.connect(self.db_path) as conn:
+            expire_time = int(time.time()) - self.cache_ttl
+            conn.execute("DELETE FROM robots_cache WHERE fetch_time < ?", (expire_time,))
+      

 class InvalidCSSSelectorError(Exception):
    pass
--- a/docs/md_v2/advanced/advanced-features.md
+++ b/docs/md_v2/advanced/advanced-features.md
@@ -8,6 +8,7 @@ Crawl4AI offers multiple power-user features that go beyond simple crawling. Thi
 3. **Handling SSL Certificates**  
 4. **Custom Headers**  
 5. **Session Persistence & Local Storage**
+6. **Robots.txt Compliance**

 > **Prerequisites**  
 > - You have a basic grasp of [AsyncWebCrawler Basics](../core/simple-crawling.md)  
@@ -251,6 +252,42 @@ You can sign in once, export the browser context, and reuse it later—without r

 ---

+## 6. Robots.txt Compliance
+
+Crawl4AI supports respecting robots.txt rules with efficient caching:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    # Enable robots.txt checking in config
+    config = CrawlerRunConfig(
+        check_robots_txt=True  # Will check and respect robots.txt rules
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            "https://example.com",
+            config=config
+        )
+        
+        if not result.success and result.status_code == 403:
+            print("Access denied by robots.txt")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Key Points**
+- Robots.txt files are cached locally for efficiency
+- Cache is stored in `~/.crawl4ai/robots/robots_cache.db`
+- Cache has a default TTL of 7 days
+- If robots.txt can't be fetched (network error, timeout, or a non-200 response), crawling is allowed
+- Returns 403 status code if URL is disallowed
+
+---
+
 ## Putting It All Together

 Here’s a snippet that combines multiple “advanced” features (proxy, PDF, screenshot, SSL, custom headers, and session reuse) into one run. Normally, you’d tailor each setting to your project’s needs.
@@ -321,6 +358,7 @@ You’ve now explored several **advanced** features:
 - **SSL Certificate** retrieval & exporting  
 - **Custom Headers** for language or specialized requests  
 - **Session Persistence** via storage state
+- **Robots.txt Compliance**

 With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline.

--- a/docs/md_v2/advanced/multi-url-crawling.md
+++ b/docs/md_v2/advanced/multi-url-crawling.md
@@ -189,6 +189,44 @@ async def crawl_with_semaphore(urls):
        return results
 ```

+### 4.4 Robots.txt Consideration
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def main():
+    urls = [
+        "https://example1.com",
+        "https://example2.com",
+        "https://example3.com"
+    ]
+    
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.ENABLED,
+        check_robots_txt=True,  # Will respect robots.txt for each URL
+        semaphore_count=3      # Max concurrent requests
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        async for result in crawler.arun_many(urls, config=config):
+            if result.success:
+                print(f"Successfully crawled {result.url}")
+            elif result.status_code == 403 and "robots.txt" in result.error_message:
+                print(f"Skipped {result.url} - blocked by robots.txt")
+            else:
+                print(f"Failed to crawl {result.url}: {result.error_message}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Key Points**:
+- When `check_robots_txt=True`, each URL's robots.txt is checked before crawling
+- Robots.txt files are cached for efficiency
+- URLs disallowed by robots.txt come back as failed results (`success=False`) with a 403 status code
+- Dispatcher handles robots.txt checks automatically for each URL
+
 ## 5. Dispatch Results

 Each crawl result includes dispatch information:
--- a/docs/md_v2/api/arun.md
+++ b/docs/md_v2/api/arun.md
@@ -22,6 +22,7 @@ async def main():
    run_config = CrawlerRunConfig(
        verbose=True,            # Detailed logging
        cache_mode=CacheMode.ENABLED,  # Use normal read/write cache
+        check_robots_txt=True,   # Respect robots.txt rules
        # ... other parameters
    )

@@ -30,8 +31,10 @@ async def main():
            url="https://example.com",
            config=run_config
        )
-        print(result.cleaned_html[:500])
        
+        # Check if blocked by robots.txt
+        if not result.success and result.status_code == 403:
+            print(f"Error: {result.error_message}")
 ```

 **Key Fields**:
@@ -226,6 +229,7 @@ async def main():
        # Core
        verbose=True,
        cache_mode=CacheMode.ENABLED,
+        check_robots_txt=True,   # Respect robots.txt rules
        
        # Content
        word_count_threshold=10,
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -106,6 +106,7 @@ Use these for controlling whether you read or write from a local content cache.
 | **`wait_for`**             | `str or None`           | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction.                     |
 | **`wait_for_images`**      | `bool` (False)          | Wait for images to load before finishing. Slows down if you only want text.                                          |
 | **`delay_before_return_html`** | `float` (0.1)       | Additional pause (seconds) before final HTML is captured. Good for last-second updates.                               |
+| **`check_robots_txt`**     | `bool` (False)          | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency.            |
 | **`mean_delay`** and **`max_range`** | `float` (0.1, 0.3) | If you call `arun_many()`, these define random delay intervals between crawls, helping avoid detection or rate limits. |
 | **`semaphore_count`**      | `int` (5)               | Max concurrency for `arun_many()`. Increase if you have resources for parallel crawls.                                |

@@ -266,17 +267,22 @@ async def main():

 if __name__ == "__main__":
    asyncio.run(main())
+```
+
+## 2.4 Compliance & Ethics
+
+| **Parameter**          | **Type / Default**      | **What It Does**                                                                                                    |
+|-----------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
+| **`check_robots_txt`**| `bool` (False)          | When True, checks and respects robots.txt rules before crawling. Uses efficient caching with SQLite backend.          |
+| **`user_agent`**      | `str` (None)            | User agent string to identify your crawler. Note that the robots.txt check is evaluated against the `BrowserConfig` user agent. |
+
+```python
+run_config = CrawlerRunConfig(
+    check_robots_txt=True,  # Enable robots.txt compliance
+    user_agent="MyBot/1.0"  # Identify your crawler
+)
 ```

-**What’s Happening**:
- **`text_mode=True`** avoids loading images and other heavy resources, speeding up the crawl.  
- We disable caching (`cache_mode=CacheMode.BYPASS`) to always fetch fresh content.  
- We only keep `main.article` content by specifying `css_selector="main.article"`.  
- We exclude external links (`exclude_external_links=True`).  
- We do a quick screenshot (`screenshot=True`) before finishing.
-
---
-
 ## 3. Putting It All Together

 - **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.  
--- a/tests/20241401/test_robot_parser.py
+++ b/tests/20241401/test_robot_parser.py
@@ -0,0 +1,159 @@
+from crawl4ai.utils import RobotsParser
+            
+import asyncio
+import aiohttp
+from aiohttp import web
+import tempfile
+import shutil
+import os, sys, time, json
+
+
+async def test_robots_parser():
+    print("\n=== Testing RobotsParser ===\n")
+    
+    # Setup temporary directory for testing
+    temp_dir = tempfile.mkdtemp()
+    try:
+        # 1. Basic setup test
+        print("1. Testing basic initialization...")
+        parser = RobotsParser(cache_dir=temp_dir)
+        assert os.path.exists(parser.db_path), "Database file not created"
+        print("✓ Basic initialization passed")
+
+        # 2. Test common cases
+        print("\n2. Testing common cases...")
+        allowed = await parser.can_fetch("https://www.example.com", "MyBot/1.0")
+        print(f"✓ Regular website fetch: {'allowed' if allowed else 'denied'}")
+        
+        # Test caching
+        print("Testing cache...")
+        start = time.time()
+        await parser.can_fetch("https://www.example.com", "MyBot/1.0")
+        duration = time.time() - start
+        print(f"✓ Cached lookup took: {duration*1000:.2f}ms")
+        assert duration < 0.03, "Cache lookup too slow"
+
+        # 3. Edge cases
+        print("\n3. Testing edge cases...")
+        
+        # Empty URL
+        result = await parser.can_fetch("", "MyBot/1.0")
+        print(f"✓ Empty URL handled: {'allowed' if result else 'denied'}")
+        
+        # Invalid URL
+        result = await parser.can_fetch("not_a_url", "MyBot/1.0")
+        print(f"✓ Invalid URL handled: {'allowed' if result else 'denied'}")
+        
+        # URL without scheme
+        result = await parser.can_fetch("example.com/page", "MyBot/1.0")
+        print(f"✓ URL without scheme handled: {'allowed' if result else 'denied'}")
+
+        # 4. Test with local server
+        async def start_test_server():
+            app = web.Application()
+            
+            async def robots_txt(request):
+                return web.Response(text="""User-agent: *
+Disallow: /private/
+Allow: /public/
+""")
+            
+            async def malformed_robots(request):
+                return web.Response(text="<<<malformed>>>")
+            
+            async def timeout_robots(request):
+                await asyncio.sleep(5)
+                return web.Response(text="Should timeout")
+            
+            async def empty_robots(request):
+                return web.Response(text="")
+            
+            async def giant_robots(request):
+                return web.Response(text="User-agent: *\nDisallow: /\n" * 10000)
+
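+            # Register the handlers. NOTE: RobotsParser always requests /robots.txt at the host root, so only the first route is ever hit by can_fetch().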
+            app.router.add_get('/robots.txt', robots_txt)
+            app.router.add_get('/malformed/robots.txt', malformed_robots)
+            app.router.add_get('/timeout/robots.txt', timeout_robots)
+            app.router.add_get('/empty/robots.txt', empty_robots)
+            app.router.add_get('/giant/robots.txt', giant_robots)
+            
+            runner = web.AppRunner(app)
+            await runner.setup()
+            site = web.TCPSite(runner, 'localhost', 8080)
+            await site.start()
+            return runner
+
+        runner = await start_test_server()
+        try:
+            print("\n4. Testing robots.txt rules...")
+            base_url = "http://localhost:8080"
+            
+            # Test public access
+            result = await parser.can_fetch(f"{base_url}/public/page", "bot")
+            print(f"Public access (/public/page): {'allowed' if result else 'denied'}")
+            assert result, "Public path should be allowed"
+            
+            # Test private access
+            result = await parser.can_fetch(f"{base_url}/private/secret", "bot")
+            print(f"Private access (/private/secret): {'allowed' if result else 'denied'}")
+            assert not result, "Private path should be denied"
+            
+            # Test malformed
+            result = await parser.can_fetch("http://localhost:8080/malformed/page", "bot")
+            print(f"✓ Malformed robots.txt handled: {'allowed' if result else 'denied'}")
+            
+            # Test timeout
+            start = time.time()
+            result = await parser.can_fetch("http://localhost:8080/timeout/page", "bot")
+            duration = time.time() - start
+            print(f"✓ Timeout handled (took {duration:.2f}s): {'allowed' if result else 'denied'}")
+            assert duration < 3, "Timeout not working"
+            
+            # Test empty
+            result = await parser.can_fetch("http://localhost:8080/empty/page", "bot")
+            print(f"✓ Empty robots.txt handled: {'allowed' if result else 'denied'}")
+            
+            # Test giant file
+            start = time.time()
+            result = await parser.can_fetch("http://localhost:8080/giant/page", "bot")
+            duration = time.time() - start
+            print(f"✓ Giant robots.txt handled (took {duration:.2f}s): {'allowed' if result else 'denied'}")
+
+        finally:
+            await runner.cleanup()
+
+        # 5. Cache manipulation
+        print("\n5. Testing cache manipulation...")
+        
+        # Clear expired
+        parser.clear_expired()
+        print("✓ Clear expired entries completed")
+        
+        # Clear all
+        parser.clear_cache()
+        print("✓ Clear all cache completed")
+        
+        # Test with custom TTL
+        custom_parser = RobotsParser(cache_dir=temp_dir, cache_ttl=1)  # 1 second TTL
+        await custom_parser.can_fetch("https://www.example.com", "bot")
+        print("✓ Custom TTL fetch completed")
+        await asyncio.sleep(1.1)
+        start = time.time()
+        await custom_parser.can_fetch("https://www.example.com", "bot")
+        print(f"✓ TTL expiry working (refetched after {time.time() - start:.2f}s)")
+
+    finally:
+        # Cleanup
+        shutil.rmtree(temp_dir)
+        print("\nTest cleanup completed")
+
+async def main():
+    try:
+        await test_robots_parser()
+    except Exception as e:
+        print(f"Test failed: {str(e)}")
+        raise
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/tests/20241401/tets_robot.py
+++ b/tests/20241401/tets_robot.py
@@ -0,0 +1,62 @@
+import asyncio
+from crawl4ai import *
+
+async def test_real_websites():
+    print("\n=== Testing Real Website Robots.txt Compliance ===\n")
+    
+    browser_config = BrowserConfig(headless=True, verbose=True)
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        
+        # Test cases with URLs
+        test_cases = [
+            # Public sites that should be allowed
+            ("https://example.com", True),  # Simple public site
+            ("https://httpbin.org/get", True),  # API endpoint
+            
+            # Sites with known strict robots.txt
+            ("https://www.facebook.com/robots.txt", False),  # Social media
+            ("https://www.google.com/search", False),  # Search pages
+            
+            # Edge cases
+            ("https://api.github.com", True),  # API service
+            ("https://raw.githubusercontent.com", True),  # Content delivery
+            
+            # Non-existent/error cases
+            ("https://thisisnotarealwebsite.com", True),  # Non-existent domain
+            ("https://localhost:12345", True),  # Invalid port
+        ]
+
+        for url, expected in test_cases:
+            print(f"\nTesting: {url}")
+            try:
+                config = CrawlerRunConfig(
+                    cache_mode=CacheMode.BYPASS,
+                    check_robots_txt=True,  # Enable robots.txt checking
+                    verbose=True
+                )
+                
+                result = await crawler.arun(url=url, config=config)
+                allowed = result.success and not result.error_message
+                
+                print(f"Expected: {'allowed' if expected else 'denied'}")
+                print(f"Actual: {'allowed' if allowed else 'denied'}")
+                print(f"Status Code: {result.status_code}")
+                if result.error_message:
+                    print(f"Error: {result.error_message}")
+                
+                # Optional: Print robots.txt content if available
+                if result.metadata and 'robots_txt' in result.metadata:
+                    print(f"Robots.txt rules:\n{result.metadata['robots_txt']}")
+                
+            except Exception as e:
+                print(f"Test failed with error: {str(e)}")
+
+async def main():
+    try:
+        await test_real_websites()
+    except Exception as e:
+        print(f"Test suite failed: {str(e)}")
+        raise
+
+if __name__ == "__main__":
+    asyncio.run(main())