feat(robots): add robots.txt compliance support

Add support for checking and respecting robots.txt rules before crawling websites: - Implement RobotsParser class with SQLite caching - Add check_robots_txt parameter to CrawlerRunConfig - Integrate robots.txt checking in AsyncWebCrawler - Update documentation with robots.txt compliance examples - Add tests for robot parser functionality The cache uses WAL mode for better concurrency and has a default TTL of 7 days.
2025-01-21 17:54:13 +08:00
parent 9247877037
commit d09c611d15
11 changed files with 482 additions and 12 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -227,4 +227,7 @@ tree.md
 .do
 /plans
 .codeiumignore
-todo/
+todo/
 # windsurf rules
 .windsurfrules
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
 ### [Added] 2025-01-21
 - Added robots.txt compliance support with efficient SQLite-based caching
 - New `check_robots_txt` parameter in CrawlerRunConfig to enable robots.txt checking
 - Documentation updates for robots.txt compliance features and examples
 - Automated robots.txt checking integrated into AsyncWebCrawler with 403 status codes for blocked URLs
 ### [Added] 2025-01-20
 - Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request
 - Updated documentation with examples for using proxy configuration in crawl operations
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -372,6 +372,7 @@ class CrawlerRunConfig:
        # Optional Parameters
        stream (bool): If True, stream the page content as it is being loaded.
        url: str = None  # This is not a compulsory parameter
        check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
    """
    def __init__(
@@ -442,6 +443,7 @@ class CrawlerRunConfig:
        # Streaming Parameters
        stream: bool = False,
        url: str = None,
        check_robots_txt: bool = False,
    ):
        self.url = url
@@ -521,6 +523,9 @@ class CrawlerRunConfig:
        # Streaming Parameters
        self.stream = stream
        # Robots.txt Handling Parameters
        self.check_robots_txt = check_robots_txt
        # Validate type of extraction strategy and chunking strategy if they are provided
        if self.extraction_strategy is not None and not isinstance(
            self.extraction_strategy, ExtractionStrategy
@@ -617,6 +622,7 @@ class CrawlerRunConfig:
            # Streaming Parameters
            stream=kwargs.get("stream", False),
            url=kwargs.get("url"),
            check_robots_txt=kwargs.get("check_robots_txt", False),
        )
    # Create a function that returns a dict of the object
@@ -679,6 +685,7 @@ class CrawlerRunConfig:
            "log_console": self.log_console,
            "stream": self.stream,
            "url": self.url,
            "check_robots_txt": self.check_robots_txt,
        }
    def clone(self, **kwargs):
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -40,6 +40,7 @@ from .utils import (
    fast_format_html,
    create_box_message,
    get_error_context,
    RobotsParser,
 )
 from typing import Union, AsyncGenerator, List, TypeVar
@@ -203,6 +204,9 @@ class AsyncWebCrawler:
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
        # Initialize robots parser
        self.robots_parser = RobotsParser()
        self.ready = False
    async def start(self):
@@ -414,6 +418,18 @@ class AsyncWebCrawler:
                    if user_agent:
                        self.crawler_strategy.update_user_agent(user_agent)
                    # Check robots.txt if enabled
                    if config and config.check_robots_txt:
                        if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
                            return CrawlResult(
                                url=url,
                                html="",
                                success=False,
                                status_code=403,
                                error_message="Access denied by robots.txt",
                                response_headers={"X-Robots-Status": "Blocked by robots.txt"}
                            )
                    # Pass config to crawl method
                    async_response = await self.crawler_strategy.crawl(
                        url,
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -23,6 +23,138 @@ import pstats
 from functools import wraps
 import asyncio
 import sqlite3
 import hashlib
 from urllib.parse import urljoin, urlparse
 from urllib.robotparser import RobotFileParser
 import aiohttp
 class RobotsParser:
    """Check URLs against robots.txt rules, caching each host's file in SQLite.

    Rules are stored per network location (``urlparse(url).netloc``) in
    ``robots_cache.db`` inside ``cache_dir`` and count as fresh for
    ``cache_ttl`` seconds. The policy is fail-open: when the URL has no host,
    or robots.txt cannot be fetched or is empty, fetching is allowed.
    """

    # Default 7 days cache TTL
    CACHE_TTL = 7 * 24 * 60 * 60
    # Seconds allowed for downloading one robots.txt file
    FETCH_TIMEOUT = 2

    def __init__(self, cache_dir=None, cache_ttl=None):
        """Create the cache directory and database if they do not exist.

        Args:
            cache_dir: Directory that holds ``robots_cache.db``. When omitted,
                a ``.crawl4ai/robots`` folder under ``get_home_folder()``.
            cache_ttl: Seconds a cached robots.txt stays fresh. ``None``
                selects ``CACHE_TTL``; ``0`` means cached rules are never
                considered fresh.
        """
        self.cache_dir = cache_dir or os.path.join(get_home_folder(), ".crawl4ai", "robots")
        # Compare against None so an explicit 0 is honoured instead of
        # silently turning into the 7-day default.
        self.cache_ttl = self.CACHE_TTL if cache_ttl is None else cache_ttl
        os.makedirs(self.cache_dir, exist_ok=True)
        self.db_path = os.path.join(self.cache_dir, "robots_cache.db")
        self._init_db()

    def _execute(self, sql: str, params: tuple = ()) -> list:
        """Run one statement on a short-lived connection and return its rows.

        ``with conn`` only commits or rolls back the transaction, so the
        connection is closed explicitly instead of being left to the garbage
        collector.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                return conn.execute(sql, params).fetchall()
        finally:
            conn.close()

    def _init_db(self):
        """Enable WAL journaling and create the cache table if it is missing."""
        # Use WAL mode for better concurrency and performance
        self._execute("PRAGMA journal_mode=WAL")
        # `domain` is the PRIMARY KEY, which SQLite already indexes, so no
        # additional index on that column is created.
        self._execute("""
            CREATE TABLE IF NOT EXISTS robots_cache (
                domain TEXT PRIMARY KEY,
                rules TEXT NOT NULL,
                fetch_time INTEGER NOT NULL,
                hash TEXT NOT NULL
            )
        """)

    def _get_cached_rules(self, domain: str) -> "tuple[str | None, bool]":
        """Look up the cached robots.txt for ``domain``.

        Returns:
            ``(rules, is_fresh)``, or ``(None, False)`` when nothing is cached.
        """
        rows = self._execute(
            "SELECT rules, fetch_time FROM robots_cache WHERE domain = ?",
            (domain,),
        )
        if not rows:
            return None, False
        rules, fetch_time = rows[0]
        # Check if cache is still fresh based on TTL
        return rules, (time.time() - fetch_time) < self.cache_ttl

    def _cache_rules(self, domain: str, content: str):
        """Store robots.txt ``content`` for ``domain`` and stamp the fetch time.

        The row is rewritten on every call, even when the content is
        unchanged. Skipping the write for identical content would leave the
        old ``fetch_time`` in place, so an unchanged robots.txt would stay
        stale after the TTL and be downloaded again on every check.
        """
        hash_val = hashlib.md5(content.encode()).hexdigest()
        self._execute(
            """INSERT OR REPLACE INTO robots_cache
               (domain, rules, fetch_time, hash)
               VALUES (?, ?, ?, ?)""",
            (domain, content, int(time.time()), hash_val),
        )

    async def can_fetch(self, url: str, user_agent: str = "*") -> bool:
        """
        Check if URL can be fetched according to robots.txt rules.

        Args:
            url: The URL to check
            user_agent: User agent string to check against (default: "*")

        Returns:
            bool: True if allowed, False if disallowed by robots.txt. Also
            True when the URL has no host, when robots.txt cannot be
            downloaded (non-200 status, timeout, connection error), or when
            it is empty.
        """
        # Handle empty/invalid URLs
        try:
            parsed = urlparse(url)
            domain = parsed.netloc
        except Exception:
            return True
        if not domain:
            return True

        # Fast path - check cache first
        rules, is_fresh = self._get_cached_rules(domain)

        # If rules not found or stale, fetch new ones
        if not is_fresh:
            # Ensure we use the same scheme as the input URL
            scheme = parsed.scheme or 'http'
            robots_url = f"{scheme}://{domain}/robots.txt"
            try:
                timeout = aiohttp.ClientTimeout(total=self.FETCH_TIMEOUT)
                async with aiohttp.ClientSession() as session:
                    async with session.get(robots_url, timeout=timeout) as response:
                        if response.status != 200:
                            return True
                        rules = await response.text()
            except Exception:
                # On any error (timeout, connection failed, etc), allow access.
                # `Exception` rather than a bare except, so task cancellation
                # and KeyboardInterrupt still propagate to the caller.
                return True
            try:
                self._cache_rules(domain, rules)
            except sqlite3.Error:
                # Caching is best-effort: a failed write must not discard the
                # rules that were just downloaded.
                pass

        if not rules:
            return True

        # Create parser for this check
        parser = RobotFileParser()
        parser.parse(rules.splitlines())

        # If parser can't read rules, allow access
        if not parser.mtime():
            return True

        return parser.can_fetch(user_agent, url)

    def clear_cache(self):
        """Clear all cached robots.txt entries"""
        self._execute("DELETE FROM robots_cache")

    def clear_expired(self):
        """Remove only expired entries from cache"""
        expire_time = int(time.time()) - self.cache_ttl
        self._execute("DELETE FROM robots_cache WHERE fetch_time < ?", (expire_time,))
 class InvalidCSSSelectorError(Exception):
    """Exception type for invalid CSS selectors; it adds nothing to Exception."""
--- a/docs/md_v2/advanced/advanced-features.md
+++ b/docs/md_v2/advanced/advanced-features.md
@@ -8,6 +8,7 @@ Crawl4AI offers multiple power-user features that go beyond simple crawling. Thi
 3. **Handling SSL Certificates**  
 4. **Custom Headers**  
 5. **Session Persistence & Local Storage**
 6. **Robots.txt Compliance**
 > **Prerequisites**  
 > - You have a basic grasp of [AsyncWebCrawler Basics](../core/simple-crawling.md)  
@@ -251,6 +252,42 @@ You can sign in once, export the browser context, and reuse it later—without r
 ---
 ## 6. Robots.txt Compliance
 Crawl4AI supports respecting robots.txt rules with efficient caching:
 ```python
 import asyncio
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 async def main():
    # Enable robots.txt checking in config
    config = CrawlerRunConfig(
        check_robots_txt=True  # Will check and respect robots.txt rules
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://example.com",
            config=config
        )
        if not result.success and result.status_code == 403:
            print("Access denied by robots.txt")
 if __name__ == "__main__":
    asyncio.run(main())
 ```
 **Key Points**
 - Robots.txt files are cached locally for efficiency
 - Cache is stored in `~/.crawl4ai/robots/robots_cache.db`
 - Cache has a default TTL of 7 days
 - If robots.txt can't be fetched, crawling is allowed
 - Returns 403 status code if URL is disallowed
 ---
 ## Putting It All Together
 Here’s a snippet that combines multiple “advanced” features (proxy, PDF, screenshot, SSL, custom headers, and session reuse) into one run. Normally, you’d tailor each setting to your project’s needs.
@@ -321,6 +358,7 @@ You’ve now explored several **advanced** features:
 - **SSL Certificate** retrieval & exporting  
 - **Custom Headers** for language or specialized requests  
 - **Session Persistence** via storage state
 - **Robots.txt Compliance**
 With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline.
--- a/docs/md_v2/advanced/multi-url-crawling.md
+++ b/docs/md_v2/advanced/multi-url-crawling.md
@@ -189,6 +189,44 @@ async def crawl_with_semaphore(urls):
        return results
 ```
 ### 4.4 Robots.txt Consideration
 ```python
 import asyncio
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
 async def main():
    urls = [
        "https://example1.com",
        "https://example2.com",
        "https://example3.com"
    ]
    config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        check_robots_txt=True,  # Will respect robots.txt for each URL
        semaphore_count=3      # Max concurrent requests
    )
    async with AsyncWebCrawler() as crawler:
        async for result in crawler.arun_many(urls, config=config):
            if result.success:
                print(f"Successfully crawled {result.url}")
            elif result.status_code == 403 and "robots.txt" in result.error_message:
                print(f"Skipped {result.url} - blocked by robots.txt")
            else:
                print(f"Failed to crawl {result.url}: {result.error_message}")
 if __name__ == "__main__":
    asyncio.run(main())
 ```
 **Key Points**:
 - When `check_robots_txt=True`, each URL's robots.txt is checked before crawling
 - Robots.txt files are cached for efficiency
 - URLs disallowed by robots.txt return a 403 status code (if robots.txt itself cannot be fetched, the crawl proceeds)
 - Dispatcher handles robots.txt checks automatically for each URL
 ## 5. Dispatch Results
 Each crawl result includes dispatch information:
--- a/docs/md_v2/api/arun.md
+++ b/docs/md_v2/api/arun.md
@@ -22,6 +22,7 @@ async def main():
    run_config = CrawlerRunConfig(
        verbose=True,            # Detailed logging
        cache_mode=CacheMode.ENABLED,  # Use normal read/write cache
        check_robots_txt=True,   # Respect robots.txt rules
        # ... other parameters
    )
@@ -30,8 +31,10 @@ async def main():
            url="https://example.com",
            config=run_config
        )
-        print(result.cleaned_html[:500])
+        
-
+        # Check if blocked by robots.txt
        if not result.success and result.status_code == 403:
            print(f"Error: {result.error_message}")
 ```
 **Key Fields**:
@@ -226,6 +229,7 @@ async def main():
        # Core
        verbose=True,
        cache_mode=CacheMode.ENABLED,
        check_robots_txt=True,   # Respect robots.txt rules
        # Content
        word_count_threshold=10,
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -106,6 +106,7 @@ Use these for controlling whether you read or write from a local content cache.
 | **`wait_for`**             | `str or None`           | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction.                     |
 | **`wait_for_images`**      | `bool` (False)          | Wait for images to load before finishing. Slows down if you only want text.                                          |
 | **`delay_before_return_html`** | `float` (0.1)       | Additional pause (seconds) before final HTML is captured. Good for last-second updates.                               |
 | **`check_robots_txt`**     | `bool` (False)          | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency.            |
 | **`mean_delay`** and **`max_range`** | `float` (0.1, 0.3) | If you call `arun_many()`, these define random delay intervals between crawls, helping avoid detection or rate limits. |
 | **`semaphore_count`**      | `int` (5)               | Max concurrency for `arun_many()`. Increase if you have resources for parallel crawls.                                |
@@ -266,17 +267,21 @@ async def main():
 if __name__ == "__main__":
    asyncio.run(main())
 ## 2.4 Compliance & Ethics
 | **Parameter**          | **Type / Default**      | **What It Does**                                                                                                    |
 |-----------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
 | **`check_robots_txt`**| `bool` (False)          | When True, checks and respects robots.txt rules before crawling. Uses efficient caching with SQLite backend.          |
 | **`user_agent`**      | `str` (None)            | User agent string to identify your crawler. Used for robots.txt checking when enabled.                                |
 ```python
 run_config = CrawlerRunConfig(
    check_robots_txt=True,  # Enable robots.txt compliance
    user_agent="MyBot/1.0"  # Identify your crawler
 )
 ```
 **What’s Happening**:
 - **`text_mode=True`** avoids loading images and other heavy resources, speeding up the crawl.  
 - We disable caching (`cache_mode=CacheMode.BYPASS`) to always fetch fresh content.  
 - We only keep `main.article` content by specifying `css_selector="main.article"`.  
 - We exclude external links (`exclude_external_links=True`).  
 - We do a quick screenshot (`screenshot=True`) before finishing.
 ---
 ## 3. Putting It All Together
 - **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.  
--- a/tests/20241401/test_robot_parser.py
+++ b/tests/20241401/test_robot_parser.py
@@ -0,0 +1,159 @@
 from crawl4ai.utils import RobotsParser
 import asyncio
 import aiohttp
 from aiohttp import web
 import tempfile
 import shutil
 import os, sys, time, json
 async def test_robots_parser():
    """Exercise RobotsParser: setup, URL edge cases, rule evaluation, caching.

    Section 2 contacts https://www.example.com and only prints what it
    observes, because that outcome depends on the live network. Every
    assertion runs against local aiohttp servers whose robots.txt is known.

    Each robots.txt scenario is served by its own server on its own port.
    RobotsParser always requests ``/robots.txt`` at the root of the host and
    caches the result per ``host:port``, so scenarios mounted under sub-paths
    of one server would never be requested.
    """
    print("\n=== Testing RobotsParser ===\n")

    # Setup temporary directory for testing
    temp_dir = tempfile.mkdtemp()

    try:
        # 1. Basic setup test
        print("1. Testing basic initialization...")
        parser = RobotsParser(cache_dir=temp_dir)
        assert os.path.exists(parser.db_path), "Database file not created"
        print("✓ Basic initialization passed")

        # 2. Test common cases (live network: reported, not asserted)
        print("\n2. Testing common cases...")
        allowed = await parser.can_fetch("https://www.example.com", "MyBot/1.0")
        print(f"✓ Regular website fetch: {'allowed' if allowed else 'denied'}")

        # 3. Edge cases: none of these URLs has a host, so all are allowed
        print("\n3. Testing edge cases...")
        edge_cases = [
            ("Empty URL", ""),
            ("Invalid URL", "not_a_url"),
            ("URL without scheme", "example.com/page"),
        ]
        for label, edge_url in edge_cases:
            result = await parser.can_fetch(edge_url, "MyBot/1.0")
            print(f"✓ {label} handled: {'allowed' if result else 'denied'}")
            assert result, f"{label} should be allowed"

        # 4. Test with local servers
        # Counts downloads of the regular robots.txt so cache hits can be
        # verified without relying on wall-clock timing.
        hits = {"robots_txt": 0}

        async def robots_txt(request):
            hits["robots_txt"] += 1
            return web.Response(text="User-agent: *\nDisallow: /private/\nAllow: /public/\n")

        async def malformed_robots(request):
            return web.Response(text="<<<malformed>>>")

        async def timeout_robots(request):
            await asyncio.sleep(5)
            return web.Response(text="Should timeout")

        async def empty_robots(request):
            return web.Response(text="")

        async def giant_robots(request):
            return web.Response(text="User-agent: *\nDisallow: /\n" * 10000)

        async def start_test_server(handler):
            """Serve `handler` as /robots.txt on a free port; return (runner, base_url)."""
            app = web.Application()
            app.router.add_get('/robots.txt', handler)
            runner = web.AppRunner(app)
            await runner.setup()
            # Port 0 lets the OS pick a free port, so the test cannot collide
            # with a service already bound to a fixed port.
            site = web.TCPSite(runner, '127.0.0.1', 0)
            await site.start()
            port = runner.addresses[0][1]
            return runner, f"http://127.0.0.1:{port}"

        async def check_scenario(handler, path):
            """Query `path` on a fresh server for `handler`; return (allowed, seconds)."""
            runner, base_url = await start_test_server(handler)
            try:
                start = time.time()
                allowed = await parser.can_fetch(f"{base_url}{path}", "bot")
                return allowed, time.time() - start
            finally:
                await runner.cleanup()

        print("\n4. Testing robots.txt rules...")
        runner, base_url = await start_test_server(robots_txt)
        try:
            # Test public access
            result = await parser.can_fetch(f"{base_url}/public/page", "bot")
            print(f"Public access (/public/page): {'allowed' if result else 'denied'}")
            assert result, "Public path should be allowed"

            # Test private access
            result = await parser.can_fetch(f"{base_url}/private/secret", "bot")
            print(f"Private access (/private/secret): {'allowed' if result else 'denied'}")
            assert not result, "Private path should be denied"

            # Test caching: the second lookup must not download robots.txt again
            assert hits["robots_txt"] == 1, "Second lookup was not served from cache"
            print("✓ Second lookup served from cache")
        finally:
            await runner.cleanup()

        # Test malformed: no usable rules, so access is allowed
        result, _ = await check_scenario(malformed_robots, "/page")
        print(f"✓ Malformed robots.txt handled: {'allowed' if result else 'denied'}")
        assert result, "Malformed robots.txt should allow access"

        # Test timeout: the fetch must give up well before the 5 s handler replies
        result, duration = await check_scenario(timeout_robots, "/page")
        print(f"✓ Timeout handled (took {duration:.2f}s): {'allowed' if result else 'denied'}")
        assert duration < 3, "Timeout not working"
        assert result, "Timed-out robots.txt should allow access"

        # Test empty
        result, _ = await check_scenario(empty_robots, "/page")
        print(f"✓ Empty robots.txt handled: {'allowed' if result else 'denied'}")
        assert result, "Empty robots.txt should allow access"

        # Test giant file: every group disallows everything
        result, duration = await check_scenario(giant_robots, "/page")
        print(f"✓ Giant robots.txt handled (took {duration:.2f}s): {'allowed' if result else 'denied'}")
        assert not result, "Giant robots.txt disallows every path"

        # 5. Cache manipulation
        print("\n5. Testing cache manipulation...")

        # Clear expired
        parser.clear_expired()
        print("✓ Clear expired entries completed")

        # Clear all
        parser.clear_cache()
        print("✓ Clear all cache completed")

        # Test with custom TTL
        runner, base_url = await start_test_server(robots_txt)
        try:
            downloads_before = hits["robots_txt"]
            custom_parser = RobotsParser(cache_dir=temp_dir, cache_ttl=1)  # 1 second TTL
            await custom_parser.can_fetch(f"{base_url}/public/page", "bot")
            print("✓ Custom TTL fetch completed")
            await asyncio.sleep(1.1)
            await custom_parser.can_fetch(f"{base_url}/public/page", "bot")
            assert hits["robots_txt"] == downloads_before + 2, "Expired entry was not refetched"
            print("✓ TTL expiry working (robots.txt refetched after expiry)")
        finally:
            await runner.cleanup()

    finally:
        # Cleanup
        shutil.rmtree(temp_dir)
        print("\nTest cleanup completed")
 async def main():
    """Run test_robots_parser(), printing any failure before re-raising it."""
    try:
        await test_robots_parser()
    except Exception as e:
        print(f"Test failed: {str(e)}")
        raise
 # Script entry point: run the async test when this file is executed directly.
 if __name__ == "__main__":
    asyncio.run(main())
--- a/tests/20241401/tets_robot.py
+++ b/tests/20241401/tets_robot.py
@@ -0,0 +1,62 @@
 import asyncio
 from crawl4ai import *
 async def test_real_websites():
    """Crawl real sites with robots.txt checking on and compare the outcomes.

    A URL counts as denied only when the crawler reports a robots.txt block
    (status code 403 with "robots.txt" in the error message). Other crawl
    failures, such as an unreachable host, are not robots.txt decisions and
    count as allowed.

    Raises:
        AssertionError: If the robots.txt outcome of any URL differs from the
            expectation listed in ``test_cases``.
    """
    print("\n=== Testing Real Website Robots.txt Compliance ===\n")

    browser_config = BrowserConfig(headless=True, verbose=True)
    # The run configuration is identical for every URL, so build it once.
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        check_robots_txt=True,  # Enable robots.txt checking
        verbose=True
    )
    mismatches = []

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Test cases with URLs
        test_cases = [
            # Public sites that should be allowed
            ("https://example.com", True),  # Simple public site
            ("https://httpbin.org/get", True),  # API endpoint

            # Sites with known strict robots.txt
            ("https://www.facebook.com/robots.txt", False),  # Social media
            ("https://www.google.com/search", False),  # Search pages

            # Edge cases
            ("https://api.github.com", True),  # API service
            ("https://raw.githubusercontent.com", True),  # Content delivery

            # Non-existent/error cases
            ("https://thisisnotarealwebsite.com", True),  # Non-existent domain
            ("https://localhost:12345", True),  # Invalid port
        ]

        for url, expected in test_cases:
            print(f"\nTesting: {url}")
            try:
                result = await crawler.arun(url=url, config=config)
                # Only the robots.txt block counts as "denied"; a crawl that
                # fails for another reason was still permitted by robots.txt.
                blocked = result.status_code == 403 and "robots.txt" in (result.error_message or "")
                allowed = not blocked
                print(f"Expected: {'allowed' if expected else 'denied'}")
                print(f"Actual: {'allowed' if allowed else 'denied'}")
                print(f"Status Code: {result.status_code}")
                if result.error_message:
                    print(f"Error: {result.error_message}")

                # Optional: Print robots.txt content if available
                if result.metadata and 'robots_txt' in result.metadata:
                    print(f"Robots.txt rules:\n{result.metadata['robots_txt']}")

                if allowed != expected:
                    mismatches.append(url)
                    print("✗ Outcome differs from expectation")
            except Exception as e:
                print(f"Test failed with error: {str(e)}")

    # The expectations were previously printed but never checked.
    assert not mismatches, f"Unexpected robots.txt outcome for: {', '.join(mismatches)}"
 async def main():
    """Run test_real_websites(), printing any failure before re-raising it."""
    try:
        await test_real_websites()
    except Exception as e:
        print(f"Test suite failed: {str(e)}")
        raise
 # Script entry point: run the async test when this file is executed directly.
 if __name__ == "__main__":
    asyncio.run(main())