feat(robots): add robots.txt compliance support

Add support for checking and respecting robots.txt rules before crawling websites:

- Implement RobotsParser class with SQLite caching
- Add check_robots_txt parameter to CrawlerRunConfig
- Integrate robots.txt checking in AsyncWebCrawler
- Update documentation with robots.txt compliance examples
- Add tests for robot parser functionality

The cache uses WAL mode for better concurrency and has a default TTL of 7 days.
2025-01-21 17:54:13 +08:00
parent 9247877037
commit d09c611d15
11 changed files with 482 additions and 12 deletions
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -40,6 +40,7 @@ from .utils import (
    fast_format_html,
    create_box_message,
    get_error_context,
+    RobotsParser,
 )

 from typing import Union, AsyncGenerator, List, TypeVar
@@ -203,6 +204,9 @@ class AsyncWebCrawler:
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)

+        # Initialize robots parser
+        self.robots_parser = RobotsParser()
+
        self.ready = False

    async def start(self):
@@ -414,6 +418,18 @@ class AsyncWebCrawler:
                    if user_agent:
                        self.crawler_strategy.update_user_agent(user_agent)

+                    # Check robots.txt if enabled
+                    if config and config.check_robots_txt:
+                        if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
+                            return CrawlResult(
+                                url=url,
+                                html="",
+                                success=False,
+                                status_code=403,
+                                error_message="Access denied by robots.txt",
+                                response_headers={"X-Robots-Status": "Blocked by robots.txt"}
+                            )
+
                    # Pass config to crawl method
                    async_response = await self.crawler_strategy.crawl(
                        url,