feat(robots): add robots.txt compliance support

Add support for checking and respecting robots.txt rules before crawling websites:
- Implement RobotsParser class with SQLite caching
- Add check_robots_txt parameter to CrawlerRunConfig
- Integrate robots.txt checking in AsyncWebCrawler
- Update documentation with robots.txt compliance examples
- Add tests for robot parser functionality

The cache uses WAL mode for better concurrency and has a default TTL of 7 days.
This commit is contained in:
UncleCode
2025-01-21 17:54:13 +08:00
parent 9247877037
commit d09c611d15
11 changed files with 482 additions and 12 deletions

View File

@@ -40,6 +40,7 @@ from .utils import (
fast_format_html,
create_box_message,
get_error_context,
RobotsParser,
)
from typing import Union, AsyncGenerator, List, TypeVar
@@ -203,6 +204,9 @@ class AsyncWebCrawler:
os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
# Initialize robots parser
self.robots_parser = RobotsParser()
self.ready = False
async def start(self):
@@ -414,6 +418,18 @@ class AsyncWebCrawler:
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
# Check robots.txt if enabled
if config and config.check_robots_txt:
if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
return CrawlResult(
url=url,
html="",
success=False,
status_code=403,
error_message="Access denied by robots.txt",
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
)
# Pass config to crawl method
async_response = await self.crawler_strategy.crawl(
url,