feat(robots): add robots.txt compliance support

Add support for checking and respecting robots.txt rules before crawling websites:
- Implement RobotsParser class with SQLite caching
- Add check_robots_txt parameter to CrawlerRunConfig
- Integrate robots.txt checking in AsyncWebCrawler (see the usage sketch after this list)
- Update documentation with robots.txt compliance examples
- Add tests for robot parser functionality
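A minimal usage sketch of the new flag, assuming AsyncWebCrawler and CrawlerRunConfig are importable from the package top level and that arun() accepts a config argument (adjust imports to the actual module layout):

```python
import asyncio

# Assumed top-level imports; adjust if the package layout differs.
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    config = CrawlerRunConfig(check_robots_txt=True)  # opt in to robots.txt checks
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        if not result.success:
            # Expected when robots.txt disallows the URL (exact error reporting may differ).
            print("Crawl skipped:", result.error_message)

asyncio.run(main())
```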

The cache uses WAL mode for better concurrency and has a default TTL of 7 days.
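For illustration only, a hypothetical sketch of that caching scheme (a SQLite table opened in WAL mode with a 7-day freshness window); the actual RobotsParser internals may differ:

```python
import sqlite3
import time

SEVEN_DAYS = 7 * 24 * 3600  # default TTL described above

def open_cache(path="robots_cache.db"):
    conn = sqlite3.connect(path)
    conn.execute("PRAGMA journal_mode=WAL")  # WAL mode allows concurrent readers
    conn.execute(
        "CREATE TABLE IF NOT EXISTS robots "
        "(domain TEXT PRIMARY KEY, body TEXT, fetched_at REAL)"
    )
    return conn

def cached_robots(conn, domain):
    row = conn.execute(
        "SELECT body, fetched_at FROM robots WHERE domain = ?", (domain,)
    ).fetchone()
    if row and time.time() - row[1] < SEVEN_DAYS:
        return row[0]   # still fresh
    return None         # missing or expired; caller re-fetches and upserts
```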
Author: UncleCode
Date: 2025-01-21 17:54:13 +08:00
Parent: 9247877037
Commit: d09c611d15
11 changed files with 482 additions and 12 deletions


@@ -372,6 +372,7 @@ class CrawlerRunConfig:
# Optional Parameters
stream (bool): If True, stream the page content as it is being loaded.
url: str = None # This is not a compulsory parameter
check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
"""
def __init__(
@@ -442,6 +443,7 @@ class CrawlerRunConfig:
# Streaming Parameters
stream: bool = False,
url: str = None,
check_robots_txt: bool = False,
):
self.url = url
@@ -521,6 +523,9 @@ class CrawlerRunConfig:
# Streaming Parameters
self.stream = stream
# Robots.txt Handling Parameters
self.check_robots_txt = check_robots_txt
# Validate type of extraction strategy and chunking strategy if they are provided
if self.extraction_strategy is not None and not isinstance(
self.extraction_strategy, ExtractionStrategy
@@ -617,6 +622,7 @@ class CrawlerRunConfig:
# Streaming Parameters
stream=kwargs.get("stream", False),
url=kwargs.get("url"),
check_robots_txt=kwargs.get("check_robots_txt", False),
)
# Create a function that returns a dict of the object
@@ -679,6 +685,7 @@ class CrawlerRunConfig:
"log_console": self.log_console,
"stream": self.stream,
"url": self.url,
"check_robots_txt": self.check_robots_txt,
}
def clone(self, **kwargs):
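As a quick sanity check of the serialization changes above, a hedged sketch showing the flag surviving clone(), assuming clone() copies existing values before applying overrides and that CrawlerRunConfig is importable from the top level:

```python
from crawl4ai import CrawlerRunConfig  # assumed top-level import

base = CrawlerRunConfig(check_robots_txt=True)
variant = base.clone(stream=True)         # clone() is defined in the diff above
assert variant.check_robots_txt is True   # assumes clone() preserves values not overridden
```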