feat(robots): add robots.txt compliance support

Add support for checking and respecting robots.txt rules before crawling websites:
- Implement RobotsParser class with SQLite caching
- Add check_robots_txt parameter to CrawlerRunConfig
- Integrate robots.txt checking in AsyncWebCrawler
- Update documentation with robots.txt compliance examples
- Add tests for robot parser functionality

The cache uses WAL mode for better concurrency and has a default TTL of 7 days.
This commit is contained in:
UncleCode
2025-01-21 17:54:13 +08:00
parent 9247877037
commit d09c611d15
11 changed files with 482 additions and 12 deletions

3
.gitignore vendored
View File

@@ -228,3 +228,6 @@ tree.md
/plans
.codeiumignore
todo/
# windsurf rules
.windsurfrules

View File

@@ -1,3 +1,9 @@
### [Added] 2025-01-21
- Added robots.txt compliance support with efficient SQLite-based caching
- New `check_robots_txt` parameter in CrawlerRunConfig to enable robots.txt checking
- Documentation updates for robots.txt compliance features and examples
- Automated robots.txt checking integrated into AsyncWebCrawler with 403 status codes for blocked URLs
### [Added] 2025-01-20
- Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request
- Updated documentation with examples for using proxy configuration in crawl operations

View File

@@ -372,6 +372,7 @@ class CrawlerRunConfig:
# Optional Parameters
stream (bool): If True, stream the page content as it is being loaded.
url: str = None # This is not a compulsory parameter
check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
"""
def __init__(
@@ -442,6 +443,7 @@ class CrawlerRunConfig:
# Streaming Parameters
stream: bool = False,
url: str = None,
check_robots_txt: bool = False,
):
self.url = url
@@ -521,6 +523,9 @@ class CrawlerRunConfig:
# Streaming Parameters
self.stream = stream
# Robots.txt Handling Parameters
self.check_robots_txt = check_robots_txt
# Validate type of extraction strategy and chunking strategy if they are provided
if self.extraction_strategy is not None and not isinstance(
self.extraction_strategy, ExtractionStrategy
@@ -617,6 +622,7 @@ class CrawlerRunConfig:
# Streaming Parameters
stream=kwargs.get("stream", False),
url=kwargs.get("url"),
check_robots_txt=kwargs.get("check_robots_txt", False),
)
# Create a function that returns a dict of the object
@@ -679,6 +685,7 @@ class CrawlerRunConfig:
"log_console": self.log_console,
"stream": self.stream,
"url": self.url,
"check_robots_txt": self.check_robots_txt,
}
def clone(self, **kwargs):

View File

@@ -40,6 +40,7 @@ from .utils import (
fast_format_html,
create_box_message,
get_error_context,
RobotsParser,
)
from typing import Union, AsyncGenerator, List, TypeVar
@@ -203,6 +204,9 @@ class AsyncWebCrawler:
os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
# Initialize robots parser
self.robots_parser = RobotsParser()
self.ready = False
async def start(self):
@@ -414,6 +418,18 @@ class AsyncWebCrawler:
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
# Check robots.txt if enabled
if config and config.check_robots_txt:
if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
return CrawlResult(
url=url,
html="",
success=False,
status_code=403,
error_message="Access denied by robots.txt",
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
)
# Pass config to crawl method
async_response = await self.crawler_strategy.crawl(
url,

View File

@@ -23,6 +23,138 @@ import pstats
from functools import wraps
import asyncio
import sqlite3
import hashlib
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import aiohttp
class RobotsParser:
    """Check URLs against robots.txt rules, caching fetched files in SQLite.

    Each robots.txt body is stored per domain (the URL's netloc) in
    ``robots_cache.db`` inside ``cache_dir`` together with the time it was
    fetched. Entries older than ``cache_ttl`` seconds are re-downloaded on
    the next ``can_fetch`` call for that domain.

    The parser is deliberately permissive: whenever robots.txt cannot be
    obtained or is empty, access is allowed.
    """

    # Default 7 days cache TTL
    CACHE_TTL = 7 * 24 * 60 * 60
    # Seconds allowed for downloading a robots.txt file before giving up
    FETCH_TIMEOUT = 2

    def __init__(self, cache_dir=None, cache_ttl=None):
        """Create the cache directory and database if they do not exist.

        Args:
            cache_dir: Directory holding ``robots_cache.db``. Defaults to
                ``<home folder>/.crawl4ai/robots``.
            cache_ttl: Seconds a cached robots.txt stays fresh. A falsy
                value (``None`` or ``0``) selects ``CACHE_TTL``.
        """
        self.cache_dir = cache_dir or os.path.join(get_home_folder(), ".crawl4ai", "robots")
        self.cache_ttl = cache_ttl or self.CACHE_TTL
        os.makedirs(self.cache_dir, exist_ok=True)
        self.db_path = os.path.join(self.cache_dir, "robots_cache.db")
        self._init_db()

    def _init_db(self):
        """Enable WAL journaling and create the cache table if needed."""
        # `with conn:` only scopes a transaction; the connection itself must
        # be closed explicitly, otherwise every call leaks a file handle.
        conn = sqlite3.connect(self.db_path)
        try:
            # Use WAL mode for better concurrency and performance
            conn.execute("PRAGMA journal_mode=WAL")
            with conn:
                # `domain` is the PRIMARY KEY, which SQLite already backs
                # with an index, so no separate index is created.
                conn.execute("""
                    CREATE TABLE IF NOT EXISTS robots_cache (
                        domain TEXT PRIMARY KEY,
                        rules TEXT NOT NULL,
                        fetch_time INTEGER NOT NULL,
                        hash TEXT NOT NULL
                    )
                """)
        finally:
            conn.close()

    def _get_cached_rules(self, domain: str) -> "tuple[str | None, bool]":
        """Look up the cached robots.txt for ``domain``.

        Returns:
            ``(rules, is_fresh)``. ``rules`` is ``None`` and ``is_fresh`` is
            ``False`` when the domain has no cache entry; otherwise
            ``is_fresh`` tells whether the entry is younger than the TTL.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            row = conn.execute(
                "SELECT rules, fetch_time FROM robots_cache WHERE domain = ?",
                (domain,),
            ).fetchone()
        finally:
            conn.close()

        if row is None:
            return None, False

        rules, fetch_time = row
        # Check if cache is still fresh based on TTL
        return rules, (time.time() - fetch_time) < self.cache_ttl

    def _cache_rules(self, domain: str, content: str):
        """Store ``content`` as the robots.txt of ``domain``, stamped with now.

        The row is always rewritten, even when the content is unchanged:
        ``fetch_time`` records when the file was last downloaded, and leaving
        it untouched would keep an expired entry stale forever and force a
        download on every subsequent check.
        """
        hash_val = hashlib.md5(content.encode()).hexdigest()
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                conn.execute(
                    """INSERT OR REPLACE INTO robots_cache
                       (domain, rules, fetch_time, hash)
                       VALUES (?, ?, ?, ?)""",
                    (domain, content, int(time.time()), hash_val),
                )
        finally:
            conn.close()

    async def can_fetch(self, url: str, user_agent: str = "*") -> bool:
        """
        Check if URL can be fetched according to robots.txt rules.

        Args:
            url: The URL to check
            user_agent: User agent string to check against (default: "*")

        Returns:
            bool: True if allowed, False if disallowed by robots.txt.
            URLs without a host, robots.txt responses other than HTTP 200,
            download errors and empty rule files all result in True.
        """
        # Handle empty/invalid URLs
        try:
            parsed = urlparse(url)
            domain = parsed.netloc
        except Exception:
            return True
        if not domain:
            return True

        # Fast path - check cache first
        rules, is_fresh = self._get_cached_rules(domain)

        # If rules not found or stale, fetch new ones
        if not is_fresh:
            # Ensure we use the same scheme as the input URL
            scheme = parsed.scheme or "http"
            robots_url = f"{scheme}://{domain}/robots.txt"
            try:
                timeout = aiohttp.ClientTimeout(total=self.FETCH_TIMEOUT)
                async with aiohttp.ClientSession(timeout=timeout) as session:
                    async with session.get(robots_url) as response:
                        if response.status != 200:
                            return True
                        rules = await response.text()
                        self._cache_rules(domain, rules)
            except Exception:
                # On any error (timeout, connection failed, etc), allow access.
                # `Exception` rather than a bare except, so task cancellation
                # and interpreter shutdown still propagate.
                return True

        if not rules:
            return True

        # Create parser for this check
        parser = RobotFileParser()
        parser.parse(rules.splitlines())

        # If parser can't read rules, allow access
        if not parser.mtime():
            return True

        return parser.can_fetch(user_agent, url)

    def clear_cache(self):
        """Clear all cached robots.txt entries"""
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                conn.execute("DELETE FROM robots_cache")
        finally:
            conn.close()

    def clear_expired(self):
        """Remove only expired entries from cache"""
        expire_time = int(time.time()) - self.cache_ttl
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                conn.execute("DELETE FROM robots_cache WHERE fetch_time < ?", (expire_time,))
        finally:
            conn.close()
class InvalidCSSSelectorError(Exception):
    """Exception type for invalid-CSS-selector errors (raise sites are outside this view)."""

View File

@@ -8,6 +8,7 @@ Crawl4AI offers multiple power-user features that go beyond simple crawling. Thi
3. **Handling SSL Certificates**
4. **Custom Headers**
5. **Session Persistence & Local Storage**
6. **Robots.txt Compliance**
> **Prerequisites**
> - You have a basic grasp of [AsyncWebCrawler Basics](../core/simple-crawling.md)
@@ -251,6 +252,42 @@ You can sign in once, export the browser context, and reuse it later—without r
---
## 6. Robots.txt Compliance
Crawl4AI supports respecting robots.txt rules with efficient caching:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
async def main():
# Enable robots.txt checking in config
config = CrawlerRunConfig(
check_robots_txt=True # Will check and respect robots.txt rules
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
"https://example.com",
config=config
)
if not result.success and result.status_code == 403:
print("Access denied by robots.txt")
if __name__ == "__main__":
asyncio.run(main())
```
**Key Points**
- Robots.txt files are cached locally for efficiency
- Cache is stored in `~/.crawl4ai/robots/robots_cache.db`
- Cache has a default TTL of 7 days
- If robots.txt can't be fetched, crawling is allowed
- Returns 403 status code if URL is disallowed
---
## Putting It All Together
Here's a snippet that combines multiple “advanced” features (proxy, PDF, screenshot, SSL, custom headers, and session reuse) into one run. Normally, you'd tailor each setting to your project's needs.
@@ -321,6 +358,7 @@ You've now explored several **advanced** features:
- **SSL Certificate** retrieval & exporting
- **Custom Headers** for language or specialized requests
- **Session Persistence** via storage state
- **Robots.txt Compliance**
With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline.

View File

@@ -189,6 +189,44 @@ async def crawl_with_semaphore(urls):
return results
```
### 4.4 Robots.txt Consideration
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
async def main():
urls = [
"https://example1.com",
"https://example2.com",
"https://example3.com"
]
config = CrawlerRunConfig(
cache_mode=CacheMode.ENABLED,
check_robots_txt=True, # Will respect robots.txt for each URL
semaphore_count=3 # Max concurrent requests
)
async with AsyncWebCrawler() as crawler:
async for result in crawler.arun_many(urls, config=config):
if result.success:
print(f"Successfully crawled {result.url}")
elif result.status_code == 403 and "robots.txt" in result.error_message:
print(f"Skipped {result.url} - blocked by robots.txt")
else:
print(f"Failed to crawl {result.url}: {result.error_message}")
if __name__ == "__main__":
asyncio.run(main())
```
**Key Points**:
- When `check_robots_txt=True`, each URL's robots.txt is checked before crawling
- Robots.txt files are cached for efficiency
- URLs disallowed by robots.txt return a failed result with a 403 status code (if robots.txt itself cannot be fetched, crawling is allowed)
- Dispatcher handles robots.txt checks automatically for each URL
## 5. Dispatch Results
Each crawl result includes dispatch information:

View File

@@ -22,6 +22,7 @@ async def main():
run_config = CrawlerRunConfig(
verbose=True, # Detailed logging
cache_mode=CacheMode.ENABLED, # Use normal read/write cache
check_robots_txt=True, # Respect robots.txt rules
# ... other parameters
)
@@ -30,8 +31,10 @@ async def main():
url="https://example.com",
config=run_config
)
print(result.cleaned_html[:500])
# Check if blocked by robots.txt
if not result.success and result.status_code == 403:
print(f"Error: {result.error_message}")
```
**Key Fields**:
@@ -226,6 +229,7 @@ async def main():
# Core
verbose=True,
cache_mode=CacheMode.ENABLED,
check_robots_txt=True, # Respect robots.txt rules
# Content
word_count_threshold=10,

View File

@@ -106,6 +106,7 @@ Use these for controlling whether you read or write from a local content cache.
| **`wait_for`** | `str or None` | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction. |
| **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. |
| **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. |
| **`check_robots_txt`** | `bool` (False) | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency. |
| **`mean_delay`** and **`max_range`** | `float` (0.1, 0.3) | If you call `arun_many()`, these define random delay intervals between crawls, helping avoid detection or rate limits. |
| **`semaphore_count`** | `int` (5) | Max concurrency for `arun_many()`. Increase if you have resources for parallel crawls. |
@@ -266,17 +267,21 @@ async def main():
if __name__ == "__main__":
asyncio.run(main())
## 2.4 Compliance & Ethics
| **Parameter** | **Type / Default** | **What It Does** |
|-----------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
| **`check_robots_txt`**| `bool` (False) | When True, checks and respects robots.txt rules before crawling. Uses efficient caching with SQLite backend. |
| **`user_agent`** | `str` (None) | User agent string to identify your crawler. Used for robots.txt checking when enabled. |
```python
run_config = CrawlerRunConfig(
check_robots_txt=True, # Enable robots.txt compliance
user_agent="MyBot/1.0" # Identify your crawler
)
```
**What's Happening**:
- **`text_mode=True`** avoids loading images and other heavy resources, speeding up the crawl.
- We disable caching (`cache_mode=CacheMode.BYPASS`) to always fetch fresh content.
- We only keep `main.article` content by specifying `css_selector="main.article"`.
- We exclude external links (`exclude_external_links=True`).
- We do a quick screenshot (`screenshot=True`) before finishing.
---
## 3. Putting It All Together
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.

View File

@@ -0,0 +1,159 @@
from crawl4ai.utils import RobotsParser
import asyncio
import aiohttp
from aiohttp import web
import tempfile
import shutil
import os, sys, time, json
async def _serve_robots_txt(handler):
    """Serve ``handler`` as ``/robots.txt`` on a free local port.

    RobotsParser always requests ``/robots.txt`` at the host root and caches
    per host:port, so every scenario needs its own server; handlers mounted
    under sub-paths of a single server would never be requested.

    Returns:
        ``(runner, base_url)``; the caller must ``await runner.cleanup()``.
    """
    app = web.Application()
    app.router.add_get('/robots.txt', handler)
    runner = web.AppRunner(app)
    await runner.setup()
    # Port 0 lets the OS choose an unused port instead of a fixed one that
    # may already be taken on the machine running the tests.
    site = web.TCPSite(runner, '127.0.0.1', 0)
    await site.start()
    port = runner.addresses[0][1]
    return runner, f"http://127.0.0.1:{port}"


async def test_robots_parser():
    """Exercise RobotsParser: setup, edge cases, rule evaluation and cache.

    Sections 2 and 5 contact https://www.example.com and only print their
    outcome; the assertions rely on local servers so they do not depend on
    external network access.
    """
    print("\n=== Testing RobotsParser ===\n")

    # Setup temporary directory for testing
    temp_dir = tempfile.mkdtemp()

    try:
        # 1. Basic setup test
        print("1. Testing basic initialization...")
        parser = RobotsParser(cache_dir=temp_dir)
        assert os.path.exists(parser.db_path), "Database file not created"
        print("✓ Basic initialization passed")

        # 2. Test common cases
        print("\n2. Testing common cases...")
        allowed = await parser.can_fetch("https://www.example.com", "MyBot/1.0")
        print(f"✓ Regular website fetch: {'allowed' if allowed else 'denied'}")

        # Test caching (timing is informational only: it depends on whether
        # the first request could reach the network; the deterministic cache
        # check is the hit counter in section 4)
        print("Testing cache...")
        start = time.time()
        await parser.can_fetch("https://www.example.com", "MyBot/1.0")
        duration = time.time() - start
        print(f"✓ Cached lookup took: {duration*1000:.2f}ms")

        # 3. Edge cases
        print("\n3. Testing edge cases...")

        # Empty URL
        result = await parser.can_fetch("", "MyBot/1.0")
        print(f"✓ Empty URL handled: {'allowed' if result else 'denied'}")
        assert result, "Empty URL should be allowed"

        # Invalid URL
        result = await parser.can_fetch("not_a_url", "MyBot/1.0")
        print(f"✓ Invalid URL handled: {'allowed' if result else 'denied'}")
        assert result, "URL without a host should be allowed"

        # URL without scheme
        result = await parser.can_fetch("example.com/page", "MyBot/1.0")
        print(f"✓ URL without scheme handled: {'allowed' if result else 'denied'}")
        assert result, "URL without scheme should be allowed"

        # 4. Test with local servers (one per robots.txt scenario)
        rules_hits = 0

        async def robots_txt(request):
            nonlocal rules_hits
            rules_hits += 1
            return web.Response(text="User-agent: *\nDisallow: /private/\nAllow: /public/\n")

        async def malformed_robots(request):
            return web.Response(text="<<<malformed>>>")

        async def timeout_robots(request):
            await asyncio.sleep(5)
            return web.Response(text="Should timeout")

        async def empty_robots(request):
            return web.Response(text="")

        async def giant_robots(request):
            return web.Response(text="User-agent: *\nDisallow: /\n" * 10000)

        runners = []

        async def serve(handler):
            runner, url = await _serve_robots_txt(handler)
            runners.append(runner)
            return url

        try:
            print("\n4. Testing robots.txt rules...")
            base_url = await serve(robots_txt)

            # Test public access
            result = await parser.can_fetch(f"{base_url}/public/page", "bot")
            print(f"Public access (/public/page): {'allowed' if result else 'denied'}")
            assert result, "Public path should be allowed"

            # Test private access
            result = await parser.can_fetch(f"{base_url}/private/secret", "bot")
            print(f"Private access (/private/secret): {'allowed' if result else 'denied'}")
            assert not result, "Private path should be denied"

            # The second lookup must have been answered from the cache
            assert rules_hits == 1, "robots.txt should be downloaded once and then cached"

            # Test malformed
            malformed_url = await serve(malformed_robots)
            result = await parser.can_fetch(f"{malformed_url}/page", "bot")
            print(f"✓ Malformed robots.txt handled: {'allowed' if result else 'denied'}")
            assert result, "Unparseable robots.txt should allow access"

            # Test timeout
            timeout_url = await serve(timeout_robots)
            start = time.time()
            result = await parser.can_fetch(f"{timeout_url}/page", "bot")
            duration = time.time() - start
            print(f"✓ Timeout handled (took {duration:.2f}s): {'allowed' if result else 'denied'}")
            assert duration < 3, "Timeout not working"
            assert result, "Timed-out robots.txt fetch should allow access"

            # Test empty
            empty_url = await serve(empty_robots)
            result = await parser.can_fetch(f"{empty_url}/page", "bot")
            print(f"✓ Empty robots.txt handled: {'allowed' if result else 'denied'}")
            assert result, "Empty robots.txt should allow access"

            # Test giant file
            giant_url = await serve(giant_robots)
            start = time.time()
            result = await parser.can_fetch(f"{giant_url}/page", "bot")
            duration = time.time() - start
            print(f"✓ Giant robots.txt handled (took {duration:.2f}s): {'allowed' if result else 'denied'}")
            assert not result, "Disallow: / should deny every path"

        finally:
            for runner in runners:
                await runner.cleanup()

        # 5. Cache manipulation
        print("\n5. Testing cache manipulation...")

        # Clear expired
        parser.clear_expired()
        print("✓ Clear expired entries completed")

        # Clear all
        parser.clear_cache()
        print("✓ Clear all cache completed")

        # Test with custom TTL
        custom_parser = RobotsParser(cache_dir=temp_dir, cache_ttl=1)  # 1 second TTL
        await custom_parser.can_fetch("https://www.example.com", "bot")
        print("✓ Custom TTL fetch completed")
        await asyncio.sleep(1.1)
        start = time.time()
        await custom_parser.can_fetch("https://www.example.com", "bot")
        print(f"✓ TTL expiry working (refetched after {time.time() - start:.2f}s)")

    finally:
        # Cleanup; ignore_errors so a lingering file handle cannot mask the
        # real test outcome with a cleanup exception
        shutil.rmtree(temp_dir, ignore_errors=True)
        print("\nTest cleanup completed")
async def main():
    """Run the RobotsParser test routine; print any failure, then re-raise it."""
    try:
        await test_robots_parser()
    except Exception as exc:
        failure_text = str(exc)
        print(f"Test failed: {failure_text}")
        raise


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,62 @@
import asyncio
from crawl4ai import *
async def test_real_websites():
    """Crawl a list of live sites with robots.txt checking enabled.

    For every URL the robots.txt verdict is compared with the expected one
    and the comparison is printed. Nothing is asserted, because the verdicts
    depend on live third-party robots.txt files and network reachability.
    """
    print("\n=== Testing Real Website Robots.txt Compliance ===\n")

    browser_config = BrowserConfig(headless=True, verbose=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Test cases with URLs: (url, expected to be allowed by robots.txt)
        test_cases = [
            # Public sites that should be allowed
            ("https://example.com", True),  # Simple public site
            ("https://httpbin.org/get", True),  # API endpoint

            # Sites with known strict robots.txt
            ("https://www.facebook.com/robots.txt", False),  # Social media
            ("https://www.google.com/search", False),  # Search pages

            # Edge cases
            ("https://api.github.com", True),  # API service
            ("https://raw.githubusercontent.com", True),  # Content delivery

            # Non-existent/error cases
            ("https://thisisnotarealwebsite.com", True),  # Non-existent domain
            ("https://localhost:12345", True),  # Invalid port
        ]

        mismatches = 0
        for url, expected in test_cases:
            print(f"\nTesting: {url}")
            try:
                config = CrawlerRunConfig(
                    cache_mode=CacheMode.BYPASS,
                    check_robots_txt=True,  # Enable robots.txt checking
                    verbose=True
                )

                result = await crawler.arun(url=url, config=config)

                # A robots.txt denial is reported as status 403 with an error
                # message naming robots.txt. Any other failure (DNS error,
                # refused connection) is a crawl problem, not a denial, so it
                # must not count as "denied" here.
                error_message = result.error_message or ""
                blocked = result.status_code == 403 and "robots.txt" in error_message
                allowed = not blocked

                print(f"Expected: {'allowed' if expected else 'denied'}")
                print(f"Actual: {'allowed' if allowed else 'denied'}")
                print(f"Status Code: {result.status_code}")
                if result.error_message:
                    print(f"Error: {result.error_message}")
                if allowed != expected:
                    mismatches += 1
                    print("Result: MISMATCH")
                else:
                    print("Result: OK")

                # Optional: Print robots.txt content if available
                if result.metadata and 'robots_txt' in result.metadata:
                    print(f"Robots.txt rules:\n{result.metadata['robots_txt']}")

            except Exception as e:
                print(f"Test failed with error: {str(e)}")

        print(f"\nMismatches: {mismatches} of {len(test_cases)}")
async def main():
    """Run the real-website robots.txt checks; print any failure, then re-raise it."""
    try:
        await test_real_websites()
    except Exception as exc:
        failure_text = str(exc)
        print(f"Test suite failed: {failure_text}")
        raise


if __name__ == "__main__":
    asyncio.run(main())