feat(robots): add robots.txt compliance support
Add support for checking and respecting robots.txt rules before crawling websites:
- Implement RobotsParser class with SQLite caching
- Add check_robots_txt parameter to CrawlerRunConfig
- Integrate robots.txt checking in AsyncWebCrawler
- Update documentation with robots.txt compliance examples
- Add tests for robot parser functionality

The cache uses WAL mode for better concurrency and has a default TTL of 7 days.
This commit is contained in:
5
.gitignore
vendored
5
.gitignore
vendored
@@ -227,4 +227,7 @@ tree.md
|
|||||||
.do
|
.do
|
||||||
/plans
|
/plans
|
||||||
.codeiumignore
|
.codeiumignore
|
||||||
todo/
|
todo/
|
||||||
|
|
||||||
|
# windsurf rules
|
||||||
|
.windsurfrules
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
### [Added] 2025-01-21
|
||||||
|
- Added robots.txt compliance support with efficient SQLite-based caching
|
||||||
|
- New `check_robots_txt` parameter in CrawlerRunConfig to enable robots.txt checking
|
||||||
|
- Documentation updates for robots.txt compliance features and examples
|
||||||
|
- Automated robots.txt checking integrated into AsyncWebCrawler with 403 status codes for blocked URLs
|
||||||
|
|
||||||
### [Added] 2025-01-20
|
### [Added] 2025-01-20
|
||||||
- Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request
|
- Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request
|
||||||
- Updated documentation with examples for using proxy configuration in crawl operations
|
- Updated documentation with examples for using proxy configuration in crawl operations
|
||||||
|
|||||||
@@ -372,6 +372,7 @@ class CrawlerRunConfig:
|
|||||||
# Optional Parameters
|
# Optional Parameters
|
||||||
stream (bool): If True, stream the page content as it is being loaded.
|
stream (bool): If True, stream the page content as it is being loaded.
|
||||||
url: str = None # This is not a compulsory parameter
|
url: str = None # This is not a compulsory parameter
|
||||||
|
check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -442,6 +443,7 @@ class CrawlerRunConfig:
|
|||||||
# Streaming Parameters
|
# Streaming Parameters
|
||||||
stream: bool = False,
|
stream: bool = False,
|
||||||
url: str = None,
|
url: str = None,
|
||||||
|
check_robots_txt: bool = False,
|
||||||
):
|
):
|
||||||
self.url = url
|
self.url = url
|
||||||
|
|
||||||
@@ -521,6 +523,9 @@ class CrawlerRunConfig:
|
|||||||
# Streaming Parameters
|
# Streaming Parameters
|
||||||
self.stream = stream
|
self.stream = stream
|
||||||
|
|
||||||
|
# Robots.txt Handling Parameters
|
||||||
|
self.check_robots_txt = check_robots_txt
|
||||||
|
|
||||||
# Validate type of extraction strategy and chunking strategy if they are provided
|
# Validate type of extraction strategy and chunking strategy if they are provided
|
||||||
if self.extraction_strategy is not None and not isinstance(
|
if self.extraction_strategy is not None and not isinstance(
|
||||||
self.extraction_strategy, ExtractionStrategy
|
self.extraction_strategy, ExtractionStrategy
|
||||||
@@ -617,6 +622,7 @@ class CrawlerRunConfig:
|
|||||||
# Streaming Parameters
|
# Streaming Parameters
|
||||||
stream=kwargs.get("stream", False),
|
stream=kwargs.get("stream", False),
|
||||||
url=kwargs.get("url"),
|
url=kwargs.get("url"),
|
||||||
|
check_robots_txt=kwargs.get("check_robots_txt", False),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create a funciton returns dict of the object
|
# Create a funciton returns dict of the object
|
||||||
@@ -679,6 +685,7 @@ class CrawlerRunConfig:
|
|||||||
"log_console": self.log_console,
|
"log_console": self.log_console,
|
||||||
"stream": self.stream,
|
"stream": self.stream,
|
||||||
"url": self.url,
|
"url": self.url,
|
||||||
|
"check_robots_txt": self.check_robots_txt,
|
||||||
}
|
}
|
||||||
|
|
||||||
def clone(self, **kwargs):
|
def clone(self, **kwargs):
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ from .utils import (
|
|||||||
fast_format_html,
|
fast_format_html,
|
||||||
create_box_message,
|
create_box_message,
|
||||||
get_error_context,
|
get_error_context,
|
||||||
|
RobotsParser,
|
||||||
)
|
)
|
||||||
|
|
||||||
from typing import Union, AsyncGenerator, List, TypeVar
|
from typing import Union, AsyncGenerator, List, TypeVar
|
||||||
@@ -203,6 +204,9 @@ class AsyncWebCrawler:
|
|||||||
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
||||||
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
||||||
|
|
||||||
|
# Initialize robots parser
|
||||||
|
self.robots_parser = RobotsParser()
|
||||||
|
|
||||||
self.ready = False
|
self.ready = False
|
||||||
|
|
||||||
async def start(self):
|
async def start(self):
|
||||||
@@ -414,6 +418,18 @@ class AsyncWebCrawler:
|
|||||||
if user_agent:
|
if user_agent:
|
||||||
self.crawler_strategy.update_user_agent(user_agent)
|
self.crawler_strategy.update_user_agent(user_agent)
|
||||||
|
|
||||||
|
# Check robots.txt if enabled
|
||||||
|
if config and config.check_robots_txt:
|
||||||
|
if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
|
||||||
|
return CrawlResult(
|
||||||
|
url=url,
|
||||||
|
html="",
|
||||||
|
success=False,
|
||||||
|
status_code=403,
|
||||||
|
error_message="Access denied by robots.txt",
|
||||||
|
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
|
||||||
|
)
|
||||||
|
|
||||||
# Pass config to crawl method
|
# Pass config to crawl method
|
||||||
async_response = await self.crawler_strategy.crawl(
|
async_response = await self.crawler_strategy.crawl(
|
||||||
url,
|
url,
|
||||||
|
|||||||
@@ -23,6 +23,138 @@ import pstats
|
|||||||
from functools import wraps
|
from functools import wraps
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
import hashlib
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
from urllib.robotparser import RobotFileParser
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
class RobotsParser:
    """Decide whether a URL may be crawled according to the site's robots.txt.

    Fetched robots.txt bodies are cached per domain (the URL's netloc) in a
    SQLite database, so repeated checks against the same host do not touch the
    network until the cached entry is older than ``cache_ttl`` seconds.

    The checker is deliberately permissive: whenever the URL has no host,
    robots.txt cannot be fetched, or the file is empty, access is allowed.
    """

    # Default 7 days cache TTL
    CACHE_TTL = 7 * 24 * 60 * 60
    # Seconds to wait for a robots.txt response before giving up
    FETCH_TIMEOUT = 2

    def __init__(self, cache_dir=None, cache_ttl=None):
        """Create the cache directory/database if needed.

        Args:
            cache_dir: Directory holding ``robots_cache.db``. Defaults to
                ``<home folder>/.crawl4ai/robots``.
            cache_ttl: Seconds a cached robots.txt stays fresh. ``None`` selects
                ``CACHE_TTL``; ``0`` means every check refetches.
        """
        self.cache_dir = cache_dir or os.path.join(get_home_folder(), ".crawl4ai", "robots")
        # Compare against None so that an explicit TTL of 0 is honoured
        # instead of being silently replaced by the 7-day default.
        self.cache_ttl = self.CACHE_TTL if cache_ttl is None else cache_ttl
        os.makedirs(self.cache_dir, exist_ok=True)
        self.db_path = os.path.join(self.cache_dir, "robots_cache.db")
        self._init_db()

    def _query(self, sql: str, params: tuple = ()):
        """Run one statement on a short-lived connection and return its first row.

        ``with conn`` only commits or rolls back; it does not close the
        connection, so the handle is closed explicitly to avoid leaking it.
        Returns ``None`` for statements that produce no rows.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                return conn.execute(sql, params).fetchone()
        finally:
            conn.close()

    def _init_db(self):
        """Create the cache table (and index) if they do not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                # Use WAL mode for better concurrency and performance
                conn.execute("PRAGMA journal_mode=WAL")
                conn.execute("""
                    CREATE TABLE IF NOT EXISTS robots_cache (
                        domain TEXT PRIMARY KEY,
                        rules TEXT NOT NULL,
                        fetch_time INTEGER NOT NULL,
                        hash TEXT NOT NULL
                    )
                """)
                conn.execute("CREATE INDEX IF NOT EXISTS idx_domain ON robots_cache(domain)")
        finally:
            conn.close()

    def _get_cached_rules(self, domain: str) -> "tuple[str | None, bool]":
        """Look up cached rules for ``domain``.

        Returns:
            ``(rules, is_fresh)``. ``rules`` is ``None`` when nothing is cached;
            ``is_fresh`` is True only if the entry is younger than ``cache_ttl``.
        """
        row = self._query(
            "SELECT rules, fetch_time, hash FROM robots_cache WHERE domain = ?",
            (domain,),
        )
        if not row:
            return None, False

        rules, fetch_time, _ = row
        # Check if cache is still fresh based on TTL
        return rules, (time.time() - fetch_time) < self.cache_ttl

    def _cache_rules(self, domain: str, content: str):
        """Store robots.txt ``content`` for ``domain`` with the current timestamp.

        The row is always rewritten, even when the content hash is unchanged:
        skipping the write would leave ``fetch_time`` stale, so an unchanged
        robots.txt would be refetched on every call once its TTL had expired.
        """
        hash_val = hashlib.md5(content.encode()).hexdigest()
        self._query(
            """INSERT OR REPLACE INTO robots_cache
               (domain, rules, fetch_time, hash)
               VALUES (?, ?, ?, ?)""",
            (domain, content, int(time.time()), hash_val),
        )

    async def _fetch_rules(self, robots_url: str):
        """Download ``robots_url``; return its text, or None if the status is not 200."""
        timeout = aiohttp.ClientTimeout(total=self.FETCH_TIMEOUT)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(robots_url) as response:
                if response.status != 200:
                    return None
                return await response.text()

    async def can_fetch(self, url: str, user_agent: str = "*") -> bool:
        """
        Check if URL can be fetched according to robots.txt rules.

        Args:
            url: The URL to check
            user_agent: User agent string to check against (default: "*").
                A falsy value (e.g. None) is treated as "*".

        Returns:
            bool: True if allowed, False if disallowed by robots.txt
        """
        # Handle empty/invalid URLs
        try:
            parsed = urlparse(url)
            domain = parsed.netloc
        except (ValueError, TypeError, AttributeError):
            return True
        if not domain:
            return True

        # Fast path - check cache first
        rules, is_fresh = self._get_cached_rules(domain)

        # If rules not found or stale, fetch new ones
        if not is_fresh:
            # Ensure we use the same scheme as the input URL
            scheme = parsed.scheme or "http"
            try:
                rules = await self._fetch_rules(f"{scheme}://{domain}/robots.txt")
                if rules is None:
                    return True
                self._cache_rules(domain, rules)
            except Exception:
                # Best effort: on any ordinary error (timeout, connection
                # failure, decode or cache error) allow access. Catching
                # Exception rather than everything lets task cancellation
                # and KeyboardInterrupt propagate.
                return True

        if not rules:
            return True

        # Create parser for this check
        parser = RobotFileParser()
        parser.parse(rules.splitlines())

        # If parser can't read rules, allow access
        if not parser.mtime():
            return True

        return parser.can_fetch(user_agent or "*", url)

    def clear_cache(self):
        """Clear all cached robots.txt entries"""
        self._query("DELETE FROM robots_cache")

    def clear_expired(self):
        """Remove only expired entries from cache"""
        expire_time = int(time.time()) - self.cache_ttl
        self._query("DELETE FROM robots_cache WHERE fetch_time < ?", (expire_time,))
|
||||||
|
|
||||||
|
|
||||||
class InvalidCSSSelectorError(Exception):
|
class InvalidCSSSelectorError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ Crawl4AI offers multiple power-user features that go beyond simple crawling. Thi
|
|||||||
3. **Handling SSL Certificates**
|
3. **Handling SSL Certificates**
|
||||||
4. **Custom Headers**
|
4. **Custom Headers**
|
||||||
5. **Session Persistence & Local Storage**
|
5. **Session Persistence & Local Storage**
|
||||||
|
6. **Robots.txt Compliance**
|
||||||
|
|
||||||
> **Prerequisites**
|
> **Prerequisites**
|
||||||
> - You have a basic grasp of [AsyncWebCrawler Basics](../core/simple-crawling.md)
|
> - You have a basic grasp of [AsyncWebCrawler Basics](../core/simple-crawling.md)
|
||||||
@@ -251,6 +252,42 @@ You can sign in once, export the browser context, and reuse it later—without r
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## 6. Robots.txt Compliance
|
||||||
|
|
||||||
|
Crawl4AI supports respecting robots.txt rules with efficient caching:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
# Enable robots.txt checking in config
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
check_robots_txt=True # Will check and respect robots.txt rules
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
"https://example.com",
|
||||||
|
config=config
|
||||||
|
)
|
||||||
|
|
||||||
|
if not result.success and result.status_code == 403:
|
||||||
|
print("Access denied by robots.txt")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key Points**
|
||||||
|
- Robots.txt files are cached locally for efficiency
|
||||||
|
- Cache is stored in `~/.crawl4ai/robots/robots_cache.db`
|
||||||
|
- Cache has a default TTL of 7 days
|
||||||
|
- If robots.txt can't be fetched, crawling is allowed
|
||||||
|
- Returns 403 status code if URL is disallowed
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Putting It All Together
|
## Putting It All Together
|
||||||
|
|
||||||
Here’s a snippet that combines multiple “advanced” features (proxy, PDF, screenshot, SSL, custom headers, and session reuse) into one run. Normally, you’d tailor each setting to your project’s needs.
|
Here’s a snippet that combines multiple “advanced” features (proxy, PDF, screenshot, SSL, custom headers, and session reuse) into one run. Normally, you’d tailor each setting to your project’s needs.
|
||||||
@@ -321,6 +358,7 @@ You’ve now explored several **advanced** features:
|
|||||||
- **SSL Certificate** retrieval & exporting
|
- **SSL Certificate** retrieval & exporting
|
||||||
- **Custom Headers** for language or specialized requests
|
- **Custom Headers** for language or specialized requests
|
||||||
- **Session Persistence** via storage state
|
- **Session Persistence** via storage state
|
||||||
|
- **Robots.txt Compliance**
|
||||||
|
|
||||||
With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline.
|
With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline.
|
||||||
|
|
||||||
|
|||||||
@@ -189,6 +189,44 @@ async def crawl_with_semaphore(urls):
|
|||||||
return results
|
return results
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### 4.4 Robots.txt Consideration
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
urls = [
|
||||||
|
"https://example1.com",
|
||||||
|
"https://example2.com",
|
||||||
|
"https://example3.com"
|
||||||
|
]
|
||||||
|
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.ENABLED,
|
||||||
|
check_robots_txt=True, # Will respect robots.txt for each URL
|
||||||
|
semaphore_count=3 # Max concurrent requests
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
async for result in crawler.arun_many(urls, config=config):
|
||||||
|
if result.success:
|
||||||
|
print(f"Successfully crawled {result.url}")
|
||||||
|
elif result.status_code == 403 and "robots.txt" in result.error_message:
|
||||||
|
print(f"Skipped {result.url} - blocked by robots.txt")
|
||||||
|
else:
|
||||||
|
print(f"Failed to crawl {result.url}: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key Points**:
|
||||||
|
- When `check_robots_txt=True`, each URL's robots.txt is checked before crawling
|
||||||
|
- Robots.txt files are cached for efficiency
|
||||||
|
- Failed robots.txt checks return 403 status code
|
||||||
|
- Dispatcher handles robots.txt checks automatically for each URL
|
||||||
|
|
||||||
## 5. Dispatch Results
|
## 5. Dispatch Results
|
||||||
|
|
||||||
Each crawl result includes dispatch information:
|
Each crawl result includes dispatch information:
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ async def main():
|
|||||||
run_config = CrawlerRunConfig(
|
run_config = CrawlerRunConfig(
|
||||||
verbose=True, # Detailed logging
|
verbose=True, # Detailed logging
|
||||||
cache_mode=CacheMode.ENABLED, # Use normal read/write cache
|
cache_mode=CacheMode.ENABLED, # Use normal read/write cache
|
||||||
|
check_robots_txt=True, # Respect robots.txt rules
|
||||||
# ... other parameters
|
# ... other parameters
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -30,8 +31,10 @@ async def main():
|
|||||||
url="https://example.com",
|
url="https://example.com",
|
||||||
config=run_config
|
config=run_config
|
||||||
)
|
)
|
||||||
print(result.cleaned_html[:500])
|
|
||||||
|
# Check if blocked by robots.txt
|
||||||
|
if not result.success and result.status_code == 403:
|
||||||
|
print(f"Error: {result.error_message}")
|
||||||
```
|
```
|
||||||
|
|
||||||
**Key Fields**:
|
**Key Fields**:
|
||||||
@@ -226,6 +229,7 @@ async def main():
|
|||||||
# Core
|
# Core
|
||||||
verbose=True,
|
verbose=True,
|
||||||
cache_mode=CacheMode.ENABLED,
|
cache_mode=CacheMode.ENABLED,
|
||||||
|
check_robots_txt=True, # Respect robots.txt rules
|
||||||
|
|
||||||
# Content
|
# Content
|
||||||
word_count_threshold=10,
|
word_count_threshold=10,
|
||||||
|
|||||||
@@ -106,6 +106,7 @@ Use these for controlling whether you read or write from a local content cache.
|
|||||||
| **`wait_for`** | `str or None` | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction. |
|
| **`wait_for`** | `str or None` | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction. |
|
||||||
| **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. |
|
| **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. |
|
||||||
| **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. |
|
| **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. |
|
||||||
|
| **`check_robots_txt`** | `bool` (False) | Whether to check and respect robots.txt rules before crawling. If True, caches robots.txt for efficiency. |
|
||||||
| **`mean_delay`** and **`max_range`** | `float` (0.1, 0.3) | If you call `arun_many()`, these define random delay intervals between crawls, helping avoid detection or rate limits. |
|
| **`mean_delay`** and **`max_range`** | `float` (0.1, 0.3) | If you call `arun_many()`, these define random delay intervals between crawls, helping avoid detection or rate limits. |
|
||||||
| **`semaphore_count`** | `int` (5) | Max concurrency for `arun_many()`. Increase if you have resources for parallel crawls. |
|
| **`semaphore_count`** | `int` (5) | Max concurrency for `arun_many()`. Increase if you have resources for parallel crawls. |
|
||||||
|
|
||||||
@@ -266,17 +267,21 @@ async def main():
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|
||||||
|
## 2.4 Compliance & Ethics
|
||||||
|
|
||||||
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|
|-----------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`check_robots_txt`**| `bool` (False) | When True, checks and respects robots.txt rules before crawling. Uses efficient caching with SQLite backend. |
|
||||||
|
| **`user_agent`** | `str` (None) | User agent string to identify your crawler. Used for robots.txt checking when enabled. |
|
||||||
|
|
||||||
|
```python
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
check_robots_txt=True, # Enable robots.txt compliance
|
||||||
|
user_agent="MyBot/1.0" # Identify your crawler
|
||||||
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
**What’s Happening**:
|
|
||||||
- **`text_mode=True`** avoids loading images and other heavy resources, speeding up the crawl.
|
|
||||||
- We disable caching (`cache_mode=CacheMode.BYPASS`) to always fetch fresh content.
|
|
||||||
- We only keep `main.article` content by specifying `css_selector="main.article"`.
|
|
||||||
- We exclude external links (`exclude_external_links=True`).
|
|
||||||
- We do a quick screenshot (`screenshot=True`) before finishing.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 3. Putting It All Together
|
## 3. Putting It All Together
|
||||||
|
|
||||||
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
|
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
|
||||||
|
|||||||
159
tests/20241401/test_robot_parser.py
Normal file
159
tests/20241401/test_robot_parser.py
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
from crawl4ai.utils import RobotsParser
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
from aiohttp import web
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
import os, sys, time, json
|
||||||
|
|
||||||
|
|
||||||
|
async def test_robots_parser():
    """Exercise RobotsParser end to end: init, caching, edge cases, rules, TTL.

    Each robots.txt scenario is served from its own local port. RobotsParser
    always requests ``/robots.txt`` at the host root and caches per host:port,
    so scenario handlers mounted under a path prefix on one shared server
    would never be requested.

    Requires network access (example.com) and free local ports 8080-8084.
    """
    print("\n=== Testing RobotsParser ===\n")

    # Setup temporary directory for testing
    temp_dir = tempfile.mkdtemp()
    try:
        # 1. Basic setup test
        print("1. Testing basic initialization...")
        parser = RobotsParser(cache_dir=temp_dir)
        assert os.path.exists(parser.db_path), "Database file not created"
        print("✓ Basic initialization passed")

        # 2. Test common cases
        print("\n2. Testing common cases...")
        allowed = await parser.can_fetch("https://www.example.com", "MyBot/1.0")
        print(f"✓ Regular website fetch: {'allowed' if allowed else 'denied'}")

        # Test caching
        print("Testing cache...")
        start = time.time()
        await parser.can_fetch("https://www.example.com", "MyBot/1.0")
        duration = time.time() - start
        print(f"✓ Cached lookup took: {duration*1000:.2f}ms")
        assert duration < 0.03, "Cache lookup too slow"

        # 3. Edge cases
        print("\n3. Testing edge cases...")

        # Empty URL
        result = await parser.can_fetch("", "MyBot/1.0")
        print(f"✓ Empty URL handled: {'allowed' if result else 'denied'}")

        # Invalid URL
        result = await parser.can_fetch("not_a_url", "MyBot/1.0")
        print(f"✓ Invalid URL handled: {'allowed' if result else 'denied'}")

        # URL without scheme
        result = await parser.can_fetch("example.com/page", "MyBot/1.0")
        print(f"✓ URL without scheme handled: {'allowed' if result else 'denied'}")

        # 4. Test with local servers (one per robots.txt scenario)
        async def robots_txt(request):
            return web.Response(text="User-agent: *\nDisallow: /private/\nAllow: /public/\n")

        async def malformed_robots(request):
            return web.Response(text="<<<malformed>>>")

        async def timeout_robots(request):
            await asyncio.sleep(5)
            return web.Response(text="Should timeout")

        async def empty_robots(request):
            return web.Response(text="")

        async def giant_robots(request):
            return web.Response(text="User-agent: *\nDisallow: /\n" * 10000)

        async def start_test_server(handler, port):
            """Serve ``handler`` as /robots.txt on localhost:port."""
            app = web.Application()
            app.router.add_get('/robots.txt', handler)
            runner = web.AppRunner(app)
            await runner.setup()
            site = web.TCPSite(runner, 'localhost', port)
            await site.start()
            return runner

        scenarios = {
            "rules": (8080, robots_txt),
            "malformed": (8081, malformed_robots),
            "timeout": (8082, timeout_robots),
            "empty": (8083, empty_robots),
            "giant": (8084, giant_robots),
        }

        runners = []
        try:
            for port, handler in scenarios.values():
                runners.append(await start_test_server(handler, port))

            print("\n4. Testing robots.txt rules...")
            base_url = "http://localhost:8080"

            # Test public access
            result = await parser.can_fetch(f"{base_url}/public/page", "bot")
            print(f"Public access (/public/page): {'allowed' if result else 'denied'}")
            assert result, "Public path should be allowed"

            # Test private access
            result = await parser.can_fetch(f"{base_url}/private/secret", "bot")
            print(f"Private access (/private/secret): {'allowed' if result else 'denied'}")
            assert not result, "Private path should be denied"

            # Test malformed
            result = await parser.can_fetch("http://localhost:8081/page", "bot")
            print(f"✓ Malformed robots.txt handled: {'allowed' if result else 'denied'}")
            assert result, "Malformed robots.txt should not block access"

            # Test timeout
            start = time.time()
            result = await parser.can_fetch("http://localhost:8082/page", "bot")
            duration = time.time() - start
            print(f"✓ Timeout handled (took {duration:.2f}s): {'allowed' if result else 'denied'}")
            assert duration < 3, "Timeout not working"
            assert result, "Timed-out robots.txt fetch should allow access"

            # Test empty
            result = await parser.can_fetch("http://localhost:8083/page", "bot")
            print(f"✓ Empty robots.txt handled: {'allowed' if result else 'denied'}")
            assert result, "Empty robots.txt should allow access"

            # Test giant file
            start = time.time()
            result = await parser.can_fetch("http://localhost:8084/page", "bot")
            duration = time.time() - start
            print(f"✓ Giant robots.txt handled (took {duration:.2f}s): {'allowed' if result else 'denied'}")
            assert not result, "Giant robots.txt disallows everything"

        finally:
            for runner in runners:
                await runner.cleanup()

        # 5. Cache manipulation
        print("\n5. Testing cache manipulation...")

        # Clear expired
        parser.clear_expired()
        print("✓ Clear expired entries completed")

        # Clear all
        parser.clear_cache()
        print("✓ Clear all cache completed")

        # Test with custom TTL
        custom_parser = RobotsParser(cache_dir=temp_dir, cache_ttl=1)  # 1 second TTL
        await custom_parser.can_fetch("https://www.example.com", "bot")
        print("✓ Custom TTL fetch completed")
        await asyncio.sleep(1.1)
        start = time.time()
        await custom_parser.can_fetch("https://www.example.com", "bot")
        print(f"✓ TTL expiry working (refetched after {time.time() - start:.2f}s)")

    finally:
        # Cleanup
        shutil.rmtree(temp_dir)
        print("\nTest cleanup completed")
|
||||||
|
|
||||||
|
async def main():
    """Run the RobotsParser test, reporting any failure before re-raising it."""
    try:
        await test_robots_parser()
    except Exception as exc:
        # Echo the reason first so it is visible above the traceback.
        print(f"Test failed: {exc}")
        raise
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
62
tests/20241401/tets_robot.py
Normal file
62
tests/20241401/tets_robot.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
import asyncio
|
||||||
|
from crawl4ai import *
|
||||||
|
|
||||||
|
async def test_real_websites():
    """Crawl real sites with robots.txt checking on and verify allow/deny.

    A URL counts as denied only when the crawler reports the robots.txt block
    itself (failed result, status 403, error message mentioning robots.txt).
    An ordinary crawl failure such as an unreachable host is not a denial.

    Raises:
        AssertionError: if any URL's outcome differs from its expectation or
            the crawl raised.
    """
    print("\n=== Testing Real Website Robots.txt Compliance ===\n")

    browser_config = BrowserConfig(headless=True, verbose=True)
    mismatches = []
    async with AsyncWebCrawler(config=browser_config) as crawler:

        # Test cases with URLs
        test_cases = [
            # Public sites that should be allowed
            ("https://example.com", True),                 # Simple public site
            ("https://httpbin.org/get", True),            # API endpoint

            # Sites with known strict robots.txt
            ("https://www.facebook.com/robots.txt", False),  # Social media
            ("https://www.google.com/search", False),        # Search pages

            # Edge cases
            ("https://api.github.com", True),             # API service
            ("https://raw.githubusercontent.com", True),  # Content delivery

            # Non-existent/error cases
            ("https://thisisnotarealwebsite.com", True),  # Non-existent domain
            ("https://localhost:12345", True),            # Invalid port
        ]

        for url, expected in test_cases:
            print(f"\nTesting: {url}")
            try:
                config = CrawlerRunConfig(
                    cache_mode=CacheMode.BYPASS,
                    check_robots_txt=True,  # Enable robots.txt checking
                    verbose=True
                )

                result = await crawler.arun(url=url, config=config)
                # Only a robots.txt block counts as "denied"; other crawl
                # failures must not be mistaken for a robots decision.
                blocked = (
                    not result.success
                    and result.status_code == 403
                    and "robots.txt" in (result.error_message or "")
                )
                allowed = not blocked

                print(f"Expected: {'allowed' if expected else 'denied'}")
                print(f"Actual: {'allowed' if allowed else 'denied'}")
                print(f"Status Code: {result.status_code}")
                if result.error_message:
                    print(f"Error: {result.error_message}")

                # Optional: Print robots.txt content if available
                if result.metadata and 'robots_txt' in result.metadata:
                    print(f"Robots.txt rules:\n{result.metadata['robots_txt']}")

                if allowed != expected:
                    mismatches.append(
                        f"{url}: expected {'allowed' if expected else 'denied'}, "
                        f"got {'allowed' if allowed else 'denied'}"
                    )

            except Exception as e:
                print(f"Test failed with error: {str(e)}")
                mismatches.append(f"{url}: raised {e!r}")

    if mismatches:
        raise AssertionError("robots.txt expectations not met:\n" + "\n".join(mismatches))
|
||||||
|
|
||||||
|
async def main():
    """Run the real-website robots.txt suite, reporting any failure before re-raising it."""
    try:
        await test_real_websites()
    except Exception as exc:
        # Echo the reason first so it is visible above the traceback.
        print(f"Test suite failed: {exc}")
        raise
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
Reference in New Issue
Block a user