Enhanced BFS Strategy: Improved monitoring, resource management & configuration
- Added CrawlStats for comprehensive crawl monitoring - Implemented proper resource cleanup with shutdown mechanism - Enhanced URL processing with better validation and politeness controls - Added configuration options (max_concurrent, timeout, external_links) - Improved error handling with retry logic - Added domain-specific queues for better performance - Created comprehensive documentation Note: URL normalization needs review - potential duplicate processing with core crawler for internal links. Currently commented out pending further investigation of edge cases.
This commit is contained in:
@@ -76,6 +76,7 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
# Crawl control
|
# Crawl control
|
||||||
self.stats = CrawlStats(start_time=datetime.now())
|
self.stats = CrawlStats(start_time=datetime.now())
|
||||||
self._cancel_event = asyncio.Event()
|
self._cancel_event = asyncio.Event()
|
||||||
|
self.process_external_links = False
|
||||||
|
|
||||||
# Rate limiting and politeness
|
# Rate limiting and politeness
|
||||||
self.rate_limiter = AsyncLimiter(1, 1)
|
self.rate_limiter = AsyncLimiter(1, 1)
|
||||||
@@ -84,7 +85,14 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
self.domain_queues: Dict[str, asyncio.Queue] = defaultdict(asyncio.Queue)
|
self.domain_queues: Dict[str, asyncio.Queue] = defaultdict(asyncio.Queue)
|
||||||
|
|
||||||
async def can_process_url(self, url: str) -> bool:
|
async def can_process_url(self, url: str) -> bool:
|
||||||
"""Check if URL can be processed based on robots.txt and filters"""
|
"""Check if URL can be processed based on robots.txt and filters
|
||||||
|
This is our gatekeeper method that determines if a URL should be processed. It:
|
||||||
|
- Validates URL format using the validators library
|
||||||
|
- Checks robots.txt permissions for the domain
|
||||||
|
- Applies custom filters from the filter chain
|
||||||
|
- Updates statistics for blocked URLs
|
||||||
|
- Returns False early if any check fails
|
||||||
|
"""
|
||||||
if not validators.url(url):
|
if not validators.url(url):
|
||||||
self.logger.warning(f"Invalid URL: {url}")
|
self.logger.warning(f"Invalid URL: {url}")
|
||||||
return False
|
return False
|
||||||
@@ -98,7 +106,13 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
return self.filter_chain.apply(url)
|
return self.filter_chain.apply(url)
|
||||||
|
|
||||||
async def _get_robot_parser(self, url: str) -> Optional[RobotFileParser]:
|
async def _get_robot_parser(self, url: str) -> Optional[RobotFileParser]:
|
||||||
"""Get or create robots.txt parser for domain"""
|
"""Get or create robots.txt parser for domain.
|
||||||
|
This is our robots.txt manager that:
|
||||||
|
- Uses domain-level caching of robot parsers
|
||||||
|
- Creates and caches new parsers as needed
|
||||||
|
- Handles failed robots.txt fetches gracefully
|
||||||
|
- Returns None if robots.txt can't be fetched, allowing crawling to proceed
|
||||||
|
"""
|
||||||
domain = urlparse(url).netloc
|
domain = urlparse(url).netloc
|
||||||
if domain not in self.robot_parsers:
|
if domain not in self.robot_parsers:
|
||||||
parser = RobotFileParser()
|
parser = RobotFileParser()
|
||||||
@@ -136,7 +150,17 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
visited: Set[str],
|
visited: Set[str],
|
||||||
depths: Dict[str, int]
|
depths: Dict[str, int]
|
||||||
) -> Optional[CrawlResult]:
|
) -> Optional[CrawlResult]:
|
||||||
"""Process a single URL and extract links"""
|
"""Process a single URL and extract links.
|
||||||
|
This is our main URL processing workhorse that:
|
||||||
|
- Checks for cancellation
|
||||||
|
- Validates URLs through can_process_url
|
||||||
|
- Implements politeness delays per domain
|
||||||
|
- Applies rate limiting
|
||||||
|
- Handles crawling with retries
|
||||||
|
- Updates various statistics
|
||||||
|
- Processes extracted links
|
||||||
|
- Returns the crawl result or None on failure
|
||||||
|
"""
|
||||||
|
|
||||||
if self._cancel_event.is_set():
|
if self._cancel_event.is_set():
|
||||||
return None
|
return None
|
||||||
@@ -176,11 +200,24 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
visited: Set[str],
|
visited: Set[str],
|
||||||
depths: Dict[str, int]
|
depths: Dict[str, int]
|
||||||
):
|
):
|
||||||
"""Process extracted links from crawl result"""
|
"""Process extracted links from crawl result.
|
||||||
for link_type in ["internal", "external"]:
|
This is our link processor that:
|
||||||
|
Handles both internal and external links
|
||||||
|
Normalizes URLs (removes fragments)
|
||||||
|
Checks depth limits
|
||||||
|
Scores URLs for priority
|
||||||
|
Updates depth tracking
|
||||||
|
Adds valid URLs to the queue
|
||||||
|
Updates maximum depth statistics
|
||||||
|
"""
|
||||||
|
links_ro_process = result.links["internal"]
|
||||||
|
if self.process_external_links:
|
||||||
|
links_ro_process += result.links["external"]
|
||||||
|
for link_type in links_ro_process:
|
||||||
for link in result.links[link_type]:
|
for link in result.links[link_type]:
|
||||||
url = urljoin(source_url, link['href'])
|
url = link['href']
|
||||||
url = urlunparse(urlparse(url)._replace(fragment=""))
|
# url = urljoin(source_url, link['href'])
|
||||||
|
# url = urlunparse(urlparse(url)._replace(fragment=""))
|
||||||
|
|
||||||
if url not in visited and await self.can_process_url(url):
|
if url not in visited and await self.can_process_url(url):
|
||||||
new_depth = depths[source_url] + 1
|
new_depth = depths[source_url] + 1
|
||||||
@@ -202,6 +239,15 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
"""Implement BFS crawling strategy"""
|
"""Implement BFS crawling strategy"""
|
||||||
|
|
||||||
# Initialize crawl state
|
# Initialize crawl state
|
||||||
|
"""
|
||||||
|
queue: A priority queue where items are tuples of (score, depth, url)
|
||||||
|
Score: Determines crawling priority (lower = higher priority)
|
||||||
|
Depth: Current distance from start_url
|
||||||
|
URL: The actual URL to crawl
|
||||||
|
visited: Keeps track of URLs we've already seen to avoid cycles
|
||||||
|
depths: Maps URLs to their depths from the start URL
|
||||||
|
pending_tasks: Tracks currently running crawl tasks
|
||||||
|
"""
|
||||||
queue = asyncio.PriorityQueue()
|
queue = asyncio.PriorityQueue()
|
||||||
await queue.put((0, 0, start_url))
|
await queue.put((0, 0, start_url))
|
||||||
visited: Set[str] = set()
|
visited: Set[str] = set()
|
||||||
@@ -210,8 +256,24 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
while (not queue.empty() or pending_tasks) and not self._cancel_event.is_set():
|
while (not queue.empty() or pending_tasks) and not self._cancel_event.is_set():
|
||||||
|
"""
|
||||||
|
This sets up our main control loop which:
|
||||||
|
- Continues while there are URLs to process (not queue.empty())
|
||||||
|
- Or while there are tasks still running (pending_tasks)
|
||||||
|
- Can be interrupted via cancellation (not self._cancel_event.is_set())
|
||||||
|
"""
|
||||||
# Start new tasks up to max_concurrent
|
# Start new tasks up to max_concurrent
|
||||||
while not queue.empty() and len(pending_tasks) < self.max_concurrent:
|
while not queue.empty() and len(pending_tasks) < self.max_concurrent:
|
||||||
|
"""
|
||||||
|
This section manages task creation:
|
||||||
|
Checks if we can start more tasks (under max_concurrent limit)
|
||||||
|
Gets the next URL from the priority queue
|
||||||
|
Marks URLs as visited immediately to prevent duplicates
|
||||||
|
Updates current depth in stats
|
||||||
|
Either:
|
||||||
|
Creates a new async task (parallel mode)
|
||||||
|
Processes URL directly (sequential mode)
|
||||||
|
"""
|
||||||
_, depth, url = await queue.get()
|
_, depth, url = await queue.get()
|
||||||
if url not in visited:
|
if url not in visited:
|
||||||
visited.add(url)
|
visited.add(url)
|
||||||
@@ -230,6 +292,13 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
yield result
|
yield result
|
||||||
|
|
||||||
# Process completed tasks
|
# Process completed tasks
|
||||||
|
"""
|
||||||
|
This section manages completed tasks:
|
||||||
|
Waits for any task to complete using asyncio.wait
|
||||||
|
Uses FIRST_COMPLETED to handle results as soon as they're ready
|
||||||
|
Yields successful results to the caller
|
||||||
|
Updates pending_tasks to remove completed ones
|
||||||
|
"""
|
||||||
if pending_tasks:
|
if pending_tasks:
|
||||||
done, pending_tasks = await asyncio.wait(
|
done, pending_tasks = await asyncio.wait(
|
||||||
pending_tasks,
|
pending_tasks,
|
||||||
|
|||||||
244
docs/scrapper/bfs_scraper_strategy.md
Normal file
244
docs/scrapper/bfs_scraper_strategy.md
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
# BFS Scraper Strategy: Smart Web Traversal
|
||||||
|
|
||||||
|
The BFS (Breadth-First Search) Scraper Strategy provides an intelligent way to traverse websites systematically. It crawls websites level by level, ensuring thorough coverage while respecting web crawling etiquette.
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TB
|
||||||
|
Start([Start]) --> Init[Initialize BFS Strategy]
|
||||||
|
Init --> InitStats[Initialize CrawlStats]
|
||||||
|
InitStats --> InitQueue[Initialize Priority Queue]
|
||||||
|
InitQueue --> AddStart[Add Start URL to Queue]
|
||||||
|
|
||||||
|
AddStart --> CheckState{Queue Empty or\nTasks Pending?}
|
||||||
|
CheckState -->|No| Cleanup[Cleanup & Stats]
|
||||||
|
Cleanup --> End([End])
|
||||||
|
|
||||||
|
CheckState -->|Yes| CheckCancel{Cancel\nRequested?}
|
||||||
|
CheckCancel -->|Yes| Cleanup
|
||||||
|
|
||||||
|
CheckCancel -->|No| CheckConcurrent{Under Max\nConcurrent?}
|
||||||
|
|
||||||
|
CheckConcurrent -->|No| WaitComplete[Wait for Task Completion]
|
||||||
|
WaitComplete --> YieldResult[Yield Result]
|
||||||
|
YieldResult --> CheckState
|
||||||
|
|
||||||
|
CheckConcurrent -->|Yes| GetNextURL[Get Next URL from Queue]
|
||||||
|
|
||||||
|
GetNextURL --> ValidateURL{Already\nVisited?}
|
||||||
|
ValidateURL -->|Yes| CheckState
|
||||||
|
|
||||||
|
ValidateURL -->|No| ProcessURL[Process URL]
|
||||||
|
|
||||||
|
subgraph URL_Processing [URL Processing]
|
||||||
|
ProcessURL --> CheckValid{URL Valid?}
|
||||||
|
CheckValid -->|No| UpdateStats[Update Skip Stats]
|
||||||
|
|
||||||
|
CheckValid -->|Yes| CheckRobots{Allowed by\nrobots.txt?}
|
||||||
|
CheckRobots -->|No| UpdateRobotStats[Update Robot Stats]
|
||||||
|
|
||||||
|
CheckRobots -->|Yes| ApplyDelay[Apply Politeness Delay]
|
||||||
|
ApplyDelay --> FetchContent[Fetch Content with Rate Limit]
|
||||||
|
|
||||||
|
FetchContent --> CheckError{Error?}
|
||||||
|
CheckError -->|Yes| Retry{Retry\nNeeded?}
|
||||||
|
Retry -->|Yes| FetchContent
|
||||||
|
Retry -->|No| UpdateFailStats[Update Fail Stats]
|
||||||
|
|
||||||
|
CheckError -->|No| ExtractLinks[Extract & Process Links]
|
||||||
|
ExtractLinks --> ScoreURLs[Score New URLs]
|
||||||
|
ScoreURLs --> AddToQueue[Add to Priority Queue]
|
||||||
|
end
|
||||||
|
|
||||||
|
ProcessURL --> CreateTask{Parallel\nProcessing?}
|
||||||
|
CreateTask -->|Yes| AddTask[Add to Pending Tasks]
|
||||||
|
CreateTask -->|No| DirectProcess[Process Directly]
|
||||||
|
|
||||||
|
AddTask --> CheckState
|
||||||
|
DirectProcess --> YieldResult
|
||||||
|
|
||||||
|
UpdateStats --> CheckState
|
||||||
|
UpdateRobotStats --> CheckState
|
||||||
|
UpdateFailStats --> CheckState
|
||||||
|
|
||||||
|
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
|
||||||
|
classDef decision fill:#fff59d,stroke:#000,stroke-width:2px;
|
||||||
|
classDef error fill:#ef9a9a,stroke:#000,stroke-width:2px;
|
||||||
|
classDef stats fill:#a5d6a7,stroke:#000,stroke-width:2px;
|
||||||
|
|
||||||
|
class Start,End stats;
|
||||||
|
class CheckState,CheckCancel,CheckConcurrent,ValidateURL,CheckValid,CheckRobots,CheckError,Retry,CreateTask decision;
|
||||||
|
class UpdateStats,UpdateRobotStats,UpdateFailStats,InitStats,Cleanup stats;
|
||||||
|
class ProcessURL,FetchContent,ExtractLinks,ScoreURLs process;
|
||||||
|
```
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
The BFS strategy crawls a website by:
|
||||||
|
1. Starting from a root URL
|
||||||
|
2. Processing all URLs at the current depth
|
||||||
|
3. Moving to URLs at the next depth level
|
||||||
|
4. Continuing until maximum depth is reached
|
||||||
|
|
||||||
|
This ensures systematic coverage of the website while maintaining control over the crawling process.
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
### 1. Smart URL Processing
|
||||||
|
```python
|
||||||
|
strategy = BFSScraperStrategy(
|
||||||
|
max_depth=2,
|
||||||
|
filter_chain=my_filters,
|
||||||
|
url_scorer=my_scorer,
|
||||||
|
max_concurrent=5
|
||||||
|
)
|
||||||
|
```
|
||||||
|
- Controls crawl depth
|
||||||
|
- Filters unwanted URLs
|
||||||
|
- Scores URLs for priority
|
||||||
|
- Manages concurrent requests
|
||||||
|
|
||||||
|
### 2. Polite Crawling
|
||||||
|
The strategy automatically implements web crawling best practices:
|
||||||
|
- Respects robots.txt
|
||||||
|
- Implements rate limiting
|
||||||
|
- Adds politeness delays
|
||||||
|
- Manages concurrent requests
|
||||||
|
|
||||||
|
### 3. Link Processing Control
|
||||||
|
```python
|
||||||
|
strategy = BFSScraperStrategy(
|
||||||
|
...,
|
||||||
|
process_external_links=False # Only process internal links
|
||||||
|
)
|
||||||
|
```
|
||||||
|
- Control whether to follow external links
|
||||||
|
- Default: internal links only
|
||||||
|
- Enable external links when needed
|
||||||
|
|
||||||
|
## Configuration Options
|
||||||
|
|
||||||
|
| Parameter | Description | Default |
|
||||||
|
|-----------|-------------|---------|
|
||||||
|
| max_depth | Maximum crawl depth | Required |
|
||||||
|
| filter_chain | URL filtering rules | Required |
|
||||||
|
| url_scorer | URL priority scoring | Required |
|
||||||
|
| max_concurrent | Max parallel requests | 5 |
|
||||||
|
| min_crawl_delay | Seconds between requests | 1 |
|
||||||
|
| process_external_links | Follow external links | False |
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
1. **Set Appropriate Depth**
|
||||||
|
- Start with smaller depths (2-3)
|
||||||
|
- Increase based on needs
|
||||||
|
- Consider site structure
|
||||||
|
|
||||||
|
2. **Configure Filters**
|
||||||
|
- Use URL patterns
|
||||||
|
- Filter by content type
|
||||||
|
- Avoid unwanted sections
|
||||||
|
|
||||||
|
3. **Tune Performance**
|
||||||
|
- Adjust max_concurrent
|
||||||
|
- Set appropriate delays
|
||||||
|
- Monitor resource usage
|
||||||
|
|
||||||
|
4. **Handle External Links**
|
||||||
|
- Keep external_links=False for focused crawls
|
||||||
|
- Enable only when needed
|
||||||
|
- Consider additional filtering
|
||||||
|
|
||||||
|
## Example Usage
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai.scraper import BFSScraperStrategy
|
||||||
|
from crawl4ai.scraper.filters import FilterChain
|
||||||
|
from crawl4ai.scraper.scorers import BasicURLScorer
|
||||||
|
|
||||||
|
# Configure strategy
|
||||||
|
strategy = BFSScraperStrategy(
|
||||||
|
max_depth=3,
|
||||||
|
filter_chain=FilterChain([
|
||||||
|
URLPatternFilter("*.example.com/*"),
|
||||||
|
ContentTypeFilter(["text/html"])
|
||||||
|
]),
|
||||||
|
url_scorer=BasicURLScorer(),
|
||||||
|
max_concurrent=5,
|
||||||
|
min_crawl_delay=1,
|
||||||
|
process_external_links=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Use with AsyncWebScraper
|
||||||
|
scraper = AsyncWebScraper(crawler, strategy)
|
||||||
|
results = await scraper.ascrape("https://example.com")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Use Cases
|
||||||
|
|
||||||
|
### 1. Site Mapping
|
||||||
|
```python
|
||||||
|
strategy = BFSScraperStrategy(
|
||||||
|
max_depth=5,
|
||||||
|
filter_chain=site_filter,
|
||||||
|
url_scorer=depth_scorer,
|
||||||
|
process_external_links=False
|
||||||
|
)
|
||||||
|
```
|
||||||
|
Perfect for creating complete site maps or understanding site structure.
|
||||||
|
|
||||||
|
### 2. Content Aggregation
|
||||||
|
```python
|
||||||
|
strategy = BFSScraperStrategy(
|
||||||
|
max_depth=2,
|
||||||
|
filter_chain=content_filter,
|
||||||
|
url_scorer=relevance_scorer,
|
||||||
|
max_concurrent=3
|
||||||
|
)
|
||||||
|
```
|
||||||
|
Ideal for collecting specific types of content (articles, products, etc.).
|
||||||
|
|
||||||
|
### 3. Link Analysis
|
||||||
|
```python
|
||||||
|
strategy = BFSScraperStrategy(
|
||||||
|
max_depth=1,
|
||||||
|
filter_chain=link_filter,
|
||||||
|
url_scorer=link_scorer,
|
||||||
|
process_external_links=True
|
||||||
|
)
|
||||||
|
```
|
||||||
|
Useful for analyzing both internal and external link structures.
|
||||||
|
|
||||||
|
## Advanced Features
|
||||||
|
|
||||||
|
### Progress Monitoring
|
||||||
|
```python
|
||||||
|
async for result in scraper.ascrape(url):
|
||||||
|
print(f"Current depth: {strategy.stats.current_depth}")
|
||||||
|
print(f"Processed URLs: {strategy.stats.urls_processed}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Custom URL Scoring
|
||||||
|
```python
|
||||||
|
class CustomScorer(URLScorer):
|
||||||
|
def score(self, url: str) -> float:
|
||||||
|
# Lower scores = higher priority
|
||||||
|
return score_based_on_criteria(url)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
1. **Slow Crawling**
|
||||||
|
- Increase max_concurrent
|
||||||
|
- Adjust min_crawl_delay
|
||||||
|
- Check network conditions
|
||||||
|
|
||||||
|
2. **Missing Content**
|
||||||
|
- Verify max_depth
|
||||||
|
- Check filter settings
|
||||||
|
- Review URL patterns
|
||||||
|
|
||||||
|
3. **High Resource Usage**
|
||||||
|
- Reduce max_concurrent
|
||||||
|
- Increase crawl delay
|
||||||
|
- Add more specific filters
|
||||||
|
|
||||||
Reference in New Issue
Block a user