diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 4506dbfe..72935008 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -76,6 +76,7 @@ class BFSScraperStrategy(ScraperStrategy): # Crawl control self.stats = CrawlStats(start_time=datetime.now()) self._cancel_event = asyncio.Event() + self.process_external_links = False # Rate limiting and politeness self.rate_limiter = AsyncLimiter(1, 1) @@ -84,7 +85,14 @@ class BFSScraperStrategy(ScraperStrategy): self.domain_queues: Dict[str, asyncio.Queue] = defaultdict(asyncio.Queue) async def can_process_url(self, url: str) -> bool: - """Check if URL can be processed based on robots.txt and filters""" + """Check if URL can be processed based on robots.txt and filters + This is our gatekeeper method that determines if a URL should be processed. It: + - Validates URL format using the validators library + - Checks robots.txt permissions for the domain + - Applies custom filters from the filter chain + - Updates statistics for blocked URLs + - Returns False early if any check fails + """ if not validators.url(url): self.logger.warning(f"Invalid URL: {url}") return False @@ -98,7 +106,13 @@ class BFSScraperStrategy(ScraperStrategy): return self.filter_chain.apply(url) async def _get_robot_parser(self, url: str) -> Optional[RobotFileParser]: - """Get or create robots.txt parser for domain""" + """Get or create robots.txt parser for domain. 
+ This is our robots.txt manager that: + - Uses domain-level caching of robot parsers + - Creates and caches new parsers as needed + - Handles failed robots.txt fetches gracefully + - Returns None if robots.txt can't be fetched, allowing crawling to proceed + """ domain = urlparse(url).netloc if domain not in self.robot_parsers: parser = RobotFileParser() @@ -136,7 +150,17 @@ class BFSScraperStrategy(ScraperStrategy): visited: Set[str], depths: Dict[str, int] ) -> Optional[CrawlResult]: - """Process a single URL and extract links""" + """Process a single URL and extract links. + This is our main URL processing workhorse that: + - Checks for cancellation + - Validates URLs through can_process_url + - Implements politeness delays per domain + - Applies rate limiting + - Handles crawling with retries + - Updates various statistics + - Processes extracted links + - Returns the crawl result or None on failure + """ if self._cancel_event.is_set(): return None @@ -176,11 +200,24 @@ class BFSScraperStrategy(ScraperStrategy): visited: Set[str], depths: Dict[str, int] ): - """Process extracted links from crawl result""" - for link_type in ["internal", "external"]: + """Process extracted links from crawl result. 
+ This is our link processor that: + - Handles both internal and external links + - Normalizes URLs (removes fragments) + - Checks depth limits + - Scores URLs for priority + - Updates depth tracking + - Adds valid URLs to the queue + - Updates maximum depth statistics + """ + link_types = ["internal"] + if self.process_external_links: + link_types.append("external") + for link_type in link_types: + for link in result.links[link_type]: - url = urljoin(source_url, link['href']) - url = urlunparse(urlparse(url)._replace(fragment="")) + url = link['href']  # NOTE(review): relative hrefs will fail validators.url and be skipped — confirm hrefs are absolute + # url = urljoin(source_url, link['href']) + # url = urlunparse(urlparse(url)._replace(fragment="")) if url not in visited and await self.can_process_url(url): new_depth = depths[source_url] + 1 @@ -202,6 +239,15 @@ class BFSScraperStrategy(ScraperStrategy): """Implement BFS crawling strategy""" # Initialize crawl state + """ + queue: A priority queue where items are tuples of (score, depth, url) + Score: Determines crawling priority (lower = higher priority) + Depth: Current distance from start_url + URL: The actual URL to crawl + visited: Keeps track of URLs we've already seen to avoid cycles + depths: Maps URLs to their depths from the start URL + pending_tasks: Tracks currently running crawl tasks + """ queue = asyncio.PriorityQueue() await queue.put((0, 0, start_url)) visited: Set[str] = set() @@ -210,8 +256,24 @@ class BFSScraperStrategy(ScraperStrategy): try: while (not queue.empty() or pending_tasks) and not self._cancel_event.is_set(): + """ + This sets up our main control loop which: + - Continues while there are URLs to process (not queue.empty()) + - Or while there are tasks still running (pending_tasks) + - Can be interrupted via cancellation (not self._cancel_event.is_set()) + """ # Start new tasks up to max_concurrent while not queue.empty() and len(pending_tasks) < self.max_concurrent: + """ + This section manages task creation: + Checks if we can start more tasks (under 
max_concurrent limit) + Gets the next URL from the priority queue + Marks URLs as visited immediately to prevent duplicates + Updates current depth in stats + Either: + Creates a new async task (parallel mode) + Processes URL directly (sequential mode) + """ _, depth, url = await queue.get() if url not in visited: visited.add(url) @@ -230,6 +292,13 @@ class BFSScraperStrategy(ScraperStrategy): yield result # Process completed tasks + """ + This section manages completed tasks: + Waits for any task to complete using asyncio.wait + Uses FIRST_COMPLETED to handle results as soon as they're ready + Yields successful results to the caller + Updates pending_tasks to remove completed ones + """ if pending_tasks: done, pending_tasks = await asyncio.wait( pending_tasks, diff --git a/docs/scrapper/bfs_scraper_strategy.md b/docs/scrapper/bfs_scraper_strategy.md new file mode 100644 index 00000000..7fe1319c --- /dev/null +++ b/docs/scrapper/bfs_scraper_strategy.md @@ -0,0 +1,244 @@ +# BFS Scraper Strategy: Smart Web Traversal + +The BFS (Breadth-First Search) Scraper Strategy provides an intelligent way to traverse websites systematically. It crawls websites level by level, ensuring thorough coverage while respecting web crawling etiquette. 
+ +```mermaid +flowchart TB + Start([Start]) --> Init[Initialize BFS Strategy] + Init --> InitStats[Initialize CrawlStats] + InitStats --> InitQueue[Initialize Priority Queue] + InitQueue --> AddStart[Add Start URL to Queue] + + AddStart --> CheckState{Queue Empty or\nTasks Pending?} + CheckState -->|No| Cleanup[Cleanup & Stats] + Cleanup --> End([End]) + + CheckState -->|Yes| CheckCancel{Cancel\nRequested?} + CheckCancel -->|Yes| Cleanup + + CheckCancel -->|No| CheckConcurrent{Under Max\nConcurrent?} + + CheckConcurrent -->|No| WaitComplete[Wait for Task Completion] + WaitComplete --> YieldResult[Yield Result] + YieldResult --> CheckState + + CheckConcurrent -->|Yes| GetNextURL[Get Next URL from Queue] + + GetNextURL --> ValidateURL{Already\nVisited?} + ValidateURL -->|Yes| CheckState + + ValidateURL -->|No| ProcessURL[Process URL] + + subgraph URL_Processing [URL Processing] + ProcessURL --> CheckValid{URL Valid?} + CheckValid -->|No| UpdateStats[Update Skip Stats] + + CheckValid -->|Yes| CheckRobots{Allowed by\nrobots.txt?} + CheckRobots -->|No| UpdateRobotStats[Update Robot Stats] + + CheckRobots -->|Yes| ApplyDelay[Apply Politeness Delay] + ApplyDelay --> FetchContent[Fetch Content with Rate Limit] + + FetchContent --> CheckError{Error?} + CheckError -->|Yes| Retry{Retry\nNeeded?} + Retry -->|Yes| FetchContent + Retry -->|No| UpdateFailStats[Update Fail Stats] + + CheckError -->|No| ExtractLinks[Extract & Process Links] + ExtractLinks --> ScoreURLs[Score New URLs] + ScoreURLs --> AddToQueue[Add to Priority Queue] + end + + ProcessURL --> CreateTask{Parallel\nProcessing?} + CreateTask -->|Yes| AddTask[Add to Pending Tasks] + CreateTask -->|No| DirectProcess[Process Directly] + + AddTask --> CheckState + DirectProcess --> YieldResult + + UpdateStats --> CheckState + UpdateRobotStats --> CheckState + UpdateFailStats --> CheckState + + classDef process fill:#90caf9,stroke:#000,stroke-width:2px; + classDef decision fill:#fff59d,stroke:#000,stroke-width:2px; + 
classDef error fill:#ef9a9a,stroke:#000,stroke-width:2px; + classDef stats fill:#a5d6a7,stroke:#000,stroke-width:2px; + + class Start,End stats; + class CheckState,CheckCancel,CheckConcurrent,ValidateURL,CheckValid,CheckRobots,CheckError,Retry,CreateTask decision; + class UpdateStats,UpdateRobotStats,UpdateFailStats,InitStats,Cleanup stats; + class ProcessURL,FetchContent,ExtractLinks,ScoreURLs process; +``` + +## How It Works + +The BFS strategy crawls a website by: +1. Starting from a root URL +2. Processing all URLs at the current depth +3. Moving to URLs at the next depth level +4. Continuing until maximum depth is reached + +This ensures systematic coverage of the website while maintaining control over the crawling process. + +## Key Features + +### 1. Smart URL Processing +```python +strategy = BFSScraperStrategy( + max_depth=2, + filter_chain=my_filters, + url_scorer=my_scorer, + max_concurrent=5 +) +``` +- Controls crawl depth +- Filters unwanted URLs +- Scores URLs for priority +- Manages concurrent requests + +### 2. Polite Crawling +The strategy automatically implements web crawling best practices: +- Respects robots.txt +- Implements rate limiting +- Adds politeness delays +- Manages concurrent requests + +### 3. Link Processing Control +```python +strategy = BFSScraperStrategy( + ..., + process_external_links=False # Only process internal links +) +``` +- Control whether to follow external links +- Default: internal links only +- Enable external links when needed + +## Configuration Options + +| Parameter | Description | Default | +|-----------|-------------|---------| +| max_depth | Maximum crawl depth | Required | +| filter_chain | URL filtering rules | Required | +| url_scorer | URL priority scoring | Required | +| max_concurrent | Max parallel requests | 5 | +| min_crawl_delay | Seconds between requests | 1 | +| process_external_links | Follow external links | False | + +## Best Practices + +1. 
**Set Appropriate Depth** + - Start with smaller depths (2-3) + - Increase based on needs + - Consider site structure + +2. **Configure Filters** + - Use URL patterns + - Filter by content type + - Avoid unwanted sections + +3. **Tune Performance** + - Adjust max_concurrent + - Set appropriate delays + - Monitor resource usage + +4. **Handle External Links** + - Keep external_links=False for focused crawls + - Enable only when needed + - Consider additional filtering + +## Example Usage + +```python +from crawl4ai.scraper import BFSScraperStrategy +from crawl4ai.scraper.filters import FilterChain +from crawl4ai.scraper.scorers import BasicURLScorer + +# Configure strategy +strategy = BFSScraperStrategy( + max_depth=3, + filter_chain=FilterChain([ + URLPatternFilter("*.example.com/*"), + ContentTypeFilter(["text/html"]) + ]), + url_scorer=BasicURLScorer(), + max_concurrent=5, + min_crawl_delay=1, + process_external_links=False +) + +# Use with AsyncWebScraper +scraper = AsyncWebScraper(crawler, strategy) +results = await scraper.ascrape("https://example.com") +``` + +## Common Use Cases + +### 1. Site Mapping +```python +strategy = BFSScraperStrategy( + max_depth=5, + filter_chain=site_filter, + url_scorer=depth_scorer, + process_external_links=False +) +``` +Perfect for creating complete site maps or understanding site structure. + +### 2. Content Aggregation +```python +strategy = BFSScraperStrategy( + max_depth=2, + filter_chain=content_filter, + url_scorer=relevance_scorer, + max_concurrent=3 +) +``` +Ideal for collecting specific types of content (articles, products, etc.). + +### 3. Link Analysis +```python +strategy = BFSScraperStrategy( + max_depth=1, + filter_chain=link_filter, + url_scorer=link_scorer, + process_external_links=True +) +``` +Useful for analyzing both internal and external link structures. 
+ +## Advanced Features + +### Progress Monitoring +```python +async for result in scraper.ascrape(url): + print(f"Current depth: {strategy.stats.current_depth}") + print(f"Processed URLs: {strategy.stats.urls_processed}") +``` + +### Custom URL Scoring +```python +class CustomScorer(URLScorer): + def score(self, url: str) -> float: + # Lower scores = higher priority + return score_based_on_criteria(url) +``` + +## Troubleshooting + +1. **Slow Crawling** + - Increase max_concurrent + - Adjust min_crawl_delay + - Check network conditions + +2. **Missing Content** + - Verify max_depth + - Check filter settings + - Review URL patterns + +3. **High Resource Usage** + - Reduce max_concurrent + - Increase crawl delay + - Add more specific filters +