From bb6450f458f7bd1a3e7f9903c4726a3a66158828 Mon Sep 17 00:00:00 2001
From: Aravind Karnam
Date: Mon, 27 Jan 2025 11:58:54 +0530
Subject: [PATCH] Remove robots.txt compliance from scraper

---
 crawl4ai/scraper/bfs_scraper_strategy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py
index 646ff1d1..2a706a28 100644
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -50,7 +50,7 @@ class BFSScraperStrategy(ScraperStrategy):
         self.process_external_links = process_external_links
 
     async def can_process_url(self, url: str, depth: int) -> bool:
-        """Check if URL can be processed based on robots.txt and filters
+        """Check if URL can be processed based on filters
         This is our gatekeeper method that determines if a URL should be processed. It:
         - Validates URL format using a robust built-in method
         - Applies custom filters from the filter chain