Remove robots.txt compliance from scraper
@@ -50,7 +50,7 @@ class BFSScraperStrategy(ScraperStrategy):
         self.process_external_links = process_external_links
 
     async def can_process_url(self, url: str, depth: int) -> bool:
-        """Check if URL can be processed based on robots.txt and filters
+        """Check if URL can be processed based on filters
         This is our gatekeeper method that determines if a URL should be processed. It:
         - Validates URL format using a robust built-in method
         - Applies custom filters from the filter chain
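For readers following this change, here is a minimal sketch of what the slimmed-down gatekeeper could look like once the robots.txt check is gone. It is an illustration only: the standalone function shape, the `filter_chain` argument, and its awaitable `apply(url)` method are assumptions, not taken from this diff.

```python
from urllib.parse import urlparse

async def can_process_url(url: str, depth: int, filter_chain) -> bool:
    """Check if URL can be processed based on filters (sketch)."""
    # Validate URL format with the standard library: require an
    # http(s) scheme and a non-empty network location.
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        return False
    # Apply custom filters from the filter chain. The apply() method
    # is assumed here to be awaitable and to return a bool; depth is
    # accepted to mirror the method signature, with any depth-based
    # filtering left to the individual filters in this sketch.
    if not await filter_chain.apply(url):
        return False
    return True
```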