Updated some comments and removed the content-type checking functionality from core, as it is implemented as a filter

This commit is contained in:
Aravind Karnam
2024-10-16 15:59:37 +05:30
parent 159bd875bd
commit 8a7d29ce85

View File

@@ -24,18 +24,17 @@ class BFSScraperStrategy(ScraperStrategy):
self.filter_chain = filter_chain
self.url_scorer = url_scorer
self.max_concurrent = max_concurrent
# 9. Crawl Politeness
# For Crawl Politeness
self.last_crawl_time = defaultdict(float)
self.min_crawl_delay = 1 # 1 second delay between requests to the same domain
# 5. Robots.txt Compliance
# For Robots.txt Compliance
self.robot_parsers = {}
# Robots.txt Parser
def get_robot_parser(self, url: str) -> RobotFileParser:
domain = urlparse(url)
scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided
netloc = domain.netloc
if netloc not in self.robot_parsers:
rp = RobotFileParser()
rp.set_url(f"{scheme}://{netloc}/robots.txt")
@@ -90,11 +89,6 @@ class BFSScraperStrategy(ScraperStrategy):
await self.add_to_retry_queue(url)
return crawl_result
# Content Type Checking
# if 'text/html' not in crawl_result.response_header.get('Content-Type', ''):
# logging.info(f"Skipping non-HTML content: {url}")
# return crawl_result
visited.add(url)
# Process links