Fixed some bugs in robots.txt processing
This commit is contained in:
@@ -32,13 +32,17 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
|
|
||||||
# Robots.txt Parser
|
# Robots.txt Parser
|
||||||
def get_robot_parser(self, url: str) -> RobotFileParser:
    """Return a (cached) ``RobotFileParser`` for the host of *url*.

    The URL is parsed once; its netloc is used as the cache key in
    ``self.robot_parsers`` so each host's ``robots.txt`` is fetched at
    most once per strategy instance.

    Parameters
    ----------
    url : str
        The page URL whose host's robots.txt rules are needed.

    Returns
    -------
    RobotFileParser
        A parser that has already read ``{scheme}://{netloc}/robots.txt``.

    NOTE(review): the cache key is the netloc only, so ``http://`` and
    ``https://`` variants of the same host share one cached parser —
    confirm that is intended.
    """
    # `parts` is the full ParseResult, not just a domain string; the old
    # name `domain` was misleading and invited the original bug of using
    # the whole parse result where only the netloc was meant.
    parts = urlparse(url)
    # Default to 'http' when the URL carries no scheme (e.g. "example.com/x").
    scheme = parts.scheme if parts.scheme else 'http'
    netloc = parts.netloc
    if netloc not in self.robot_parsers:
        rp = RobotFileParser()
        # Fetch robots.txt with the same scheme as the page being crawled,
        # instead of the previously hard-coded https.
        rp.set_url(f"{scheme}://{netloc}/robots.txt")
        rp.read()  # network I/O: downloads and parses robots.txt
        self.robot_parsers[netloc] = rp
    return self.robot_parsers[netloc]
|
||||||
|
|
||||||
|
|
||||||
# Retry with exponential backoff
|
# Retry with exponential backoff
|
||||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
||||||
@@ -56,7 +60,7 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
# Robots.txt Compliance
|
# Robots.txt Compliance
|
||||||
if not self.get_robot_parser(url).can_fetch("YourUserAgent", url):
|
if not self.get_robot_parser(url).can_fetch(crawler.crawler_strategy.user_agent, url):
|
||||||
logging.info(f"Skipping {url} as per robots.txt")
|
logging.info(f"Skipping {url} as per robots.txt")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user