Updated some comments and removed the content-type checking functionality from the core, since it is already implemented as a filter
This commit is contained in:
@@ -24,18 +24,17 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
self.filter_chain = filter_chain
|
self.filter_chain = filter_chain
|
||||||
self.url_scorer = url_scorer
|
self.url_scorer = url_scorer
|
||||||
self.max_concurrent = max_concurrent
|
self.max_concurrent = max_concurrent
|
||||||
# 9. Crawl Politeness
|
# For Crawl Politeness
|
||||||
self.last_crawl_time = defaultdict(float)
|
self.last_crawl_time = defaultdict(float)
|
||||||
self.min_crawl_delay = 1 # 1 second delay between requests to the same domain
|
self.min_crawl_delay = 1 # 1 second delay between requests to the same domain
|
||||||
# 5. Robots.txt Compliance
|
# For Robots.txt Compliance
|
||||||
self.robot_parsers = {}
|
self.robot_parsers = {}
|
||||||
|
|
||||||
# Robots.txt Parser
|
# Robots.txt Parser
|
||||||
def get_robot_parser(self, url: str) -> RobotFileParser:
|
def get_robot_parser(self, url: str) -> RobotFileParser:
|
||||||
domain = urlparse(url)
|
domain = urlparse(url)
|
||||||
scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided
|
scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided
|
||||||
netloc = domain.netloc
|
netloc = domain.netloc
|
||||||
|
|
||||||
if netloc not in self.robot_parsers:
|
if netloc not in self.robot_parsers:
|
||||||
rp = RobotFileParser()
|
rp = RobotFileParser()
|
||||||
rp.set_url(f"{scheme}://{netloc}/robots.txt")
|
rp.set_url(f"{scheme}://{netloc}/robots.txt")
|
||||||
@@ -90,11 +89,6 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
await self.add_to_retry_queue(url)
|
await self.add_to_retry_queue(url)
|
||||||
return crawl_result
|
return crawl_result
|
||||||
|
|
||||||
# Content Type Checking
|
|
||||||
# if 'text/html' not in crawl_result.response_header.get('Content-Type', ''):
|
|
||||||
# logging.info(f"Skipping non-HTML content: {url}")
|
|
||||||
# return crawl_result
|
|
||||||
|
|
||||||
visited.add(url)
|
visited.add(url)
|
||||||
|
|
||||||
# Process links
|
# Process links
|
||||||
|
|||||||
Reference in New Issue
Block a user