From 8a7d29ce85056a51f03049b37c51d83d5304743c Mon Sep 17 00:00:00 2001
From: Aravind Karnam <aravind.karanam@gmail.com>
Date: Wed, 16 Oct 2024 15:59:37 +0530
Subject: [PATCH] Update comments and remove content-type checking from core,
 as it is implemented as a filter

---
 crawl4ai/scraper/bfs_scraper_strategy.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py
index dc89047a..9022cd90 100644
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -24,18 +24,17 @@ class BFSScraperStrategy(ScraperStrategy):
         self.filter_chain = filter_chain
         self.url_scorer = url_scorer
         self.max_concurrent = max_concurrent
-        # 9. Crawl Politeness
+        # For Crawl Politeness
         self.last_crawl_time = defaultdict(float)
         self.min_crawl_delay = 1  # 1 second delay between requests to the same domain
-        # 5. Robots.txt Compliance
+        # For Robots.txt Compliance
         self.robot_parsers = {}
-    
+
     # Robots.txt Parser
     def get_robot_parser(self, url: str) -> RobotFileParser:
         domain = urlparse(url)
         scheme = domain.scheme if domain.scheme else 'http'  # Default to 'http' if no scheme provided
         netloc = domain.netloc
-
         if netloc not in self.robot_parsers:
             rp = RobotFileParser()
             rp.set_url(f"{scheme}://{netloc}/robots.txt")
@@ -90,11 +89,6 @@ class BFSScraperStrategy(ScraperStrategy):
                 await self.add_to_retry_queue(url)
             return crawl_result
         
-        # Content Type Checking
-        # if 'text/html' not in crawl_result.response_header.get('Content-Type', ''):
-        #     logging.info(f"Skipping non-HTML content: {url}")
-        #     return crawl_result
-
         visited.add(url)
 
         # Process links