From 8a7d29ce85056a51f03049b37c51d83d5304743c Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 16 Oct 2024 15:59:37 +0530 Subject: [PATCH] updated some comments and removed content type checking functionality from core as it's implemented as a filter --- crawl4ai/scraper/bfs_scraper_strategy.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index dc89047a..9022cd90 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -24,18 +24,17 @@ class BFSScraperStrategy(ScraperStrategy): self.filter_chain = filter_chain self.url_scorer = url_scorer self.max_concurrent = max_concurrent - # 9. Crawl Politeness + # For Crawl Politeness self.last_crawl_time = defaultdict(float) self.min_crawl_delay = 1 # 1 second delay between requests to the same domain - # 5. Robots.txt Compliance + # For Robots.txt Compliance self.robot_parsers = {} - + # Robots.txt Parser def get_robot_parser(self, url: str) -> RobotFileParser: domain = urlparse(url) scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided netloc = domain.netloc - if netloc not in self.robot_parsers: rp = RobotFileParser() rp.set_url(f"{scheme}://{netloc}/robots.txt") @@ -90,11 +89,6 @@ class BFSScraperStrategy(ScraperStrategy): await self.add_to_retry_queue(url) return crawl_result - # Content Type Checking - # if 'text/html' not in crawl_result.response_header.get('Content-Type', ''): - # logging.info(f"Skipping non-HTML content: {url}") - # return crawl_result - visited.add(url) # Process links