fix: pass depth as a param to can_process_url and apply the filter chain only when depth is non-zero

This way the filter chain is skipped for the start URL while the other validations still apply to it.
Aravind Karnam
2024-11-26 10:22:14 +05:30
parent b13fd71040
commit ee3001b1f7

@@ -61,7 +61,7 @@ class BFSScraperStrategy(ScraperStrategy):
         self.robot_parsers: Dict[str, RobotFileParser] = {}
         self.domain_queues: Dict[str, asyncio.Queue] = defaultdict(asyncio.Queue)

-    async def can_process_url(self, url: str) -> bool:
+    async def can_process_url(self, url: str, depth: int) -> bool:
         """Check if URL can be processed based on robots.txt and filters
         This is our gatekeeper method that determines if a URL should be processed. It:
         - Validates URL format using the validators library
@@ -80,7 +80,11 @@ class BFSScraperStrategy(ScraperStrategy):
             self.logger.info(f"Blocked by robots.txt: {url}")
             return False
-        return self.filter_chain.apply(url)
+        # Apply the filter chain only if this is not the start page
+        if depth != 0 and not self.filter_chain.apply(url):
+            return False
+        return True

     async def _get_robot_parser(self, url: str) -> Optional[RobotFileParser]:
         """Get or create robots.txt parser for domain.
@@ -146,7 +150,7 @@ class BFSScraperStrategy(ScraperStrategy):
         if self._cancel_event.is_set():
             return None

-        if depth!=0 and not await self.can_process_url(url):
+        if not await self.can_process_url(url, depth):
             self.stats.urls_skipped += 1
             return None
@@ -194,7 +198,7 @@ class BFSScraperStrategy(ScraperStrategy):
             links_to_process += result.links["external"]

         for link in links_to_process:
             url = link['href']
-            if url not in visited and await self.can_process_url(url):
+            if url not in visited and await self.can_process_url(url, depth):
                 new_depth = depths[source_url] + 1
                 if new_depth <= self.max_depth:
                     if self.url_scorer:
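As a standalone illustration of the behaviour the commit message describes (hypothetical names, not the crawl4ai API): basic validation still applies to the start URL, but the filter chain only runs for URLs discovered at depth greater than zero.

import asyncio

import validators  # the library the docstring above mentions for URL validation


class KeywordFilterChain:
    """Toy stand-in for a filter chain: accept only URLs that contain a keyword."""

    def __init__(self, keyword: str):
        self.keyword = keyword

    def apply(self, url: str) -> bool:
        return self.keyword in url


async def can_process_url(url: str, depth: int, filter_chain: KeywordFilterChain) -> bool:
    # Format validation runs for every URL, including the start URL.
    if not validators.url(url):
        return False
    # The filter chain is skipped for the start URL (depth 0).
    if depth != 0 and not filter_chain.apply(url):
        return False
    return True


async def main() -> None:
    chain = KeywordFilterChain("docs")
    # Start URL: accepted even though it does not match the filter, because depth is 0.
    print(await can_process_url("https://example.com/", 0, chain))            # True
    # Discovered link that matches the filter: accepted.
    print(await can_process_url("https://example.com/docs/intro", 1, chain))  # True
    # Discovered link that fails the filter: rejected.
    print(await can_process_url("https://example.com/blog/post", 1, chain))   # False
    # Malformed URL: rejected even at depth 0, because basic validation always runs.
    print(await can_process_url("not a url", 0, chain))                       # False


asyncio.run(main())

Passing depth into the gatekeeper, rather than special-casing depth at the call site as the removed line did, keeps the format and robots.txt checks applied uniformly while only the filter chain is depth-aware.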