Fixed a bug in _process_links, handled the case where url_scorer is passed as None, and renamed the scrapper folder to scraper.

Aravind Karnam
2024-11-23 13:52:34 +05:30
parent c1797037c0
commit f8e85b1499
6 changed files with 35 additions and 31 deletions


@@ -192,22 +192,26 @@ class BFSScraperStrategy(ScraperStrategy):
         links_ro_process = result.links["internal"]
         if self.process_external_links:
             links_ro_process += result.links["external"]
-        for link_type in links_ro_process:
-            for link in result.links[link_type]:
-                url = link['href']
-                # url = urljoin(source_url, link['href'])
-                # url = urlunparse(urlparse(url)._replace(fragment=""))
-                if url not in visited and await self.can_process_url(url):
-                    new_depth = depths[source_url] + 1
-                    if new_depth <= self.max_depth:
+        for link in links_ro_process:
+            url = link['href']
+            # url = urljoin(source_url, link['href'])
+            # url = urlunparse(urlparse(url)._replace(fragment=""))
+            if url not in visited and await self.can_process_url(url):
+                new_depth = depths[source_url] + 1
+                if new_depth <= self.max_depth:
+                    if self.url_scorer:
                         score = self.url_scorer.score(url)
                         await queue.put((score, new_depth, url))
                         depths[url] = new_depth
                         self.stats.total_depth_reached = max(
                             self.stats.total_depth_reached,
                             new_depth
                         )
+                    else:
+                        # When no url_scorer is provided, all URLs get the same score of 0,
+                        # so they are processed in FIFO order according to URL depth.
+                        score = 0
+                        await queue.put((score, new_depth, url))
+                        depths[url] = new_depth
+                        self.stats.total_depth_reached = max(
+                            self.stats.total_depth_reached,
+                            new_depth
+                        )
 
     async def ascrape(
         self,
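
As an aside, the FIFO fallback described in the new comment works because asyncio.PriorityQueue orders its entries by comparing the queued tuples element-wise: with a constant score of 0 in (score, new_depth, url), ties fall through to the depth and then to the URL string. The snippet below is a minimal standalone sketch of that behaviour, not code from this repository; the URLs are placeholders.

import asyncio

async def main():
    queue = asyncio.PriorityQueue()
    # Same tuple shape the crawler enqueues: (score, depth, url), with score fixed at 0
    await queue.put((0, 2, "https://example.com/a/b"))
    await queue.put((0, 1, "https://example.com/b"))
    await queue.put((0, 1, "https://example.com/a"))
    while not queue.empty():
        score, depth, url = await queue.get()
        print(score, depth, url)
    # The depth-1 URLs come out before the depth-2 URL because every score is equal.

asyncio.run(main())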