Fixed a bug in _process_links, handled the case where url_scorer is passed as None, and renamed the scrapper folder to scraper.

Aravind Karnam
2024-11-23 13:52:34 +05:30
parent c1797037c0
commit f8e85b1499
6 changed files with 35 additions and 31 deletions


@@ -192,22 +192,26 @@ class BFSScraperStrategy(ScraperStrategy):
         links_ro_process = result.links["internal"]
         if self.process_external_links:
             links_ro_process += result.links["external"]
-        for link_type in links_ro_process:
-            for link in result.links[link_type]:
-                url = link['href']
-                # url = urljoin(source_url, link['href'])
-                # url = urlunparse(urlparse(url)._replace(fragment=""))
-                if url not in visited and await self.can_process_url(url):
-                    new_depth = depths[source_url] + 1
-                    if new_depth <= self.max_depth:
+        for link in links_ro_process:
+            url = link['href']
+            # url = urljoin(source_url, link['href'])
+            # url = urlunparse(urlparse(url)._replace(fragment=""))
+            if url not in visited and await self.can_process_url(url):
+                new_depth = depths[source_url] + 1
+                if new_depth <= self.max_depth:
+                    if self.url_scorer:
                         score = self.url_scorer.score(url)
                         await queue.put((score, new_depth, url))
                         depths[url] = new_depth
                         self.stats.total_depth_reached = max(
                             self.stats.total_depth_reached,
                             new_depth
                         )
+                    else:
+                        # When no url_scorer is provided, all URLs get the same score of 0,
+                        # so they are processed in FIFO order according to URL depth.
+                        score = 0
+                        await queue.put((score, new_depth, url))
+                        depths[url] = new_depth
+                        self.stats.total_depth_reached = max(
+                            self.stats.total_depth_reached,
+                            new_depth
+                        )
 
     async def ascrape(
         self,
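
As an aside, the FIFO fallback described in the new comment works because asyncio.PriorityQueue orders its entries by comparing the queued tuples element-wise: with a constant score of 0 in (score, new_depth, url), ties fall through to the depth and then to the URL string. The snippet below is a minimal standalone sketch of that behaviour, not code from this repository; the URLs are placeholders.

import asyncio

async def main():
    queue = asyncio.PriorityQueue()
    # Same tuple shape the crawler enqueues: (score, depth, url), with score fixed at 0
    await queue.put((0, 2, "https://example.com/a/b"))
    await queue.put((0, 1, "https://example.com/b"))
    await queue.put((0, 1, "https://example.com/a"))
    while not queue.empty():
        score, depth, url = await queue.get()
        print(score, depth, url)
    # The depth-1 URLs come out before the depth-2 URL because every score is equal.

asyncio.run(main())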