chore: expose process_external_links as a param

1. Exposed process_external_links as a constructor parameter
2. Removed a few unused imports
3. Removed the separate URL normalisation for external links, as it won't be necessary
Author: Aravind Karnam
Date: 2024-11-26 10:07:11 +05:30
parent 2226ef53c8
commit b13fd71040
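With this change, callers can opt in to following external links when they construct the strategy. A minimal usage sketch, assuming a crawl4ai.scraper module path and a FilterChain that accepts a list of filters (both are assumptions, not confirmed by this commit):

from crawl4ai.scraper import BFSScraperStrategy   # hypothetical import path
from crawl4ai.scraper.filters import FilterChain  # hypothetical import path

strategy = BFSScraperStrategy(
    max_depth=2,
    filter_chain=FilterChain([]),    # empty chain for illustration: accept every URL
    url_scorer=None,                 # assume no scoring for this sketch
    process_external_links=True,     # newly exposed here; defaults to False
)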


@@ -1,10 +1,9 @@
-from abc import ABC, abstractmethod
-from typing import Union, AsyncGenerator, Optional, Dict, Set
+from typing import AsyncGenerator, Optional, Dict, Set
 from dataclasses import dataclass
 from datetime import datetime
 import asyncio
 import logging
-from urllib.parse import urljoin, urlparse, urlunparse
+from urllib.parse import urlparse
 from urllib.robotparser import RobotFileParser
 import validators
 import time
@@ -12,7 +11,7 @@ from aiolimiter import AsyncLimiter
 from tenacity import retry, stop_after_attempt, wait_exponential
 from collections import defaultdict
-from .models import ScraperResult, CrawlResult
+from .models import CrawlResult
 from .filters import FilterChain
 from .scorers import URLScorer
 from ..async_webcrawler import AsyncWebCrawler
@@ -37,6 +36,7 @@ class BFSScraperStrategy(ScraperStrategy):
         max_depth: int,
         filter_chain: FilterChain,
         url_scorer: URLScorer,
+        process_external_links: bool = False,
         max_concurrent: int = 5,
         min_crawl_delay: int = 1,
         timeout: int = 30,
@@ -53,7 +53,7 @@ class BFSScraperStrategy(ScraperStrategy):
         # Crawl control
         self.stats = CrawlStats(start_time=datetime.now())
         self._cancel_event = asyncio.Event()
-        self.process_external_links = False
+        self.process_external_links = process_external_links
         # Rate limiting and politeness
         self.rate_limiter = AsyncLimiter(1, 1)
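The assignment above is the heart of the change: previously the flag was hardcoded to False, so external links were never followed no matter what the caller wanted. Schematically (a condensed sketch, not the actual class):

class BeforeThisCommit:
    def __init__(self):
        self.process_external_links = False  # always off, not configurable

class AfterThisCommit:
    def __init__(self, process_external_links: bool = False):
        # The caller's choice is stored and consulted later during link processing.
        self.process_external_links = process_external_links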
@@ -189,14 +189,11 @@ class BFSScraperStrategy(ScraperStrategy):
         Adds valid URLs to the queue
         Updates maximum depth statistics
         """
-        links_ro_process = result.links["internal"]
+        links_to_process = result.links["internal"]
         if self.process_external_links:
-            links_ro_process += result.links["external"]
-        for link in links_ro_process:
+            links_to_process += result.links["external"]
+        for link in links_to_process:
             url = link['href']
-            # url = urljoin(source_url, link['href'])
-            # url = urlunparse(urlparse(url)._replace(fragment=""))
             if url not in visited and await self.can_process_url(url):
                 new_depth = depths[source_url] + 1
                 if new_depth <= self.max_depth:
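The two deleted comments were a leftover normalisation sketch, which is also why urljoin and urlunparse could be dropped from the imports in the first hunk. For reference, this is what those lines do when active; it is plain urllib.parse behaviour, with illustrative example values:

from urllib.parse import urljoin, urlparse, urlunparse

source_url = "https://example.com/docs/index.html"  # illustrative value
href = "page.html#intro"                            # illustrative value

# Resolve a possibly relative href against the page it appeared on.
url = urljoin(source_url, href)

# Strip the fragment: it points within a page and never changes the fetched resource.
url = urlunparse(urlparse(url)._replace(fragment=""))

print(url)  # https://example.com/docs/page.html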