From b13fd71040184851c367bd287d07772459eeb07a Mon Sep 17 00:00:00 2001
From: Aravind Karnam
Date: Tue, 26 Nov 2024 10:07:11 +0530
Subject: [PATCH] chore: 1. Expose process_external_links as a param 2. Remove
 a few unused imports 3. Remove separate URL normalisation for external links,
 as it is no longer necessary

---
 crawl4ai/scraper/bfs_scraper_strategy.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py
index b87412ef..c12bf42e 100644
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -1,10 +1,9 @@
-from abc import ABC, abstractmethod
-from typing import Union, AsyncGenerator, Optional, Dict, Set
+from typing import AsyncGenerator, Optional, Dict, Set
 from dataclasses import dataclass
 from datetime import datetime
 import asyncio
 import logging
-from urllib.parse import urljoin, urlparse, urlunparse
+from urllib.parse import urlparse
 from urllib.robotparser import RobotFileParser
 import validators
 import time
@@ -12,7 +11,7 @@
 from aiolimiter import AsyncLimiter
 from tenacity import retry, stop_after_attempt, wait_exponential
 from collections import defaultdict
-from .models import ScraperResult, CrawlResult
+from .models import CrawlResult
 from .filters import FilterChain
 from .scorers import URLScorer
 from ..async_webcrawler import AsyncWebCrawler
@@ -37,6 +36,7 @@ class BFSScraperStrategy(ScraperStrategy):
         max_depth: int,
         filter_chain: FilterChain,
         url_scorer: URLScorer,
+        process_external_links: bool = False,
         max_concurrent: int = 5,
         min_crawl_delay: int = 1,
         timeout: int = 30,
@@ -53,7 +53,7 @@ class BFSScraperStrategy(ScraperStrategy):
         # Crawl control
         self.stats = CrawlStats(start_time=datetime.now())
         self._cancel_event = asyncio.Event()
-        self.process_external_links = False
+        self.process_external_links = process_external_links
 
         # Rate limiting and politeness
         self.rate_limiter = AsyncLimiter(1, 1)
@@ -189,14 +189,11 @@ class BFSScraperStrategy(ScraperStrategy):
         Adds valid URLs to the queue
         Updates maximum depth statistics
         """
-        links_ro_process = result.links["internal"]
+        links_to_process = result.links["internal"]
         if self.process_external_links:
-            links_ro_process += result.links["external"]
-        for link in links_ro_process:
+            links_to_process += result.links["external"]
+        for link in links_to_process:
             url = link['href']
-            # url = urljoin(source_url, link['href'])
-            # url = urlunparse(urlparse(url)._replace(fragment=""))
-
             if url not in visited and await self.can_process_url(url):
                 new_depth = depths[source_url] + 1
                 if new_depth <= self.max_depth:
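
For reference, a minimal sketch of opting into the new behaviour from calling
code. The BFSScraperStrategy parameters come from this patch; the no-argument
FilterChain() and URLScorer() constructions are assumptions for illustration,
since their real signatures are not shown here:

    from crawl4ai.scraper.bfs_scraper_strategy import BFSScraperStrategy
    from crawl4ai.scraper.filters import FilterChain
    from crawl4ai.scraper.scorers import URLScorer

    # Hypothetical construction: the FilterChain/URLScorer argument lists are
    # assumptions; only BFSScraperStrategy's parameters come from this patch.
    strategy = BFSScraperStrategy(
        max_depth=2,
        filter_chain=FilterChain(),
        url_scorer=URLScorer(),
        process_external_links=True,  # new parameter: also follow off-site links
    )

Leaving process_external_links at its default of False preserves the previous
behaviour, where only links under result.links["internal"] are queued.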