From c1797037c02a3d26cd8e71fc3ba088c3a919c6cd Mon Sep 17 00:00:00 2001
From: Aravind Karnam <aravind.karanam@gmail.com>
Date: Sat, 23 Nov 2024 12:39:25 +0530
Subject: [PATCH] Fix a few bugs and import errors; switch from asyncio.timeout
 to asyncio.wait_for to support Python versions < 3.11

---
 crawl4ai/scraper/__init__.py             |  4 ++-
 crawl4ai/scraper/bfs_scraper_strategy.py | 41 ++++++------------------
 crawl4ai/scraper/scraper_strategy.py     | 18 ++++++++++-
 3 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/crawl4ai/scraper/__init__.py b/crawl4ai/scraper/__init__.py
index 1138a917..5af7ad6b 100644
--- a/crawl4ai/scraper/__init__.py
+++ b/crawl4ai/scraper/__init__.py
@@ -1,3 +1,5 @@
 from .async_web_scraper import AsyncWebScraper
 from .bfs_scraper_strategy import BFSScraperStrategy
-from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter
\ No newline at end of file
+from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter, DomainFilter
+from .scorers import KeywordRelevanceScorer, PathDepthScorer, FreshnessScorer, CompositeScorer
+from .scraper_strategy import ScraperStrategy
\ No newline at end of file
diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py
index 72935008..cae7ba90 100644
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -16,6 +16,7 @@ from .models import ScraperResult, CrawlResult
 from .filters import FilterChain
 from .scorers import URLScorer
 from ..async_webcrawler import AsyncWebCrawler
+from .scraper_strategy import ScraperStrategy
 
 @dataclass
 class CrawlStats:
@@ -28,30 +29,6 @@ class CrawlStats:
     current_depth: int = 0
     robots_blocked: int = 0
 
-class ScraperStrategy(ABC):
-    """Base class for scraping strategies"""
-    
-    @abstractmethod
-    async def ascrape(
-        self, 
-        url: str, 
-        crawler: AsyncWebCrawler, 
-        parallel_processing: bool = True,
-        stream: bool = False
-    ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
-        """Abstract method for scraping implementation"""
-        pass
-
-    @abstractmethod
-    async def can_process_url(self, url: str) -> bool:
-        """Check if URL can be processed based on strategy rules"""
-        pass
-
-    @abstractmethod
-    async def shutdown(self):
-        """Clean up resources used by the strategy"""
-        pass
-
 class BFSScraperStrategy(ScraperStrategy):
     """Breadth-First Search scraping strategy with politeness controls"""
 
@@ -135,11 +112,15 @@ class BFSScraperStrategy(ScraperStrategy):
     ) -> CrawlResult:
         """Crawl URL with retry logic"""
         try:
-            async with asyncio.timeout(self.timeout):
-                return await crawler.arun(url)
+            return await asyncio.wait_for(crawler.arun(url), timeout=self.timeout)
         except asyncio.TimeoutError:
             self.logger.error(f"Timeout crawling {url}")
             raise
+        except Exception as e:
+            # Log any other failure, then re-raise it unchanged to the caller
+            self.logger.error(f"Error crawling {url}: {e}")
+            raise
+
 
     async def process_url(
         self,
@@ -181,16 +162,14 @@ class BFSScraperStrategy(ScraperStrategy):
             async with self.rate_limiter:
                 result = await self._crawl_with_retry(crawler, url)
                 self.stats.urls_processed += 1
+                # Process links inside the try so failures are logged and counted
+                await self._process_links(result, url, depth, queue, visited, depths)
+                return result
         except Exception as e:
             self.logger.error(f"Error crawling {url}: {e}")
             self.stats.urls_failed += 1
             return None
 
-        # Process links
-        await self._process_links(result, url, depth, queue, visited, depths)
-        
-        return result
-
     async def _process_links(
         self,
         result: CrawlResult,
diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py
index e4872de7..f1588f0c 100644
--- a/crawl4ai/scraper/scraper_strategy.py
+++ b/crawl4ai/scraper/scraper_strategy.py
@@ -6,7 +6,13 @@ from typing import Union, AsyncGenerator
 
 class ScraperStrategy(ABC):
     @abstractmethod
-    async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
+    async def ascrape(
+        self, 
+        url: str, 
+        crawler: AsyncWebCrawler, 
+        parallel_processing: bool = True,
+        stream: bool = False
+    ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
         """Scrape the given URL using the specified crawler.
 
         Args:
@@ -23,4 +29,14 @@ class ScraperStrategy(ABC):
             ScraperResult: A summary of the scrape results containing the final extracted data 
             and the list of crawled URLs if stream is False.
         """
+        pass
+
+    @abstractmethod
+    async def can_process_url(self, url: str) -> bool:
+        """Check if URL can be processed based on strategy rules"""
+        pass
+
+    @abstractmethod
+    async def shutdown(self):
+        """Clean up resources used by the strategy"""
         pass
\ No newline at end of file