Fixed a few bugs and import errors, and switched from asyncio.timeout to asyncio.wait_for to support Python versions < 3.11

Aravind Karnam
2024-11-23 12:39:25 +05:30
parent 60670b2af6
commit c1797037c0
3 changed files with 30 additions and 33 deletions

View File

@@ -1,3 +1,5 @@
 from .async_web_scraper import AsyncWebScraper
 from .bfs_scraper_strategy import BFSScraperStrategy
-from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter
+from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter, DomainFilter
+from .scorers import KeywordRelevanceScorer, PathDepthScorer, FreshnessScorer, CompositeScorer
+from .scraper_strategy import ScraperStrategy
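
For orientation, a hedged sketch of how the newly re-exported names might be combined; the package import path and every constructor keyword below are assumptions for illustration, not taken from this diff:

```python
# Hypothetical usage of the re-exported filter and scorer classes.
# The import path and all constructor keyword names are assumptions.
from scraper import (
    FilterChain,
    DomainFilter,
    ContentTypeFilter,
    KeywordRelevanceScorer,
)

# Restrict the crawl to a single domain and to HTML responses.
filter_chain = FilterChain([
    DomainFilter(allowed_domains=["example.com"]),
    ContentTypeFilter(allowed_types=["text/html"]),
])

# Prioritize frontier URLs that look relevant to these keywords.
scorer = KeywordRelevanceScorer(keywords=["python", "asyncio"])
```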

View File

@@ -16,6 +16,7 @@ from .models import ScraperResult, CrawlResult
 from .filters import FilterChain
 from .scorers import URLScorer
 from ..async_webcrawler import AsyncWebCrawler
+from .scraper_strategy import ScraperStrategy

 @dataclass
 class CrawlStats:
@@ -28,30 +29,6 @@ class CrawlStats:
     current_depth: int = 0
     robots_blocked: int = 0

-class ScraperStrategy(ABC):
-    """Base class for scraping strategies"""
-
-    @abstractmethod
-    async def ascrape(
-        self,
-        url: str,
-        crawler: AsyncWebCrawler,
-        parallel_processing: bool = True,
-        stream: bool = False
-    ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
-        """Abstract method for scraping implementation"""
-        pass
-
-    @abstractmethod
-    async def can_process_url(self, url: str) -> bool:
-        """Check if URL can be processed based on strategy rules"""
-        pass
-
-    @abstractmethod
-    async def shutdown(self):
-        """Clean up resources used by the strategy"""
-        pass
-
 class BFSScraperStrategy(ScraperStrategy):
     """Breadth-First Search scraping strategy with politeness controls"""
@@ -135,11 +112,15 @@ class BFSScraperStrategy(ScraperStrategy):
     ) -> CrawlResult:
         """Crawl URL with retry logic"""
         try:
-            async with asyncio.timeout(self.timeout):
-                return await crawler.arun(url)
+            return await asyncio.wait_for(crawler.arun(url), timeout=self.timeout)
         except asyncio.TimeoutError:
             self.logger.error(f"Timeout crawling {url}")
             raise
+        except Exception as e:
+            # Catch any other exceptions that may cause retries
+            self.logger.error(f"Error crawling {url}: {e}")
+            raise

     async def process_url(
         self,
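
Since this hunk is the compatibility fix named in the commit message, a brief self-contained sketch of the pattern may help; `fetch` and the timings here are illustrative only:

```python
import asyncio

async def fetch(url: str) -> str:
    # Stand-in for real crawling work such as crawler.arun(url).
    await asyncio.sleep(2)
    return f"content of {url}"

async def crawl_with_timeout(url: str, timeout: float = 1.0) -> str:
    # asyncio.timeout() only exists on Python 3.11+; asyncio.wait_for()
    # has been available since 3.4, so it keeps the module importable
    # and working on older interpreters.
    return await asyncio.wait_for(fetch(url), timeout=timeout)

async def main() -> None:
    try:
        await crawl_with_timeout("https://example.com")
    except asyncio.TimeoutError:
        print("timed out: the 2s task exceeded the 1s budget")

asyncio.run(main())
```

On timeout, `wait_for` cancels the awaited coroutine and raises `asyncio.TimeoutError`, much as the `asyncio.timeout` context manager cancels the enclosed work, so the replacement preserves the cancellation behavior relied on here.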
@@ -181,16 +162,14 @@ class BFSScraperStrategy(ScraperStrategy):
             async with self.rate_limiter:
                 result = await self._crawl_with_retry(crawler, url)
                 self.stats.urls_processed += 1
+                # Process links
+                await self._process_links(result, url, depth, queue, visited, depths)
+                return result
         except Exception as e:
             self.logger.error(f"Error crawling {url}: {e}")
             self.stats.urls_failed += 1
             return None

-        # Process links
-        await self._process_links(result, url, depth, queue, visited, depths)
-        return result

     async def _process_links(
         self,
         result: CrawlResult,
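
This reordering also changes error handling: link extraction now runs inside the try block, so an exception raised while processing links is logged and counted as a failed URL instead of propagating out of `process_url`. A minimal self-contained sketch of the new control flow, with stand-in names in place of the real helpers:

```python
import asyncio
import logging

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger("sketch")

stats = {"urls_processed": 0, "urls_failed": 0}

async def crawl(url: str) -> str:
    # Stand-in for self._crawl_with_retry(crawler, url).
    return f"<html>{url}</html>"

async def process_links(result: str, url: str) -> None:
    # Stand-in for self._process_links(...): enqueue discovered links.
    pass

async def process_url(url: str):
    try:
        result = await crawl(url)
        stats["urls_processed"] += 1
        # Now inside the try: a failure here is logged and counted as
        # a failed URL rather than escaping process_url.
        await process_links(result, url)
        return result
    except Exception as e:
        logger.error(f"Error crawling {url}: {e}")
        stats["urls_failed"] += 1
        return None

print(asyncio.run(process_url("https://example.com")))
```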

View File

@@ -6,7 +6,13 @@ from typing import Union, AsyncGenerator

 class ScraperStrategy(ABC):
     @abstractmethod
-    async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
+    async def ascrape(
+        self,
+        url: str,
+        crawler: AsyncWebCrawler,
+        parallel_processing: bool = True,
+        stream: bool = False
+    ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
         """Scrape the given URL using the specified crawler.

         Args:
@@ -23,4 +29,14 @@ class ScraperStrategy(ABC):
             ScraperResult: A summary of the scrape results containing the final extracted data
             and the list of crawled URLs if stream is False.
         """
+        pass
+
+    @abstractmethod
+    async def can_process_url(self, url: str) -> bool:
+        """Check if URL can be processed based on strategy rules"""
+        pass
+
+    @abstractmethod
+    async def shutdown(self):
+        """Clean up resources used by the strategy"""
         pass
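
To show how the relocated interface is meant to be consumed, a hedged sketch of a concrete strategy follows; the import paths are assumptions (the real package uses relative imports), and the `ScraperResult` constructor fields are guesses based on the docstring above:

```python
# Hypothetical implementation of the ScraperStrategy interface.
# Module paths and ScraperResult field names are assumptions.
from typing import AsyncGenerator, Union

from scraper_strategy import ScraperStrategy   # assumed layout
from models import CrawlResult, ScraperResult  # assumed layout


class SingleURLScraperStrategy(ScraperStrategy):
    """Toy strategy that scrapes only the starting URL."""

    async def ascrape(
        self,
        url: str,
        crawler,  # AsyncWebCrawler, as referenced in the diff
        parallel_processing: bool = True,
        stream: bool = False,
    ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
        result = await crawler.arun(url)
        if stream:
            # Yield results incrementally when streaming is requested.
            async def gen():
                yield result
            return gen()
        # Field names here are guessed from the docstring above.
        return ScraperResult(extracted_data=result, crawled_urls=[url])

    async def can_process_url(self, url: str) -> bool:
        # This toy strategy only accepts plain http(s) URLs.
        return url.startswith(("http://", "https://"))

    async def shutdown(self):
        # No pooled resources to release in this sketch.
        pass
```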