Fixed a few bugs, import errors and changed to asyncio wait_for instead of timeout to support python versions < 3.11
This commit is contained in:
@@ -1,3 +1,5 @@
|
|||||||
from .async_web_scraper import AsyncWebScraper
|
from .async_web_scraper import AsyncWebScraper
|
||||||
from .bfs_scraper_strategy import BFSScraperStrategy
|
from .bfs_scraper_strategy import BFSScraperStrategy
|
||||||
from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter
|
from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter, DomainFilter
|
||||||
|
from .scorers import KeywordRelevanceScorer, PathDepthScorer, FreshnessScorer, CompositeScorer
|
||||||
|
from .scraper_strategy import ScraperStrategy
|
||||||
@@ -16,6 +16,7 @@ from .models import ScraperResult, CrawlResult
|
|||||||
from .filters import FilterChain
|
from .filters import FilterChain
|
||||||
from .scorers import URLScorer
|
from .scorers import URLScorer
|
||||||
from ..async_webcrawler import AsyncWebCrawler
|
from ..async_webcrawler import AsyncWebCrawler
|
||||||
|
from .scraper_strategy import ScraperStrategy
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class CrawlStats:
|
class CrawlStats:
|
||||||
@@ -28,30 +29,6 @@ class CrawlStats:
|
|||||||
current_depth: int = 0
|
current_depth: int = 0
|
||||||
robots_blocked: int = 0
|
robots_blocked: int = 0
|
||||||
|
|
||||||
class ScraperStrategy(ABC):
|
|
||||||
"""Base class for scraping strategies"""
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
async def ascrape(
|
|
||||||
self,
|
|
||||||
url: str,
|
|
||||||
crawler: AsyncWebCrawler,
|
|
||||||
parallel_processing: bool = True,
|
|
||||||
stream: bool = False
|
|
||||||
) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
|
|
||||||
"""Abstract method for scraping implementation"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
async def can_process_url(self, url: str) -> bool:
|
|
||||||
"""Check if URL can be processed based on strategy rules"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
async def shutdown(self):
|
|
||||||
"""Clean up resources used by the strategy"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
class BFSScraperStrategy(ScraperStrategy):
|
class BFSScraperStrategy(ScraperStrategy):
|
||||||
"""Breadth-First Search scraping strategy with politeness controls"""
|
"""Breadth-First Search scraping strategy with politeness controls"""
|
||||||
|
|
||||||
@@ -135,11 +112,15 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
) -> CrawlResult:
|
) -> CrawlResult:
|
||||||
"""Crawl URL with retry logic"""
|
"""Crawl URL with retry logic"""
|
||||||
try:
|
try:
|
||||||
async with asyncio.timeout(self.timeout):
|
return await asyncio.wait_for(crawler.arun(url), timeout=self.timeout)
|
||||||
return await crawler.arun(url)
|
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
self.logger.error(f"Timeout crawling {url}")
|
self.logger.error(f"Timeout crawling {url}")
|
||||||
raise
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
# Catch any other exceptions that may cause retries
|
||||||
|
self.logger.error(f"Error crawling {url}: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
async def process_url(
|
async def process_url(
|
||||||
self,
|
self,
|
||||||
@@ -181,16 +162,14 @@ class BFSScraperStrategy(ScraperStrategy):
|
|||||||
async with self.rate_limiter:
|
async with self.rate_limiter:
|
||||||
result = await self._crawl_with_retry(crawler, url)
|
result = await self._crawl_with_retry(crawler, url)
|
||||||
self.stats.urls_processed += 1
|
self.stats.urls_processed += 1
|
||||||
|
# Process links
|
||||||
|
await self._process_links(result, url, depth, queue, visited, depths)
|
||||||
|
return result
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(f"Error crawling {url}: {e}")
|
self.logger.error(f"Error crawling {url}: {e}")
|
||||||
self.stats.urls_failed += 1
|
self.stats.urls_failed += 1
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Process links
|
|
||||||
await self._process_links(result, url, depth, queue, visited, depths)
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
async def _process_links(
|
async def _process_links(
|
||||||
self,
|
self,
|
||||||
result: CrawlResult,
|
result: CrawlResult,
|
||||||
|
|||||||
@@ -6,7 +6,13 @@ from typing import Union, AsyncGenerator
|
|||||||
|
|
||||||
class ScraperStrategy(ABC):
|
class ScraperStrategy(ABC):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
|
async def ascrape(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
crawler: AsyncWebCrawler,
|
||||||
|
parallel_processing: bool = True,
|
||||||
|
stream: bool = False
|
||||||
|
) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
|
||||||
"""Scrape the given URL using the specified crawler.
|
"""Scrape the given URL using the specified crawler.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -23,4 +29,14 @@ class ScraperStrategy(ABC):
|
|||||||
ScraperResult: A summary of the scrape results containing the final extracted data
|
ScraperResult: A summary of the scrape results containing the final extracted data
|
||||||
and the list of crawled URLs if stream is False.
|
and the list of crawled URLs if stream is False.
|
||||||
"""
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def can_process_url(self, url: str) -> bool:
|
||||||
|
"""Check if URL can be processed based on strategy rules"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def shutdown(self):
|
||||||
|
"""Clean up resources used by the strategy"""
|
||||||
pass
|
pass
|
||||||
Reference in New Issue
Block a user