1. Added a flag to yield each crawl result as it becomes ready, along with the final scraper result, as another option

2. Removed the ascrape_many method, as I'm currently not focusing on it in the first cut of the scraper
3. Added some error handling for cases where robots.txt cannot be fetched or parsed.
This commit is contained in:
Aravind Karnam
2024-10-16 22:05:29 +05:30
parent 8a7d29ce85
commit 2943feeecf
3 changed files with 62 additions and 40 deletions

View File

@@ -1,35 +1,35 @@
import asyncio
from typing import List, Dict
from .scraper_strategy import ScraperStrategy
from .bfs_scraper_strategy import BFSScraperStrategy
from .models import ScraperResult
from .models import ScraperResult, CrawlResult
from ..async_webcrawler import AsyncWebCrawler
class BatchProcessor:
    """Runs scrape jobs for lists of URLs with a bounded level of concurrency."""

    def __init__(self, batch_size: int, concurrency_limit: int):
        # batch_size: how many URLs one batch holds;
        # concurrency_limit: how many scrapes may run at once inside a batch.
        self.batch_size = batch_size
        self.concurrency_limit = concurrency_limit

    async def process_batch(self, scraper: 'AsyncWebScraper', urls: List[str]) -> List[ScraperResult]:
        """Scrape every URL in *urls*, at most ``concurrency_limit`` at a time.

        Results come back in the same order as *urls* (``asyncio.gather``
        preserves argument order).
        """
        gate = asyncio.Semaphore(self.concurrency_limit)

        async def bounded(target: str):
            # The semaphore caps how many scrapes are in flight concurrently.
            async with gate:
                return await scraper.ascrape(target)

        jobs = [bounded(u) for u in urls]
        return await asyncio.gather(*jobs)
from typing import Union, AsyncGenerator
class AsyncWebScraper:
    """Facade that drives a ScraperStrategy over an AsyncWebCrawler.

    NOTE(review): this diff rendering shows both the old and the new
    ``ascrape`` definitions back-to-back; at runtime the second
    ``def ascrape`` shadows the first.  ``ascrape_many`` is shown here but
    the commit message says it was removed.
    """

    def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy, batch_size: int = 10, concurrency_limit: int = 5):
        # Collaborators: the crawler fetches pages, the strategy decides which
        # URLs to visit; the batch processor bounds per-batch concurrency.
        self.crawler = crawler
        self.strategy = strategy
        self.batch_processor = BatchProcessor(batch_size, concurrency_limit)

    async def ascrape(self, url: str, parallel_processing: bool = True) -> ScraperResult:
        # Old signature (shadowed by the definition below): delegates straight
        # to the strategy and returns its awaited result.
        return await self.strategy.ascrape(url, self.crawler, parallel_processing)

    async def ascrape(self, url: str, parallel_processing: bool = True, yield_results: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
        """Scrape starting at *url*.

        When ``yield_results`` is True, return an async generator that yields
        each CrawlResult as it becomes ready; otherwise await the full crawl
        and return a single aggregated ScraperResult.
        """
        if yield_results:
            # Return the generator object itself; the caller iterates it.
            return self._ascrape_yielding(url, parallel_processing)
        else:
            return await self._ascrape_collecting(url, parallel_processing)

    async def ascrape_many(self, urls: List[str]) -> List[ScraperResult]:
        """Scrape several start URLs in fixed-size batches."""
        all_results = []
        for i in range(0, len(urls), self.batch_processor.batch_size):
            # Slice out one batch and hand it to the bounded-concurrency runner.
            batch = urls[i:i+self.batch_processor.batch_size]
            batch_results = await self.batch_processor.process_batch(self, batch)
            all_results.extend(batch_results)
        return all_results

    async def _ascrape_yielding(self, url: str, parallel_processing: bool) -> AsyncGenerator[CrawlResult, None]:
        # Forward every result produced by the strategy's async generator
        # as soon as it is available.
        result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing)
        async for res in result_generator:  # Consume the async generator
            yield res  # Yielding individual results

    async def _ascrape_collecting(self, url: str, parallel_processing: bool) -> ScraperResult:
        # Accumulate every crawl result keyed by its URL, then wrap the
        # collection in a single ScraperResult.
        extracted_data = {}
        result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing)
        async for res in result_generator:  # Consume the async generator
            extracted_data[res.url] = res
        # Return a final ScraperResult
        return ScraperResult(
            url=url,
            crawled_urls=list(extracted_data.keys()),
            extracted_data=extracted_data
        )

View File

@@ -1,7 +1,6 @@
from .scraper_strategy import ScraperStrategy
from .filters import FilterChain
from .scorers import URLScorer
from .models import ScraperResult
from ..models import CrawlResult
from ..async_webcrawler import AsyncWebCrawler
import asyncio
@@ -13,7 +12,7 @@ from aiolimiter import AsyncLimiter
from tenacity import retry, stop_after_attempt, wait_exponential
from collections import defaultdict
import logging
from typing import Dict
from typing import Dict, AsyncGenerator
logging.basicConfig(level=logging.DEBUG)
rate_limiter = AsyncLimiter(1, 1) # 1 request per second
@@ -38,7 +37,12 @@ class BFSScraperStrategy(ScraperStrategy):
if netloc not in self.robot_parsers:
rp = RobotFileParser()
rp.set_url(f"{scheme}://{netloc}/robots.txt")
rp.read()
try:
rp.read()
except Exception as e:
# Log the type of error, message, and the URL
logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}")
return None
self.robot_parsers[netloc] = rp
return self.robot_parsers[netloc]
@@ -48,7 +52,7 @@ class BFSScraperStrategy(ScraperStrategy):
async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult:
    # Thin wrapper around crawler.arun(url); presumably wrapped by a tenacity
    # @retry decorator just above this diff hunk — TODO confirm.
    return await crawler.arun(url)
async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> CrawlResult:
async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]:
def normalize_url(url: str) -> str:
parsed = urlparse(url)
return urlunparse(parsed._replace(fragment=""))
@@ -59,9 +63,14 @@ class BFSScraperStrategy(ScraperStrategy):
return None
# Robots.txt Compliance
if not self.get_robot_parser(url).can_fetch(crawler.crawler_strategy.user_agent, url):
logging.info(f"Skipping {url} as per robots.txt")
return None
robot_parser = self.get_robot_parser(url)
if robot_parser is None:
logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.")
else:
# If robots.txt was fetched, check if crawling is allowed
if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url):
logging.info(f"Skipping {url} as per robots.txt")
return None
# Crawl Politeness
domain = urlparse(url).netloc
@@ -103,14 +112,12 @@ class BFSScraperStrategy(ScraperStrategy):
score = self.url_scorer.score(normalized_link)
await queue.put((score, new_depth, normalized_link))
depths[normalized_link] = new_depth
return crawl_result
async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> ScraperResult:
async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> CrawlResult:
queue = asyncio.PriorityQueue()
queue.put_nowait((0, 0, start_url))
visited = set()
extracted_data = {}
depths = {start_url: 0}
while not queue.empty():
@@ -124,12 +131,10 @@ class BFSScraperStrategy(ScraperStrategy):
else:
result = await self.process_url(url, depth, crawler, queue, visited, depths)
if result:
extracted_data[result.url] = result
yield result
if parallel_processing and tasks:
results = await asyncio.gather(*tasks)
for result in results:
if result:
extracted_data[result.url] = result
return ScraperResult(url=start_url, crawled_urls=list(visited), extracted_data=extracted_data)
yield result

View File

@@ -1,9 +1,26 @@
from abc import ABC, abstractmethod
from .models import ScraperResult
from .models import ScraperResult, CrawlResult
from ..models import CrawlResult
from ..async_webcrawler import AsyncWebCrawler
from typing import Union, AsyncGenerator
class ScraperStrategy(ABC):
    """Abstract interface every scraping strategy must implement."""

    @abstractmethod
    # NOTE(review): the diff shows the old signature directly above the new
    # one; only the second ``async def ascrape`` is the current interface.
    async def ascrape(self, url: str, crawler: AsyncWebCrawler) -> ScraperResult:
    async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool=True, yield_results: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
        """Scrape the given URL using the specified crawler.

        Args:
            url (str): The starting URL for the scrape.
            crawler (AsyncWebCrawler): The web crawler instance.
            parallel_processing (bool): Whether to use parallel processing. Defaults to True.
            yield_results (bool): If True, yields individual crawl results as they are ready;
                if False, accumulates results and returns a final ScraperResult.

        Yields:
            CrawlResult: Individual crawl results if yield_results is True.

        Returns:
            ScraperResult: A summary of the scrape results containing the final extracted data
                and the list of crawled URLs if yield_results is False.
        """
        pass