feat: change input params to scraper, Add asynchronous context manager to AsyncWebScraper, Optimise filter application

This commit is contained in:
Aravind Karnam
2025-01-27 18:13:33 +05:30
parent bb6450f458
commit 0ff95c83bc
4 changed files with 104 additions and 75 deletions

View File

@@ -1,10 +1,11 @@
from typing import Union, AsyncGenerator, Optional from typing import Union, AsyncGenerator, Optional
from .scraper_strategy import ScraperStrategy from .scraper_strategy import ScraperStrategy
from .models import ScraperResult, CrawlResult from .models import ScraperResult, CrawlResult
from ..async_webcrawler import AsyncWebCrawler from ..async_configs import BrowserConfig, CrawlerRunConfig
import logging import logging
from dataclasses import dataclass from dataclasses import dataclass
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from contextlib import AbstractAsyncContextManager
@dataclass @dataclass
@@ -16,28 +17,38 @@ class ScrapingProgress:
current_url: Optional[str] = None current_url: Optional[str] = None
class AsyncWebScraper: class AsyncWebScraper(AbstractAsyncContextManager):
""" """
A high-level web scraper that combines an async crawler with a scraping strategy. A high-level web scraper that combines an async crawler with a scraping strategy.
Args: Args:
crawler (AsyncWebCrawler): The async web crawler implementation crawler_config (CrawlerRunConfig): Configuration for the crawler run
browser_config (BrowserConfig): Configuration for the browser
strategy (ScraperStrategy): The scraping strategy to use strategy (ScraperStrategy): The scraping strategy to use
logger (Optional[logging.Logger]): Custom logger for the scraper logger (Optional[logging.Logger]): Custom logger for the scraper
""" """
async def __aenter__(self):
# Initialize resources, if any
self.logger.info("Starting the async web scraper.")
return self
def __init__( def __init__(
self, self,
crawler: AsyncWebCrawler, crawler_config: CrawlerRunConfig,
browser_config: BrowserConfig,
strategy: ScraperStrategy, strategy: ScraperStrategy,
logger: Optional[logging.Logger] = None, logger: Optional[logging.Logger] = None,
): ):
if not isinstance(crawler, AsyncWebCrawler): if not isinstance(browser_config, BrowserConfig):
raise TypeError("crawler must be an instance of AsyncWebCrawler") raise TypeError("browser_config must be an instance of BrowserConfig")
if not isinstance(crawler_config, CrawlerRunConfig):
raise TypeError("crawler must be an instance of CrawlerRunConfig")
if not isinstance(strategy, ScraperStrategy): if not isinstance(strategy, ScraperStrategy):
raise TypeError("strategy must be an instance of ScraperStrategy") raise TypeError("strategy must be an instance of ScraperStrategy")
self.crawler = crawler self.crawler_config = crawler_config
self.browser_config = browser_config
self.strategy = strategy self.strategy = strategy
self.logger = logger or logging.getLogger(__name__) self.logger = logger or logging.getLogger(__name__)
self._progress = ScrapingProgress() self._progress = ScrapingProgress()
@@ -83,7 +94,9 @@ class AsyncWebScraper:
) -> AsyncGenerator[CrawlResult, None]: ) -> AsyncGenerator[CrawlResult, None]:
"""Stream scraping results as they become available.""" """Stream scraping results as they become available."""
try: try:
result_generator = self.strategy.ascrape(url, self.crawler) result_generator = self.strategy.ascrape(
url, self.crawler_config, self.browser_config
)
async for res in result_generator: async for res in result_generator:
self._progress.processed_urls += 1 self._progress.processed_urls += 1
self._progress.current_url = res.url self._progress.current_url = res.url
@@ -100,7 +113,9 @@ class AsyncWebScraper:
extracted_data = {} extracted_data = {}
try: try:
result_generator = self.strategy.ascrape(url, self.crawler) result_generator = self.strategy.ascrape(
url, self.crawler_config, self.browser_config
)
async for res in result_generator: async for res in result_generator:
self._progress.processed_urls += 1 self._progress.processed_urls += 1
self._progress.current_url = res.url self._progress.current_url = res.url
@@ -118,3 +133,11 @@ class AsyncWebScraper:
except Exception as e: except Exception as e:
self.logger.error(f"Error in collecting scrape: {str(e)}") self.logger.error(f"Error in collecting scrape: {str(e)}")
raise raise
async def __aexit__(self, exc_type, exc_val, exc_tb):
# Cleanup resources or tasks
await self.close() # Assuming you have a close method to cleanup
async def close(self):
# Perform cleanup tasks
pass

View File

@@ -5,13 +5,11 @@ import asyncio
import logging import logging
from urllib.parse import urlparse from urllib.parse import urlparse
# import validators from ..async_webcrawler import AsyncWebCrawler
from ..async_configs import BrowserConfig, CrawlerRunConfig
from ..async_configs import CrawlerRunConfig
from .models import CrawlResult from .models import CrawlResult
from .filters import FilterChain from .filters import FilterChain
from .scorers import URLScorer from .scorers import URLScorer
from ..async_webcrawler import AsyncWebCrawler
from .scraper_strategy import ScraperStrategy from .scraper_strategy import ScraperStrategy
from ..config import SCRAPER_BATCH_SIZE from ..config import SCRAPER_BATCH_SIZE
@@ -99,28 +97,26 @@ class BFSScraperStrategy(ScraperStrategy):
links_to_process += result.links["external"] links_to_process += result.links["external"]
for link in links_to_process: for link in links_to_process:
url = link["href"] url = link["href"]
if not await self.can_process_url(url, depth): if url in visited:
continue
new_depth = depths[source_url] + 1
if new_depth > self.max_depth:
continue
if not await self.can_process_url(url, new_depth):
self.stats.urls_skipped += 1 self.stats.urls_skipped += 1
continue continue
if url not in visited: score = self.url_scorer.score(url) if self.url_scorer else 0
new_depth = depths[source_url] + 1 await queue.put((score, new_depth, url))
if new_depth <= self.max_depth: depths[url] = new_depth
if self.url_scorer: self.stats.total_depth_reached = max(
score = self.url_scorer.score(url) self.stats.total_depth_reached, new_depth
else: )
# When no url_scorer is provided all urls will have same score of 0.
# Therefore will be process in FIFO order as per URL depth
score = 0
await queue.put((score, new_depth, url))
depths[url] = new_depth
self.stats.total_depth_reached = max(
self.stats.total_depth_reached, new_depth
)
async def ascrape( async def ascrape(
self, self,
start_url: str, start_url: str,
crawler: AsyncWebCrawler, crawler_config: CrawlerRunConfig,
browser_config: BrowserConfig,
) -> AsyncGenerator[CrawlResult, None]: ) -> AsyncGenerator[CrawlResult, None]:
"""Implement BFS crawling strategy""" """Implement BFS crawling strategy"""
@@ -164,34 +160,39 @@ class BFSScraperStrategy(ScraperStrategy):
if active_crawls: if active_crawls:
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
continue continue
# Process batch # Process batch
crawler_config = CrawlerRunConfig(cache_mode="BYPASS", stream=True) async with AsyncWebCrawler(
try: config=browser_config,
async for result in await crawler.arun_many( verbose=True,
urls=[url for _, _, url in jobs], config=crawler_config ) as crawler:
): try:
source_url, depth = next( async for result in await crawler.arun_many(
(url, depth) for _, depth, url in jobs if url == result.url urls=[url for _, _, url in jobs],
) config=crawler_config.clone(stream=True),
active_crawls.remove(source_url) # Remove from active set ):
source_url, depth = next(
(url, depth)
for _, depth, url in jobs
if url == result.url
)
active_crawls.remove(source_url) # Remove from active set
if result.success: if result.success:
await self._process_links( await self._process_links(
result, source_url, depth, queue, visited, depths result, source_url, depth, queue, visited, depths
) )
yield result yield result
else: else:
self.logger.warning( self.logger.warning(
f"Failed to crawl {result.url}: {result.error_message}" f"Failed to crawl {result.url}: {result.error_message}"
) )
except Exception as e: except Exception as e:
# Remove failed URLs from active set # Remove failed URLs from active set
for _, _, url in jobs: for _, _, url in jobs:
active_crawls.discard(url) active_crawls.discard(url)
self.logger.error(f"Batch processing error: {e}") self.logger.error(f"Batch processing error: {e}")
# Continue processing other batches # Continue processing other batches
continue continue
except Exception as e: except Exception as e:
self.logger.error(f"Error in crawl process: {e}") self.logger.error(f"Error in crawl process: {e}")
@@ -199,6 +200,7 @@ class BFSScraperStrategy(ScraperStrategy):
finally: finally:
self.stats.end_time = datetime.now() self.stats.end_time = datetime.now()
await crawler.close()
async def shutdown(self): async def shutdown(self):
"""Clean up resources and stop crawling""" """Clean up resources and stop crawling"""

View File

@@ -1,23 +1,23 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from .models import ScraperResult, CrawlResult from .models import ScraperResult, CrawlResult
from ..models import CrawlResult from ..models import CrawlResult
from ..async_webcrawler import AsyncWebCrawler from ..async_configs import BrowserConfig, CrawlerRunConfig
from typing import Union, AsyncGenerator from typing import Union, AsyncGenerator
class ScraperStrategy(ABC): class ScraperStrategy(ABC):
@abstractmethod @abstractmethod
async def ascrape( async def ascrape(
self, self,
url: str, url: str,
crawler: AsyncWebCrawler, crawler_config: CrawlerRunConfig,
browser_config: BrowserConfig,
stream: bool = False, stream: bool = False,
) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
"""Scrape the given URL using the specified crawler. """Scrape the given URL using the specified crawler.
Args: Args:
url (str): The starting URL for the scrape. url (str): The starting URL for the scrape.
crawler (AsyncWebCrawler): The web crawler instance. crawler_config (CrawlerRunConfig): Configuration for the crawler run.
browser_config (BrowserConfig): Configuration for the browser.
stream (bool): If True, yields individual crawl results as they are ready; stream (bool): If True, yields individual crawl results as they are ready;
if False, accumulates results and returns a final ScraperResult. if False, accumulates results and returns a final ScraperResult.

View File

@@ -1,4 +1,5 @@
# basic_scraper_example.py # basic_scraper_example.py
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.scraper import ( from crawl4ai.scraper import (
AsyncWebScraper, AsyncWebScraper,
BFSScraperStrategy, BFSScraperStrategy,
@@ -24,14 +25,14 @@ async def basic_scraper_example():
filter_chain = FilterChain( filter_chain = FilterChain(
[ [
# Only crawl pages within the blog section # Only crawl pages within the blog section
URLPatternFilter("*/tutorial/*"), URLPatternFilter("*/basic/*"),
# Only process HTML pages # Only process HTML pages
ContentTypeFilter(["text/html"]), ContentTypeFilter(["text/html"]),
] ]
) )
# Initialize the strategy with basic configuration # Initialize the strategy with basic configuration
strategy = BFSScraperStrategy( bfs_strategy = BFSScraperStrategy(
max_depth=2, # Only go 2 levels deep max_depth=2, # Only go 2 levels deep
filter_chain=filter_chain, filter_chain=filter_chain,
url_scorer=None, # Use default scoring url_scorer=None, # Use default scoring
@@ -39,8 +40,11 @@ async def basic_scraper_example():
) )
# Create the crawler and scraper # Create the crawler and scraper
async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler: async with AsyncWebScraper(
scraper = AsyncWebScraper(crawler, strategy) crawler_config=CrawlerRunConfig(bypass_cache=True),
browser_config=browser_config,
strategy=bfs_strategy,
) as scraper:
# Start scraping # Start scraping
try: try:
result = await scraper.ascrape("https://crawl4ai.com/mkdocs") result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
@@ -69,7 +73,6 @@ from crawl4ai.scraper import (
FreshnessScorer, FreshnessScorer,
CompositeScorer, CompositeScorer,
) )
from crawl4ai.async_webcrawler import AsyncWebCrawler
async def advanced_scraper_example(): async def advanced_scraper_example():
@@ -121,13 +124,14 @@ async def advanced_scraper_example():
) )
# Initialize strategy with advanced configuration # Initialize strategy with advanced configuration
strategy = BFSScraperStrategy( bfs_strategy = BFSScraperStrategy(
max_depth=2, filter_chain=filter_chain, url_scorer=scorer max_depth=2, filter_chain=filter_chain, url_scorer=scorer
) )
# Create crawler and scraper # Create crawler and scraper
async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler: async with AsyncWebScraper(crawler_config=CrawlerRunConfig(bypass_cache=True),
scraper = AsyncWebScraper(crawler, strategy) browser_config=browser_config,
strategy=bfs_strategy) as scraper:
# Track statistics # Track statistics
stats = {"processed": 0, "errors": 0, "total_size": 0} stats = {"processed": 0, "errors": 0, "total_size": 0}
@@ -182,12 +186,12 @@ if __name__ == "__main__":
import time import time
# Run basic example # Run basic example
start_time = time.perf_counter() # start_time = time.perf_counter()
print("Running basic scraper example...") # print("Running basic scraper example...")
asyncio.run(basic_scraper_example()) # asyncio.run(basic_scraper_example())
end_time = time.perf_counter() # end_time = time.perf_counter()
print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds") # print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")
# # Run advanced example # # Run advanced example
# print("\nRunning advanced scraper example...") print("\nRunning advanced scraper example...")
# asyncio.run(advanced_scraper_example()) asyncio.run(advanced_scraper_example())