feat: change input params to scraper, Add asynchronous context manager to AsyncWebScraper, Optimise filter application

This commit is contained in:
Aravind Karnam
2025-01-27 18:13:33 +05:30
parent bb6450f458
commit 0ff95c83bc
4 changed files with 104 additions and 75 deletions

View File

@@ -1,10 +1,11 @@
from typing import Union, AsyncGenerator, Optional
from .scraper_strategy import ScraperStrategy
from .models import ScraperResult, CrawlResult
from ..async_webcrawler import AsyncWebCrawler
from ..async_configs import BrowserConfig, CrawlerRunConfig
import logging
from dataclasses import dataclass
from contextlib import asynccontextmanager
from contextlib import AbstractAsyncContextManager
@dataclass
@@ -16,28 +17,38 @@ class ScrapingProgress:
current_url: Optional[str] = None
class AsyncWebScraper:
class AsyncWebScraper(AbstractAsyncContextManager):
"""
A high-level web scraper that combines an async crawler with a scraping strategy.
Args:
crawler (AsyncWebCrawler): The async web crawler implementation
crawler_config (CrawlerRunConfig): Configuration for the crawler run
browser_config (BrowserConfig): Configuration for the browser
strategy (ScraperStrategy): The scraping strategy to use
logger (Optional[logging.Logger]): Custom logger for the scraper
"""
async def __aenter__(self):
# Initialize resources, if any
self.logger.info("Starting the async web scraper.")
return self
def __init__(
    self,
    crawler: AsyncWebCrawler,
    crawler_config: CrawlerRunConfig,
    browser_config: BrowserConfig,
    strategy: ScraperStrategy,
    logger: Optional[logging.Logger] = None,
):
    """Validate and store the scraper's collaborators.

    Args:
        crawler: The async web crawler implementation.
        crawler_config: Configuration for the crawler run.
        browser_config: Configuration for the browser.
        strategy: The scraping strategy to use.
        logger: Custom logger for the scraper. When omitted (or falsy),
            ``logging.getLogger(__name__)`` is used instead.

    Raises:
        TypeError: If ``crawler``, ``browser_config``, ``crawler_config``
            or ``strategy`` is not an instance of its expected class.
    """
    if not isinstance(crawler, AsyncWebCrawler):
        raise TypeError("crawler must be an instance of AsyncWebCrawler")
    if not isinstance(browser_config, BrowserConfig):
        raise TypeError("browser_config must be an instance of BrowserConfig")
    if not isinstance(crawler_config, CrawlerRunConfig):
        # The message names the parameter that actually failed the check
        # (it previously said "crawler", pointing users at the wrong argument).
        raise TypeError("crawler_config must be an instance of CrawlerRunConfig")
    if not isinstance(strategy, ScraperStrategy):
        raise TypeError("strategy must be an instance of ScraperStrategy")

    self.crawler = crawler
    self.crawler_config = crawler_config
    self.browser_config = browser_config
    self.strategy = strategy
    self.logger = logger or logging.getLogger(__name__)
    # Fresh progress tracker for this scraper instance.
    self._progress = ScrapingProgress()
@@ -83,7 +94,9 @@ class AsyncWebScraper:
) -> AsyncGenerator[CrawlResult, None]:
"""Stream scraping results as they become available."""
try:
result_generator = self.strategy.ascrape(url, self.crawler)
result_generator = self.strategy.ascrape(
url, self.crawler_config, self.browser_config
)
async for res in result_generator:
self._progress.processed_urls += 1
self._progress.current_url = res.url
@@ -100,7 +113,9 @@ class AsyncWebScraper:
extracted_data = {}
try:
result_generator = self.strategy.ascrape(url, self.crawler)
result_generator = self.strategy.ascrape(
url, self.crawler_config, self.browser_config
)
async for res in result_generator:
self._progress.processed_urls += 1
self._progress.current_url = res.url
@@ -118,3 +133,11 @@ class AsyncWebScraper:
except Exception as e:
self.logger.error(f"Error in collecting scrape: {str(e)}")
raise
async def __aexit__(self, exc_type, exc_val, exc_tb):
# Cleanup resources or tasks
await self.close() # Assuming you have a close method to cleanup
async def close(self):
# Perform cleanup tasks
pass

View File

@@ -5,13 +5,11 @@ import asyncio
import logging
from urllib.parse import urlparse
# import validators
from ..async_configs import CrawlerRunConfig
from ..async_webcrawler import AsyncWebCrawler
from ..async_configs import BrowserConfig, CrawlerRunConfig
from .models import CrawlResult
from .filters import FilterChain
from .scorers import URLScorer
from ..async_webcrawler import AsyncWebCrawler
from .scraper_strategy import ScraperStrategy
from ..config import SCRAPER_BATCH_SIZE
@@ -99,28 +97,26 @@ class BFSScraperStrategy(ScraperStrategy):
links_to_process += result.links["external"]
for link in links_to_process:
url = link["href"]
if not await self.can_process_url(url, depth):
if url in visited:
continue
new_depth = depths[source_url] + 1
if new_depth > self.max_depth:
continue
if not await self.can_process_url(url, new_depth):
self.stats.urls_skipped += 1
continue
if url not in visited:
new_depth = depths[source_url] + 1
if new_depth <= self.max_depth:
if self.url_scorer:
score = self.url_scorer.score(url)
else:
# When no url_scorer is provided all urls will have same score of 0.
# Therefore will be process in FIFO order as per URL depth
score = 0
await queue.put((score, new_depth, url))
depths[url] = new_depth
self.stats.total_depth_reached = max(
self.stats.total_depth_reached, new_depth
)
score = self.url_scorer.score(url) if self.url_scorer else 0
await queue.put((score, new_depth, url))
depths[url] = new_depth
self.stats.total_depth_reached = max(
self.stats.total_depth_reached, new_depth
)
async def ascrape(
self,
start_url: str,
crawler: AsyncWebCrawler,
crawler_config: CrawlerRunConfig,
browser_config: BrowserConfig,
) -> AsyncGenerator[CrawlResult, None]:
"""Implement BFS crawling strategy"""
@@ -164,34 +160,39 @@ class BFSScraperStrategy(ScraperStrategy):
if active_crawls:
await asyncio.sleep(0.1)
continue
# Process batch
crawler_config = CrawlerRunConfig(cache_mode="BYPASS", stream=True)
try:
async for result in await crawler.arun_many(
urls=[url for _, _, url in jobs], config=crawler_config
):
source_url, depth = next(
(url, depth) for _, depth, url in jobs if url == result.url
)
active_crawls.remove(source_url) # Remove from active set
async with AsyncWebCrawler(
config=browser_config,
verbose=True,
) as crawler:
try:
async for result in await crawler.arun_many(
urls=[url for _, _, url in jobs],
config=crawler_config.clone(stream=True),
):
source_url, depth = next(
(url, depth)
for _, depth, url in jobs
if url == result.url
)
active_crawls.remove(source_url) # Remove from active set
if result.success:
await self._process_links(
result, source_url, depth, queue, visited, depths
)
yield result
else:
self.logger.warning(
f"Failed to crawl {result.url}: {result.error_message}"
)
except Exception as e:
# Remove failed URLs from active set
for _, _, url in jobs:
active_crawls.discard(url)
self.logger.error(f"Batch processing error: {e}")
# Continue processing other batches
continue
if result.success:
await self._process_links(
result, source_url, depth, queue, visited, depths
)
yield result
else:
self.logger.warning(
f"Failed to crawl {result.url}: {result.error_message}"
)
except Exception as e:
# Remove failed URLs from active set
for _, _, url in jobs:
active_crawls.discard(url)
self.logger.error(f"Batch processing error: {e}")
# Continue processing other batches
continue
except Exception as e:
self.logger.error(f"Error in crawl process: {e}")
@@ -199,6 +200,7 @@ class BFSScraperStrategy(ScraperStrategy):
finally:
self.stats.end_time = datetime.now()
await crawler.close()
async def shutdown(self):
"""Clean up resources and stop crawling"""

View File

@@ -1,23 +1,23 @@
from abc import ABC, abstractmethod
from .models import ScraperResult, CrawlResult
from ..models import CrawlResult
from ..async_webcrawler import AsyncWebCrawler
from ..async_configs import BrowserConfig, CrawlerRunConfig
from typing import Union, AsyncGenerator
class ScraperStrategy(ABC):
@abstractmethod
async def ascrape(
self,
url: str,
crawler: AsyncWebCrawler,
crawler_config: CrawlerRunConfig,
browser_config: BrowserConfig,
stream: bool = False,
) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
"""Scrape the given URL using the specified crawler.
Args:
url (str): The starting URL for the scrape.
crawler (AsyncWebCrawler): The web crawler instance.
crawler_config (CrawlerRunConfig): Configuration for the crawler run.
browser_config (BrowserConfig): Configuration for the browser.
stream (bool): If True, yields individual crawl results as they are ready;
if False, accumulates results and returns a final ScraperResult.

View File

@@ -1,4 +1,5 @@
# basic_scraper_example.py
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.scraper import (
AsyncWebScraper,
BFSScraperStrategy,
@@ -24,14 +25,14 @@ async def basic_scraper_example():
filter_chain = FilterChain(
[
# Only crawl pages within the blog section
URLPatternFilter("*/tutorial/*"),
URLPatternFilter("*/basic/*"),
# Only process HTML pages
ContentTypeFilter(["text/html"]),
]
)
# Initialize the strategy with basic configuration
strategy = BFSScraperStrategy(
bfs_strategy = BFSScraperStrategy(
max_depth=2, # Only go 2 levels deep
filter_chain=filter_chain,
url_scorer=None, # Use default scoring
@@ -39,8 +40,11 @@ async def basic_scraper_example():
)
# Create the crawler and scraper
async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
scraper = AsyncWebScraper(crawler, strategy)
async with AsyncWebScraper(
crawler_config=CrawlerRunConfig(bypass_cache=True),
browser_config=browser_config,
strategy=bfs_strategy,
) as scraper:
# Start scraping
try:
result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
@@ -69,7 +73,6 @@ from crawl4ai.scraper import (
FreshnessScorer,
CompositeScorer,
)
from crawl4ai.async_webcrawler import AsyncWebCrawler
async def advanced_scraper_example():
@@ -121,13 +124,14 @@ async def advanced_scraper_example():
)
# Initialize strategy with advanced configuration
strategy = BFSScraperStrategy(
bfs_strategy = BFSScraperStrategy(
max_depth=2, filter_chain=filter_chain, url_scorer=scorer
)
# Create crawler and scraper
async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
scraper = AsyncWebScraper(crawler, strategy)
async with AsyncWebScraper(crawler_config=CrawlerRunConfig(bypass_cache=True),
browser_config=browser_config,
strategy=bfs_strategy) as scraper:
# Track statistics
stats = {"processed": 0, "errors": 0, "total_size": 0}
@@ -182,12 +186,12 @@ if __name__ == "__main__":
import time
# Run basic example
start_time = time.perf_counter()
print("Running basic scraper example...")
asyncio.run(basic_scraper_example())
end_time = time.perf_counter()
print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")
# start_time = time.perf_counter()
# print("Running basic scraper example...")
# asyncio.run(basic_scraper_example())
# end_time = time.perf_counter()
# print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")
# # Run advanced example
# print("\nRunning advanced scraper example...")
# asyncio.run(advanced_scraper_example())
print("\nRunning advanced scraper example...")
asyncio.run(advanced_scraper_example())