diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py index a5710306..9e21a4e7 100644 --- a/crawl4ai/scraper/async_web_scraper.py +++ b/crawl4ai/scraper/async_web_scraper.py @@ -1,6 +1,6 @@ from typing import Union, AsyncGenerator, Optional from .scraper_strategy import ScraperStrategy -from .models import ScraperResult, CrawlResult +from .models import ScraperResult, CrawlResult, ScraperPageResult from ..async_configs import BrowserConfig, CrawlerRunConfig import logging from dataclasses import dataclass @@ -35,17 +35,23 @@ class AsyncWebScraper(AbstractAsyncContextManager): def __init__( self, - crawler_config: CrawlerRunConfig, - browser_config: BrowserConfig, strategy: ScraperStrategy, + crawler_config: Optional[CrawlerRunConfig] = None, + browser_config: Optional[BrowserConfig] = None, logger: Optional[logging.Logger] = None, ): - if not isinstance(browser_config, BrowserConfig): - raise TypeError("browser_config must be an instance of BrowserConfig") - if not isinstance(crawler_config, CrawlerRunConfig): - raise TypeError("crawler must be an instance of CrawlerRunConfig") if not isinstance(strategy, ScraperStrategy): raise TypeError("strategy must be an instance of ScraperStrategy") + if browser_config is not None and not isinstance(browser_config, BrowserConfig): + raise TypeError( + "browser_config must be None or an instance of BrowserConfig" + ) + if crawler_config is not None and not isinstance( + crawler_config, CrawlerRunConfig + ): + raise TypeError( + "crawler_config must be None or an instance of CrawlerRunConfig" + ) self.crawler_config = crawler_config self.browser_config = browser_config @@ -70,7 +76,7 @@ class AsyncWebScraper(AbstractAsyncContextManager): async def ascrape( self, url: str, stream: bool = False - ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + ) -> Union[AsyncGenerator[ScraperPageResult, None], ScraperResult]: """ Scrape a website starting from the given URL. 
@@ -82,7 +88,6 @@ class AsyncWebScraper(AbstractAsyncContextManager): Either an async generator yielding CrawlResults or a final ScraperResult """ self._progress = ScrapingProgress() # Reset progress - async with self._error_handling_context(url): if stream: return self._ascrape_yielding(url) @@ -91,16 +96,16 @@ class AsyncWebScraper(AbstractAsyncContextManager): async def _ascrape_yielding( self, url: str, - ) -> AsyncGenerator[CrawlResult, None]: + ) -> AsyncGenerator[ScraperPageResult, None]: """Stream scraping results as they become available.""" try: result_generator = self.strategy.ascrape( url, self.crawler_config, self.browser_config ) - async for res in result_generator: + async for page_result in result_generator: self._progress.processed_urls += 1 - self._progress.current_url = res.url - yield res + self._progress.current_url = page_result.result.url + yield page_result except Exception as e: self.logger.error(f"Error in streaming scrape: {str(e)}") raise @@ -117,9 +122,10 @@ class AsyncWebScraper(AbstractAsyncContextManager): url, self.crawler_config, self.browser_config ) async for res in result_generator: + page_url = res.result.url self._progress.processed_urls += 1 - self._progress.current_url = res.url - extracted_data[res.url] = res + self._progress.current_url = page_url + extracted_data[page_url] = res return ScraperResult( url=url, diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 93100d1d..068583b9 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -7,7 +7,7 @@ from urllib.parse import urlparse from ..async_webcrawler import AsyncWebCrawler from ..async_configs import BrowserConfig, CrawlerRunConfig -from .models import CrawlResult +from .models import CrawlResult, ScraperPageResult from .filters import FilterChain from .scorers import URLScorer from .scraper_strategy import ScraperStrategy @@ -46,7 +46,6 @@ class BFSScraperStrategy(ScraperStrategy): 
self.stats = CrawlStats(start_time=datetime.now()) self._cancel_event = asyncio.Event() self.process_external_links = process_external_links - self._active_crawls_lock = asyncio.Lock() async def can_process_url(self, url: str, depth: int) -> bool: """Check if URL can be processed based on filters @@ -117,8 +116,8 @@ class BFSScraperStrategy(ScraperStrategy): async def ascrape( self, start_url: str, - crawler_config: CrawlerRunConfig, - browser_config: BrowserConfig, + crawler_config: Optional[CrawlerRunConfig] = None, + browser_config: Optional[BrowserConfig] = None, ) -> AsyncGenerator[CrawlResult, None]: """Implement BFS crawling strategy""" @@ -137,6 +136,11 @@ class BFSScraperStrategy(ScraperStrategy): visited: Set[str] = set() depths = {start_url: 0} active_crawls = {} # Track URLs currently being processed with depth and score + active_crawls_lock = asyncio.Lock() # Create the lock within the same event loop + + # Update crawler_config to stream back results to scraper + crawler_config = crawler_config.clone(stream=True) if crawler_config else CrawlerRunConfig(stream=True) + async with AsyncWebCrawler( config=browser_config, verbose=True, @@ -152,7 +156,7 @@ class BFSScraperStrategy(ScraperStrategy): - Can be interrupted via cancellation (not self._cancel_event.is_set()) """ # Collect batch of URLs into active_crawls to process - async with self._active_crawls_lock: + async with active_crawls_lock: while len(active_crawls) < SCRAPER_BATCH_SIZE and not queue.empty(): score, depth, url = await queue.get() active_crawls[url] = {"depth": depth, "score": score} @@ -170,14 +174,19 @@ class BFSScraperStrategy(ScraperStrategy): ): source_url = result.url depth = active_crawls[source_url]["depth"] - async with self._active_crawls_lock: + score=active_crawls[source_url]["score"] + async with active_crawls_lock: active_crawls.pop(source_url, None) if result.success: await self._process_links( result, source_url, queue, visited, depths ) - yield result + yield ScraperPageResult( + result = result, + depth=depth, + score=score, + ) else: self.logger.warning( f"Failed to crawl {result.url}: {result.error_message}" diff --git a/crawl4ai/scraper/models.py b/crawl4ai/scraper/models.py index c779f81a..7b5137a4 100644 --- a/crawl4ai/scraper/models.py +++ b/crawl4ai/scraper/models.py @@ -2,8 +2,11 @@ from pydantic import BaseModel from typing import List, Dict from ..models import CrawlResult - +class ScraperPageResult(BaseModel): + result: CrawlResult + depth: int + score: float class ScraperResult(BaseModel): url: str crawled_urls: List[str] - extracted_data: Dict[str, CrawlResult] + extracted_data: Dict[str, ScraperPageResult] diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py index 3015cbcd..74d82a3b 100644 --- a/crawl4ai/scraper/scraper_strategy.py +++ b/crawl4ai/scraper/scraper_strategy.py @@ -1,6 +1,5 @@ from abc import ABC, abstractmethod -from .models import ScraperResult, CrawlResult -from ..models import CrawlResult +from .models import ScraperResult, ScraperPageResult from ..async_configs import BrowserConfig, CrawlerRunConfig from typing import Union, AsyncGenerator class ScraperStrategy(ABC): @@ -11, 7 +10,7 @@ class ScraperStrategy(ABC): crawler_config: CrawlerRunConfig, browser_config: BrowserConfig, stream: bool = False, - ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + ) -> Union[AsyncGenerator[ScraperPageResult, None], ScraperResult]: """Scrape the given URL using the specified crawler. Args: @@ -22,7 +21,7 @@ if False, accumulates results and returns a final ScraperResult. Yields: - CrawlResult: Individual crawl results if stream is True. + ScraperPageResult: Individual page results if stream is True. 
Returns: ScraperResult: A summary of the scrape results containing the final extracted data diff --git a/docs/scraper/scraper_quickstart.py b/docs/scraper/scraper_quickstart.py index 749914e0..e9394e3c 100644 --- a/docs/scraper/scraper_quickstart.py +++ b/docs/scraper/scraper_quickstart.py @@ -1,5 +1,6 @@ # basic_scraper_example.py from crawl4ai.async_configs import CrawlerRunConfig +from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy from crawl4ai.scraper import ( AsyncWebScraper, BFSScraperStrategy, @@ -7,7 +8,7 @@ from crawl4ai.scraper import ( URLPatternFilter, ContentTypeFilter, ) -from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig +from crawl4ai.async_webcrawler import BrowserConfig import re import time @@ -41,8 +42,6 @@ async def basic_scraper_example(): # Create the crawler and scraper async with AsyncWebScraper( - crawler_config=CrawlerRunConfig(bypass_cache=True), - browser_config=browser_config, strategy=bfs_strategy, ) as scraper: # Start scraping @@ -51,8 +50,8 @@ async def basic_scraper_example(): # Process results print(f"Crawled {len(result.crawled_urls)} pages:") - for url, data in result.extracted_data.items(): - print(f"- {url}: {len(data.html)} bytes") + for url, page_result in result.extracted_data.items(): + print(f"- {url}: {len(page_result.result.html)} bytes") except Exception as e: print(f"Error during scraping: {e}") @@ -130,9 +129,9 @@ async def advanced_scraper_example(): # Create crawler and scraper async with AsyncWebScraper( - crawler_config=CrawlerRunConfig(bypass_cache=True), - browser_config=browser_config, strategy=bfs_strategy, + crawler_config=CrawlerRunConfig(bypass_cache=True, scraping_strategy=LXMLWebScrapingStrategy(),), + browser_config=browser_config, ) as scraper: # Track statistics @@ -143,12 +142,15 @@ async def advanced_scraper_example(): result_generator = await scraper.ascrape( "https://techcrunch.com", stream=True ) - async for result in result_generator: + async for page_result in result_generator: + result = page_result.result + score = page_result.score + depth = page_result.depth stats["processed"] += 1 if result.success: stats["total_size"] += len(result.html) - logger.info(f"Processed: {result.url}") + logger.info(f"Processed at depth: {depth} with score: {score:.3f} : \n {result.url}") else: stats["errors"] += 1 logger.error(