feat: create ScraperPageResult model to attach score and depth attributes to yielded/returned crawl results

This commit is contained in:
Aravind Karnam
2025-01-28 16:47:30 +05:30
parent 60ce8bbf55
commit 78223bc847
5 changed files with 56 additions and 37 deletions

View File

@@ -1,6 +1,6 @@
from typing import Union, AsyncGenerator, Optional
from .scraper_strategy import ScraperStrategy
from .models import ScraperResult, CrawlResult
from .models import ScraperResult, CrawlResult, ScraperPageResult
from ..async_configs import BrowserConfig, CrawlerRunConfig
import logging
from dataclasses import dataclass
@@ -35,17 +35,23 @@ class AsyncWebScraper(AbstractAsyncContextManager):
def __init__(
self,
crawler_config: CrawlerRunConfig,
browser_config: BrowserConfig,
strategy: ScraperStrategy,
crawler_config: Optional[CrawlerRunConfig] = None,
browser_config: Optional[BrowserConfig] = None,
logger: Optional[logging.Logger] = None,
):
if not isinstance(browser_config, BrowserConfig):
raise TypeError("browser_config must be an instance of BrowserConfig")
if not isinstance(crawler_config, CrawlerRunConfig):
raise TypeError("crawler must be an instance of CrawlerRunConfig")
if not isinstance(strategy, ScraperStrategy):
raise TypeError("strategy must be an instance of ScraperStrategy")
if browser_config is not None and not isinstance(browser_config, BrowserConfig):
raise TypeError(
"browser_config must be None or an instance of BrowserConfig"
)
if crawler_config is not None and not isinstance(
crawler_config, CrawlerRunConfig
):
raise TypeError(
"crawler_config must be None or an instance of CrawlerRunConfig"
)
self.crawler_config = crawler_config
self.browser_config = browser_config
@@ -70,7 +76,7 @@ class AsyncWebScraper(AbstractAsyncContextManager):
async def ascrape(
self, url: str, stream: bool = False
) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
) -> Union[AsyncGenerator[ScraperPageResult, None], ScraperResult]:
"""
Scrape a website starting from the given URL.
@@ -82,7 +88,6 @@ class AsyncWebScraper(AbstractAsyncContextManager):
Either an async generator yielding ScraperPageResults or a final ScraperResult
"""
self._progress = ScrapingProgress() # Reset progress
async with self._error_handling_context(url):
if stream:
return self._ascrape_yielding(url)
@@ -91,16 +96,16 @@ class AsyncWebScraper(AbstractAsyncContextManager):
async def _ascrape_yielding(
self,
url: str,
) -> AsyncGenerator[CrawlResult, None]:
) -> AsyncGenerator[ScraperPageResult, None]:
"""Stream scraping results as they become available."""
try:
result_generator = self.strategy.ascrape(
url, self.crawler_config, self.browser_config
)
async for res in result_generator:
async for page_result in result_generator:
self._progress.processed_urls += 1
self._progress.current_url = res.url
yield res
self._progress.current_url = page_result.result.url
yield page_result
except Exception as e:
self.logger.error(f"Error in streaming scrape: {str(e)}")
raise
@@ -117,9 +122,10 @@ class AsyncWebScraper(AbstractAsyncContextManager):
url, self.crawler_config, self.browser_config
)
async for res in result_generator:
url = res.result.url
self._progress.processed_urls += 1
self._progress.current_url = res.url
extracted_data[res.url] = res
self._progress.current_url = url
extracted_data[url] = res
return ScraperResult(
url=url,

View File

@@ -7,7 +7,7 @@ from urllib.parse import urlparse
from ..async_webcrawler import AsyncWebCrawler
from ..async_configs import BrowserConfig, CrawlerRunConfig
from .models import CrawlResult
from .models import CrawlResult, ScraperPageResult
from .filters import FilterChain
from .scorers import URLScorer
from .scraper_strategy import ScraperStrategy
@@ -46,7 +46,6 @@ class BFSScraperStrategy(ScraperStrategy):
self.stats = CrawlStats(start_time=datetime.now())
self._cancel_event = asyncio.Event()
self.process_external_links = process_external_links
self._active_crawls_lock = asyncio.Lock()
async def can_process_url(self, url: str, depth: int) -> bool:
"""Check if URL can be processed based on filters
@@ -117,8 +116,8 @@ class BFSScraperStrategy(ScraperStrategy):
async def ascrape(
self,
start_url: str,
crawler_config: CrawlerRunConfig,
browser_config: BrowserConfig,
crawler_config: Optional[CrawlerRunConfig] = None,
browser_config: Optional[BrowserConfig] = None,
) -> AsyncGenerator[CrawlResult, None]:
"""Implement BFS crawling strategy"""
@@ -137,6 +136,11 @@ class BFSScraperStrategy(ScraperStrategy):
visited: Set[str] = set()
depths = {start_url: 0}
active_crawls = {} # Track URLs currently being processed with depth and score
active_crawls_lock = asyncio.Lock() # Create the lock within the same event loop
# Update crawler_config to stream back results to scraper
crawler_config = crawler_config.clone(stream=True) if crawler_config else CrawlerRunConfig(stream=True)
async with AsyncWebCrawler(
config=browser_config,
verbose=True,
@@ -152,7 +156,7 @@ class BFSScraperStrategy(ScraperStrategy):
- Can be interrupted via cancellation (not self._cancel_event.is_set())
"""
# Collect batch of URLs into active_crawls to process
async with self._active_crawls_lock:
async with active_crawls_lock:
while len(active_crawls) < SCRAPER_BATCH_SIZE and not queue.empty():
score, depth, url = await queue.get()
active_crawls[url] = {"depth": depth, "score": score}
@@ -170,14 +174,19 @@ class BFSScraperStrategy(ScraperStrategy):
):
source_url = result.url
depth = active_crawls[source_url]["depth"]
async with self._active_crawls_lock:
score=active_crawls[source_url]["score"]
async with active_crawls_lock:
active_crawls.pop(source_url, None)
if result.success:
await self._process_links(
result, source_url, queue, visited, depths
)
yield result
yield ScraperPageResult(
result = result,
depth=depth,
score=score,
)
else:
self.logger.warning(
f"Failed to crawl {result.url}: {result.error_message}"

View File

@@ -2,8 +2,11 @@ from pydantic import BaseModel
from typing import List, Dict
from ..models import CrawlResult
class ScraperPageResult(BaseModel):
    """Per-page crawl result enriched with BFS crawl metadata.

    Wraps a single CrawlResult together with the depth and score that the
    scraper strategy tracked for the page's URL while it was queued
    (see the active_crawls bookkeeping in BFSScraperStrategy.ascrape).
    Instances are what the scraper yields in streaming mode and what
    ScraperResult.extracted_data stores in batch mode.
    """

    # The underlying crawl output for this page.
    result: CrawlResult
    # BFS depth at which the page was crawled (start URL is depth 0).
    depth: int
    # Priority score assigned to the URL when it was enqueued —
    # presumably produced by the strategy's URLScorer; confirm against scorers.
    score: float
class ScraperResult(BaseModel):
url: str
crawled_urls: List[str]
extracted_data: Dict[str, CrawlResult]
extracted_data: Dict[str, ScraperPageResult]

View File

@@ -1,6 +1,5 @@
from abc import ABC, abstractmethod
from .models import ScraperResult, CrawlResult
from ..models import CrawlResult
from .models import ScraperResult, ScraperPageResult
from ..async_configs import BrowserConfig, CrawlerRunConfig
from typing import Union, AsyncGenerator
class ScraperStrategy(ABC):
@@ -11,7 +10,7 @@ class ScraperStrategy(ABC):
crawler_config: CrawlerRunConfig,
browser_config: BrowserConfig,
stream: bool = False,
) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
) -> Union[AsyncGenerator[ScraperPageResult, None], ScraperResult]:
"""Scrape the given URL using the specified crawler.
Args:
@@ -22,7 +21,7 @@ class ScraperStrategy(ABC):
if False, accumulates results and returns a final ScraperResult.
Yields:
CrawlResult: Individual crawl results if stream is True.
ScraperPageResult: Individual page results if stream is True.
Returns:
ScraperResult: A summary of the scrape results containing the final extracted data

View File

@@ -1,5 +1,6 @@
# basic_scraper_example.py
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.scraper import (
AsyncWebScraper,
BFSScraperStrategy,
@@ -7,7 +8,7 @@ from crawl4ai.scraper import (
URLPatternFilter,
ContentTypeFilter,
)
from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
from crawl4ai.async_webcrawler import BrowserConfig
import re
import time
@@ -41,8 +42,6 @@ async def basic_scraper_example():
# Create the crawler and scraper
async with AsyncWebScraper(
crawler_config=CrawlerRunConfig(bypass_cache=True),
browser_config=browser_config,
strategy=bfs_strategy,
) as scraper:
# Start scraping
@@ -51,8 +50,8 @@ async def basic_scraper_example():
# Process results
print(f"Crawled {len(result.crawled_urls)} pages:")
for url, data in result.extracted_data.items():
print(f"- {url}: {len(data.html)} bytes")
for url, page_result in result.extracted_data.items():
print(f"- {url}: {len(page_result.result.html)} bytes")
except Exception as e:
print(f"Error during scraping: {e}")
@@ -130,9 +129,9 @@ async def advanced_scraper_example():
# Create crawler and scraper
async with AsyncWebScraper(
crawler_config=CrawlerRunConfig(bypass_cache=True),
browser_config=browser_config,
strategy=bfs_strategy,
crawler_config=CrawlerRunConfig(bypass_cache=True, scraping_strategy=LXMLWebScrapingStrategy(),),
browser_config=browser_config,
) as scraper:
# Track statistics
@@ -143,12 +142,15 @@ async def advanced_scraper_example():
result_generator = await scraper.ascrape(
"https://techcrunch.com", stream=True
)
async for result in result_generator:
async for page_result in result_generator:
result = page_result.result
score = page_result.score
depth = page_result.depth
stats["processed"] += 1
if result.success:
stats["total_size"] += len(result.html)
logger.info(f"Processed: {result.url}")
logger.info(f"Processed at depth: {depth} with score: {score:.3f} : \n {result.url}")
else:
stats["errors"] += 1
logger.error(