feat: create ScraperPageResult model to attach score and depth attributes to yielded/returned crawl results
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
from typing import Union, AsyncGenerator, Optional
|
||||
from .scraper_strategy import ScraperStrategy
|
||||
from .models import ScraperResult, CrawlResult
|
||||
from .models import ScraperResult, CrawlResult, ScraperPageResult
|
||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
@@ -35,17 +35,23 @@ class AsyncWebScraper(AbstractAsyncContextManager):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
crawler_config: CrawlerRunConfig,
|
||||
browser_config: BrowserConfig,
|
||||
strategy: ScraperStrategy,
|
||||
crawler_config: Optional[CrawlerRunConfig] = None,
|
||||
browser_config: Optional[BrowserConfig] = None,
|
||||
logger: Optional[logging.Logger] = None,
|
||||
):
|
||||
if not isinstance(browser_config, BrowserConfig):
|
||||
raise TypeError("browser_config must be an instance of BrowserConfig")
|
||||
if not isinstance(crawler_config, CrawlerRunConfig):
|
||||
raise TypeError("crawler must be an instance of CrawlerRunConfig")
|
||||
if not isinstance(strategy, ScraperStrategy):
|
||||
raise TypeError("strategy must be an instance of ScraperStrategy")
|
||||
if browser_config is not None and not isinstance(browser_config, BrowserConfig):
|
||||
raise TypeError(
|
||||
"browser_config must be None or an instance of BrowserConfig"
|
||||
)
|
||||
if crawler_config is not None and not isinstance(
|
||||
crawler_config, CrawlerRunConfig
|
||||
):
|
||||
raise TypeError(
|
||||
"crawler_config must be None or an instance of CrawlerRunConfig"
|
||||
)
|
||||
|
||||
self.crawler_config = crawler_config
|
||||
self.browser_config = browser_config
|
||||
@@ -70,7 +76,7 @@ class AsyncWebScraper(AbstractAsyncContextManager):
|
||||
|
||||
async def ascrape(
|
||||
self, url: str, stream: bool = False
|
||||
) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
|
||||
) -> Union[AsyncGenerator[ScraperPageResult, None], ScraperResult]:
|
||||
"""
|
||||
Scrape a website starting from the given URL.
|
||||
|
||||
@@ -82,7 +88,6 @@ class AsyncWebScraper(AbstractAsyncContextManager):
|
||||
Either an async generator yielding ScraperPageResults (each wrapping a CrawlResult with its depth and score) or a final ScraperResult
|
||||
"""
|
||||
self._progress = ScrapingProgress() # Reset progress
|
||||
|
||||
async with self._error_handling_context(url):
|
||||
if stream:
|
||||
return self._ascrape_yielding(url)
|
||||
@@ -91,16 +96,16 @@ class AsyncWebScraper(AbstractAsyncContextManager):
|
||||
async def _ascrape_yielding(
|
||||
self,
|
||||
url: str,
|
||||
) -> AsyncGenerator[CrawlResult, None]:
|
||||
) -> AsyncGenerator[ScraperPageResult, None]:
|
||||
"""Stream scraping results as they become available."""
|
||||
try:
|
||||
result_generator = self.strategy.ascrape(
|
||||
url, self.crawler_config, self.browser_config
|
||||
)
|
||||
async for res in result_generator:
|
||||
async for page_result in result_generator:
|
||||
self._progress.processed_urls += 1
|
||||
self._progress.current_url = res.url
|
||||
yield res
|
||||
self._progress.current_url = page_result.result.url
|
||||
yield page_result
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error in streaming scrape: {str(e)}")
|
||||
raise
|
||||
@@ -117,9 +122,10 @@ class AsyncWebScraper(AbstractAsyncContextManager):
|
||||
url, self.crawler_config, self.browser_config
|
||||
)
|
||||
async for res in result_generator:
|
||||
url = res.result.url
|
||||
self._progress.processed_urls += 1
|
||||
self._progress.current_url = res.url
|
||||
extracted_data[res.url] = res
|
||||
self._progress.current_url = url
|
||||
extracted_data[url] = res
|
||||
|
||||
return ScraperResult(
|
||||
url=url,
|
||||
|
||||
@@ -7,7 +7,7 @@ from urllib.parse import urlparse
|
||||
|
||||
from ..async_webcrawler import AsyncWebCrawler
|
||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .models import CrawlResult
|
||||
from .models import CrawlResult, ScraperPageResult
|
||||
from .filters import FilterChain
|
||||
from .scorers import URLScorer
|
||||
from .scraper_strategy import ScraperStrategy
|
||||
@@ -46,7 +46,6 @@ class BFSScraperStrategy(ScraperStrategy):
|
||||
self.stats = CrawlStats(start_time=datetime.now())
|
||||
self._cancel_event = asyncio.Event()
|
||||
self.process_external_links = process_external_links
|
||||
self._active_crawls_lock = asyncio.Lock()
|
||||
|
||||
async def can_process_url(self, url: str, depth: int) -> bool:
|
||||
"""Check if URL can be processed based on filters
|
||||
@@ -117,8 +116,8 @@ class BFSScraperStrategy(ScraperStrategy):
|
||||
async def ascrape(
|
||||
self,
|
||||
start_url: str,
|
||||
crawler_config: CrawlerRunConfig,
|
||||
browser_config: BrowserConfig,
|
||||
crawler_config: Optional[CrawlerRunConfig] = None,
|
||||
browser_config: Optional[BrowserConfig] = None,
|
||||
) -> AsyncGenerator[CrawlResult, None]:
|
||||
"""Implement BFS crawling strategy"""
|
||||
|
||||
@@ -137,6 +136,11 @@ class BFSScraperStrategy(ScraperStrategy):
|
||||
visited: Set[str] = set()
|
||||
depths = {start_url: 0}
|
||||
active_crawls = {} # Track URLs currently being processed with depth and score
|
||||
active_crawls_lock = asyncio.Lock() # Create the lock within the same event loop
|
||||
|
||||
# Update crawler_config to stream back results to scraper
|
||||
crawler_config = crawler_config.clone(stream=True) if crawler_config else CrawlerRunConfig(stream=True)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
config=browser_config,
|
||||
verbose=True,
|
||||
@@ -152,7 +156,7 @@ class BFSScraperStrategy(ScraperStrategy):
|
||||
- Can be interrupted via cancellation (not self._cancel_event.is_set())
|
||||
"""
|
||||
# Collect batch of URLs into active_crawls to process
|
||||
async with self._active_crawls_lock:
|
||||
async with active_crawls_lock:
|
||||
while len(active_crawls) < SCRAPER_BATCH_SIZE and not queue.empty():
|
||||
score, depth, url = await queue.get()
|
||||
active_crawls[url] = {"depth": depth, "score": score}
|
||||
@@ -170,14 +174,19 @@ class BFSScraperStrategy(ScraperStrategy):
|
||||
):
|
||||
source_url = result.url
|
||||
depth = active_crawls[source_url]["depth"]
|
||||
async with self._active_crawls_lock:
|
||||
score=active_crawls[source_url]["score"]
|
||||
async with active_crawls_lock:
|
||||
active_crawls.pop(source_url, None)
|
||||
|
||||
if result.success:
|
||||
await self._process_links(
|
||||
result, source_url, queue, visited, depths
|
||||
)
|
||||
yield result
|
||||
yield ScraperPageResult(
|
||||
result = result,
|
||||
depth=depth,
|
||||
score=score,
|
||||
)
|
||||
else:
|
||||
self.logger.warning(
|
||||
f"Failed to crawl {result.url}: {result.error_message}"
|
||||
|
||||
@@ -2,8 +2,11 @@ from pydantic import BaseModel
|
||||
from typing import List, Dict
|
||||
from ..models import CrawlResult
|
||||
|
||||
|
||||
class ScraperPageResult(BaseModel):
|
||||
result: CrawlResult
|
||||
depth: int
|
||||
score: float
|
||||
class ScraperResult(BaseModel):
|
||||
url: str
|
||||
crawled_urls: List[str]
|
||||
extracted_data: Dict[str, CrawlResult]
|
||||
extracted_data: Dict[str, ScraperPageResult]
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from .models import ScraperResult, CrawlResult
|
||||
from ..models import CrawlResult
|
||||
from .models import ScraperResult, ScraperPageResult
|
||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from typing import Union, AsyncGenerator
|
||||
class ScraperStrategy(ABC):
|
||||
@@ -11,7 +10,7 @@ class ScraperStrategy(ABC):
|
||||
crawler_config: CrawlerRunConfig,
|
||||
browser_config: BrowserConfig,
|
||||
stream: bool = False,
|
||||
) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
|
||||
) -> Union[AsyncGenerator[ScraperPageResult, None], ScraperResult]:
|
||||
"""Scrape the given URL using the specified crawler.
|
||||
|
||||
Args:
|
||||
@@ -22,7 +21,7 @@ class ScraperStrategy(ABC):
|
||||
if False, accumulates results and returns a final ScraperResult.
|
||||
|
||||
Yields:
|
||||
CrawlResult: Individual crawl results if stream is True.
|
||||
ScraperPageResult: Individual page results if stream is True.
|
||||
|
||||
Returns:
|
||||
ScraperResult: A summary of the scrape results containing the final extracted data
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# basic_scraper_example.py
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.scraper import (
|
||||
AsyncWebScraper,
|
||||
BFSScraperStrategy,
|
||||
@@ -7,7 +8,7 @@ from crawl4ai.scraper import (
|
||||
URLPatternFilter,
|
||||
ContentTypeFilter,
|
||||
)
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
|
||||
from crawl4ai.async_webcrawler import BrowserConfig
|
||||
import re
|
||||
import time
|
||||
|
||||
@@ -41,8 +42,6 @@ async def basic_scraper_example():
|
||||
|
||||
# Create the crawler and scraper
|
||||
async with AsyncWebScraper(
|
||||
crawler_config=CrawlerRunConfig(bypass_cache=True),
|
||||
browser_config=browser_config,
|
||||
strategy=bfs_strategy,
|
||||
) as scraper:
|
||||
# Start scraping
|
||||
@@ -51,8 +50,8 @@ async def basic_scraper_example():
|
||||
|
||||
# Process results
|
||||
print(f"Crawled {len(result.crawled_urls)} pages:")
|
||||
for url, data in result.extracted_data.items():
|
||||
print(f"- {url}: {len(data.html)} bytes")
|
||||
for url, page_result in result.extracted_data.items():
|
||||
print(f"- {url}: {len(page_result.result.html)} bytes")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during scraping: {e}")
|
||||
@@ -130,9 +129,9 @@ async def advanced_scraper_example():
|
||||
|
||||
# Create crawler and scraper
|
||||
async with AsyncWebScraper(
|
||||
crawler_config=CrawlerRunConfig(bypass_cache=True),
|
||||
browser_config=browser_config,
|
||||
strategy=bfs_strategy,
|
||||
crawler_config=CrawlerRunConfig(bypass_cache=True, scraping_strategy=LXMLWebScrapingStrategy(),),
|
||||
browser_config=browser_config,
|
||||
) as scraper:
|
||||
|
||||
# Track statistics
|
||||
@@ -143,12 +142,15 @@ async def advanced_scraper_example():
|
||||
result_generator = await scraper.ascrape(
|
||||
"https://techcrunch.com", stream=True
|
||||
)
|
||||
async for result in result_generator:
|
||||
async for page_result in result_generator:
|
||||
result = page_result.result
|
||||
score = page_result.score
|
||||
depth = page_result.depth
|
||||
stats["processed"] += 1
|
||||
|
||||
if result.success:
|
||||
stats["total_size"] += len(result.html)
|
||||
logger.info(f"Processed: {result.url}")
|
||||
logger.info(f"Processed at depth: {depth} with score: {score:.3f} : \n {result.url}")
|
||||
else:
|
||||
stats["errors"] += 1
|
||||
logger.error(
|
||||
|
||||
Reference in New Issue
Block a user