diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py
index 3a6d09a5..eb7f4cd8 100644
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -14,7 +14,7 @@ from collections import defaultdict
 from .models import CrawlResult
 from .filters import FilterChain
 from .scorers import URLScorer
-from ..async_webcrawler import AsyncWebCrawler
+from ..async_webcrawler import AsyncWebCrawler, CrawlerRunConfig
 from .scraper_strategy import ScraperStrategy
 
 @dataclass
@@ -116,7 +116,8 @@ class BFSScraperStrategy(ScraperStrategy):
     ) -> CrawlResult:
         """Crawl URL with retry logic"""
         try:
-            return await asyncio.wait_for(crawler.arun(url), timeout=self.timeout)
+            crawler_config = CrawlerRunConfig(cache_mode="BYPASS")
+            return await asyncio.wait_for(crawler.arun(url, config=crawler_config), timeout=self.timeout)
         except asyncio.TimeoutError:
             self.logger.error(f"Timeout crawling {url}")
             raise
diff --git a/docs/scraper/scraper_quickstart.py b/docs/scraper/scraper_quickstart.py
index 9e380cf5..f6100e51 100644
--- a/docs/scraper/scraper_quickstart.py
+++ b/docs/scraper/scraper_quickstart.py
@@ -6,9 +6,11 @@ from crawl4ai.scraper import (
     URLPatternFilter,
     ContentTypeFilter
 )
-from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
 import re
 
+browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
+
 async def basic_scraper_example():
     """
     Basic example: Scrape a blog site for articles
@@ -34,7 +36,7 @@
     )
 
     # Create the crawler and scraper
-    async with AsyncWebCrawler(verbose=True) as crawler:
+    async with AsyncWebCrawler(config=browser_config,verbose=True) as crawler:
         scraper = AsyncWebScraper(crawler, strategy)
         # Start scraping
         try:
@@ -118,12 +120,12 @@ async def advanced_scraper_example():
         max_depth=2,
         filter_chain=filter_chain,
         url_scorer=scorer,
-        max_concurrent=5,
+        max_concurrent=2,
         min_crawl_delay=1
     )
 
     # Create crawler and scraper
-    async with AsyncWebCrawler(verbose=True) as crawler:
+    async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
         scraper = AsyncWebScraper(crawler, strategy)
 
         # Track statistics