From 7a5f83b76f29a21ba8a0d1b66b822e8d04905f39 Mon Sep 17 00:00:00 2001
From: Aravind Karnam
Date: Wed, 18 Dec 2024 10:33:09 +0530
Subject: [PATCH] fix: add browser config and crawler run config from 0.4.22

---
 crawl4ai/scraper/bfs_scraper_strategy.py |  5 +++--
 docs/scraper/scraper_quickstart.py       | 10 ++++++----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py
index 3a6d09a5..eb7f4cd8 100644
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -14,7 +14,7 @@ from collections import defaultdict
 from .models import CrawlResult
 from .filters import FilterChain
 from .scorers import URLScorer
-from ..async_webcrawler import AsyncWebCrawler
+from ..async_webcrawler import AsyncWebCrawler, CacheMode, CrawlerRunConfig
 from .scraper_strategy import ScraperStrategy
 
 @dataclass
@@ -116,7 +116,8 @@ class BFSScraperStrategy(ScraperStrategy):
     ) -> CrawlResult:
         """Crawl URL with retry logic"""
         try:
-            return await asyncio.wait_for(crawler.arun(url), timeout=self.timeout)
+            crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            return await asyncio.wait_for(crawler.arun(url, config=crawler_config), timeout=self.timeout)
         except asyncio.TimeoutError:
             self.logger.error(f"Timeout crawling {url}")
             raise
diff --git a/docs/scraper/scraper_quickstart.py b/docs/scraper/scraper_quickstart.py
index 9e380cf5..f6100e51 100644
--- a/docs/scraper/scraper_quickstart.py
+++ b/docs/scraper/scraper_quickstart.py
@@ -6,9 +6,11 @@ from crawl4ai.scraper import (
     URLPatternFilter,
     ContentTypeFilter
 )
-from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
 import re
 
+browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
+
 async def basic_scraper_example():
     """
     Basic example: Scrape a blog site for articles
@@ -34,7 +36,7 @@ async def basic_scraper_example():
     )
 
     # Create the crawler and scraper
-    async with AsyncWebCrawler(verbose=True) as crawler:
+    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
         scraper = AsyncWebScraper(crawler, strategy)
         # Start scraping
         try:
@@ -118,12 +120,12 @@ async def advanced_scraper_example():
         max_depth=2,
         filter_chain=filter_chain,
         url_scorer=scorer,
-        max_concurrent=5,
+        max_concurrent=2,
         min_crawl_delay=1
     )
 
     # Create crawler and scraper
-    async with AsyncWebCrawler(verbose=True) as crawler:
+    async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
         scraper = AsyncWebScraper(crawler, strategy)
 
         # Track statistics
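
Usage reference: a minimal standalone sketch of how the two 0.4.22-style
config objects wired in by this patch fit together at runtime. It assumes
crawl4ai >= 0.4.22, which exports these names at the package top level;
https://example.com is a placeholder target.

    import asyncio

    from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig

    # Browser-level settings, mirroring the quickstart change: applied once,
    # when the browser session is created.
    browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)

    # Per-run settings, mirroring the strategy change: CacheMode.BYPASS forces
    # a fresh fetch instead of reading from or writing to the cache.
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async def main():
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url="https://example.com", config=run_config)
            print(result.success, len(result.markdown or ""))

    asyncio.run(main())

The split matters because BrowserConfig is fixed for the lifetime of the
crawler, while a different CrawlerRunConfig can be passed to each arun()
call, which is why the retry helper above builds its config per crawl rather
than at crawler construction.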