fix: Added browser config and crawler run config from 0.4.22

This commit is contained in:
Aravind Karnam
2024-12-18 10:33:09 +05:30
parent 7c0fa269a6
commit 7a5f83b76f
2 changed files with 9 additions and 6 deletions

View File

@@ -14,7 +14,7 @@ from collections import defaultdict
 from .models import CrawlResult
 from .filters import FilterChain
 from .scorers import URLScorer
-from ..async_webcrawler import AsyncWebCrawler
+from ..async_webcrawler import AsyncWebCrawler, CrawlerRunConfig
 from .scraper_strategy import ScraperStrategy

 @dataclass
@@ -116,7 +116,8 @@ class BFSScraperStrategy(ScraperStrategy):
     ) -> CrawlResult:
         """Crawl URL with retry logic"""
         try:
-            return await asyncio.wait_for(crawler.arun(url), timeout=self.timeout)
+            crawler_config = CrawlerRunConfig(cache_mode="BYPASS")
+            return await asyncio.wait_for(crawler.arun(url, config=crawler_config), timeout=self.timeout)
         except asyncio.TimeoutError:
             self.logger.error(f"Timeout crawling {url}")
             raise

View File

@@ -6,9 +6,11 @@ from crawl4ai.scraper import (
     URLPatternFilter,
     ContentTypeFilter
 )
-from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
 import re

+browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
+
 async def basic_scraper_example():
     """
     Basic example: Scrape a blog site for articles
@@ -34,7 +36,7 @@ async def basic_scraper_example():
     )

     # Create the crawler and scraper
-    async with AsyncWebCrawler(verbose=True) as crawler:
+    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
         scraper = AsyncWebScraper(crawler, strategy)

         # Start scraping
         try:
@@ -118,12 +120,12 @@ async def advanced_scraper_example():
         max_depth=2,
         filter_chain=filter_chain,
         url_scorer=scorer,
-        max_concurrent=5,
+        max_concurrent=2,
         min_crawl_delay=1
     )

     # Create crawler and scraper
-    async with AsyncWebCrawler(verbose=True) as crawler:
+    async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
         scraper = AsyncWebScraper(crawler, strategy)

         # Track statistics