fix: Added browser config and crawler run config from 0.4.22
@@ -14,7 +14,7 @@ from collections import defaultdict
 from .models import CrawlResult
 from .filters import FilterChain
 from .scorers import URLScorer
-from ..async_webcrawler import AsyncWebCrawler
+from ..async_webcrawler import AsyncWebCrawler, CrawlerRunConfig
 from .scraper_strategy import ScraperStrategy
 
 @dataclass
@@ -116,7 +116,8 @@ class BFSScraperStrategy(ScraperStrategy):
     ) -> CrawlResult:
         """Crawl URL with retry logic"""
         try:
-            return await asyncio.wait_for(crawler.arun(url), timeout=self.timeout)
+            crawler_config = CrawlerRunConfig(cache_mode="BYPASS")
+            return await asyncio.wait_for(crawler.arun(url, config=crawler_config), timeout=self.timeout)
         except asyncio.TimeoutError:
             self.logger.error(f"Timeout crawling {url}")
             raise
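The retry path above now builds a fresh CrawlerRunConfig per attempt, so a retried URL bypasses the cache instead of replaying a stale result. A minimal standalone sketch of that pattern, assuming the import path shown in the diff; the default timeout value and the print() stand-in for the strategy's logger are illustrative only:

import asyncio

from crawl4ai.async_webcrawler import AsyncWebCrawler, CrawlerRunConfig


async def crawl_with_timeout(crawler: AsyncWebCrawler, url: str, timeout: float = 30.0):
    # Bypass the cache so every attempt fetches the live page.
    crawler_config = CrawlerRunConfig(cache_mode="BYPASS")
    try:
        return await asyncio.wait_for(crawler.arun(url, config=crawler_config), timeout=timeout)
    except asyncio.TimeoutError:
        print(f"Timeout crawling {url}")  # the strategy uses self.logger.error here
        raise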
@@ -6,9 +6,11 @@ from crawl4ai.scraper import (
     URLPatternFilter,
     ContentTypeFilter
 )
-from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
 import re
 
+browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
+
 async def basic_scraper_example():
     """
     Basic example: Scrape a blog site for articles
@@ -34,7 +36,7 @@ async def basic_scraper_example():
     )
 
     # Create the crawler and scraper
-    async with AsyncWebCrawler(verbose=True) as crawler:
+    async with AsyncWebCrawler(config=browser_config,verbose=True) as crawler:
         scraper = AsyncWebScraper(crawler, strategy)
         # Start scraping
         try:
@@ -118,12 +120,12 @@ async def advanced_scraper_example():
         max_depth=2,
         filter_chain=filter_chain,
         url_scorer=scorer,
-        max_concurrent=5,
+        max_concurrent=2,
         min_crawl_delay=1
     )
 
     # Create crawler and scraper
-    async with AsyncWebCrawler(verbose=True) as crawler:
+    async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
         scraper = AsyncWebScraper(crawler, strategy)
 
         # Track statistics
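Taken together, the example scripts now split configuration across two levels: BrowserConfig fixes launch-time browser settings once, while CrawlerRunConfig tunes each arun() call. A hedged end-to-end sketch under the same assumptions as above (import path as in the diff, example.com as a placeholder URL, the final print as illustrative output):

import asyncio

from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

# Same browser-level settings as the examples in this commit: headless, small viewport.
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)


async def main():
    # Per-run settings: skip the cache so the page is fetched fresh.
    run_config = CrawlerRunConfig(cache_mode="BYPASS")
    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
        result = await crawler.arun("https://example.com", config=run_config)
        print(result.success)


if __name__ == "__main__":
    asyncio.run(main())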