# basic_scraper_example.py from crawl4ai.scraper import ( AsyncWebScraper, BFSScraperStrategy, FilterChain, URLPatternFilter, ContentTypeFilter, ) from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig import re import time browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600) async def basic_scraper_example(): """ Basic example: Scrape a blog site for articles - Crawls only HTML pages - Stays within the blog section - Collects all results at once """ # Create a simple filter chain filter_chain = FilterChain( [ # Only crawl pages within the blog section URLPatternFilter("*/tutorial/*"), # Only process HTML pages ContentTypeFilter(["text/html"]), ] ) # Initialize the strategy with basic configuration strategy = BFSScraperStrategy( max_depth=2, # Only go 2 levels deep filter_chain=filter_chain, url_scorer=None, # Use default scoring process_external_links=True, ) # Create the crawler and scraper async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler: scraper = AsyncWebScraper(crawler, strategy) # Start scraping try: result = await scraper.ascrape("https://crawl4ai.com/mkdocs") # Process results print(f"Crawled {len(result.crawled_urls)} pages:") for url, data in result.extracted_data.items(): print(f"- {url}: {len(data.html)} bytes") except Exception as e: print(f"Error during scraping: {e}") # advanced_scraper_example.py import logging from crawl4ai.scraper import ( AsyncWebScraper, BFSScraperStrategy, FilterChain, URLPatternFilter, ContentTypeFilter, DomainFilter, KeywordRelevanceScorer, PathDepthScorer, FreshnessScorer, CompositeScorer, ) from crawl4ai.async_webcrawler import AsyncWebCrawler async def advanced_scraper_example(): """ Advanced example: Intelligent news site scraping - Uses all filter types - Implements sophisticated scoring - Streams results - Includes monitoring and logging """ # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger("advanced_scraper") # Create sophisticated filter chain filter_chain = FilterChain( [ # Domain control DomainFilter( allowed_domains=["techcrunch.com"], blocked_domains=["login.techcrunch.com", "legal.yahoo.com"], ), # URL patterns URLPatternFilter( [ "*/article/*", "*/news/*", "*/blog/*", re.compile(r"\d{4}/\d{2}/.*"), # Date-based URLs ] ), # Content types ContentTypeFilter(["text/html", "application/xhtml+xml"]), ] ) # Create composite scorer scorer = CompositeScorer( [ # Prioritize by keywords KeywordRelevanceScorer( keywords=["news", "breaking", "update", "latest"], weight=1.0 ), # Prefer optimal URL structure PathDepthScorer(optimal_depth=3, weight=0.7), # Prioritize fresh content FreshnessScorer(weight=0.9), ] ) # Initialize strategy with advanced configuration strategy = BFSScraperStrategy( max_depth=2, filter_chain=filter_chain, url_scorer=scorer ) # Create crawler and scraper async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler: scraper = AsyncWebScraper(crawler, strategy) # Track statistics stats = {"processed": 0, "errors": 0, "total_size": 0} try: # Use streaming mode result_generator = await scraper.ascrape( "https://techcrunch.com", stream=True ) async for result in result_generator: stats["processed"] += 1 if result.success: stats["total_size"] += len(result.html) logger.info(f"Processed: {result.url}") else: stats["errors"] += 1 logger.error( f"Failed to process {result.url}: {result.error_message}" ) # Log progress regularly if stats["processed"] % 10 == 0: logger.info(f"Progress: {stats['processed']} URLs processed") except Exception as e: logger.error(f"Scraping error: {e}") finally: # Print final statistics logger.info("Scraping completed:") logger.info(f"- URLs processed: {stats['processed']}") logger.info(f"- Errors: {stats['errors']}") logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB") # Print filter statistics for filter_ in filter_chain.filters: logger.info(f"{filter_.name} stats:") logger.info(f"- Passed: {filter_.stats.passed_urls}") logger.info(f"- Rejected: {filter_.stats.rejected_urls}") # Print scorer statistics logger.info("Scoring statistics:") logger.info(f"- Average score: {scorer.stats.average_score:.2f}") logger.info( f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}" ) if __name__ == "__main__": import asyncio import time # Run basic example start_time = time.perf_counter() print("Running basic scraper example...") asyncio.run(basic_scraper_example()) end_time = time.perf_counter() print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds") # # Run advanced example # print("\nRunning advanced scraper example...") # asyncio.run(advanced_scraper_example())