From 9530ded83a5fbad05f376e2bcb09cd6ae08cd79c Mon Sep 17 00:00:00 2001
From: Aravind Karnam
Date: Tue, 26 Nov 2024 17:05:54 +0530
Subject: [PATCH] fixed the final scraper_quickstart.py example

---
 docs/scraper/scraper_quickstart.py | 107 ++++++++++++++---------------
 1 file changed, 53 insertions(+), 54 deletions(-)

diff --git a/docs/scraper/scraper_quickstart.py b/docs/scraper/scraper_quickstart.py
index 811f997e..d92124f2 100644
--- a/docs/scraper/scraper_quickstart.py
+++ b/docs/scraper/scraper_quickstart.py
@@ -19,7 +19,7 @@ async def basic_scraper_example():
     # Create a simple filter chain
     filter_chain = FilterChain([
         # Only crawl pages within the blog section
-        # URLPatternFilter("*/tutorial/*"),
+        URLPatternFilter("*/tutorial/*"),
         # Only process HTML pages
         ContentTypeFilter(["text/html"])
     ])
@@ -29,7 +29,8 @@
         max_depth=2,  # Only go 2 levels deep
         filter_chain=filter_chain,
         url_scorer=None,  # Use default scoring
-        max_concurrent=3  # Limit concurrent requests
+        max_concurrent=3,  # Limit concurrent requests
+        process_external_links=True
     )
 
     # Create the crawler and scraper
@@ -79,8 +80,8 @@ async def advanced_scraper_example():
     filter_chain = FilterChain([
         # Domain control
         DomainFilter(
-            allowed_domains=["example.com", "blog.example.com"],
-            blocked_domains=["ads.example.com", "tracker.example.com"]
+            allowed_domains=["techcrunch.com"],
+            blocked_domains=["login.techcrunch.com","legal.yahoo.com"]
         ),
         # URL patterns
         URLPatternFilter([
@@ -114,7 +115,7 @@
 
     # Initialize strategy with advanced configuration
     strategy = BFSScraperStrategy(
-        max_depth=4,
+        max_depth=2,
         filter_chain=filter_chain,
         url_scorer=scorer,
         max_concurrent=5,
@@ -122,63 +123,61 @@
     )
 
     # Create crawler and scraper
-    crawler = AsyncWebCrawler()
-    scraper = AsyncWebScraper(crawler, strategy)
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        scraper = AsyncWebScraper(crawler, strategy)
 
-    # Track statistics
-    stats = {
-        'processed': 0,
-        'errors': 0,
-        'total_size': 0
-    }
+        # Track statistics
+        stats = {
+            'processed': 0,
+            'errors': 0,
+            'total_size': 0
+        }
 
-    try:
-        # Use streaming mode
-        async for result in scraper.ascrape("https://example.com/news/", stream=True):
-            stats['processed'] += 1
-
-            if result.success:
-                stats['total_size'] += len(result.html)
-                logger.info(f"Processed: {result.url}")
+        try:
+            # Use streaming mode
+            result_generator = await scraper.ascrape("https://techcrunch.com", parallel_processing=True, stream=True)
+            async for result in result_generator:
+                stats['processed'] += 1
 
-                # Print scoring information
-                for scorer_name, score in result.scores.items():
-                    logger.debug(f"{scorer_name}: {score:.2f}")
-            else:
-                stats['errors'] += 1
-                logger.error(f"Failed to process {result.url}: {result.error_message}")
+                if result.success:
+                    stats['total_size'] += len(result.html)
+                    logger.info(f"Processed: {result.url}")
+                else:
+                    stats['errors'] += 1
+                    logger.error(f"Failed to process {result.url}: {result.error_message}")
 
-            # Log progress regularly
-            if stats['processed'] % 10 == 0:
-                logger.info(f"Progress: {stats['processed']} URLs processed")
+                # Log progress regularly
+                if stats['processed'] % 10 == 0:
+                    logger.info(f"Progress: {stats['processed']} URLs processed")
 
-    except Exception as e:
-        logger.error(f"Scraping error: {e}")
-
-    finally:
-        # Print final statistics
-        logger.info("Scraping completed:")
-        logger.info(f"- URLs processed: {stats['processed']}")
-        logger.info(f"- Errors: {stats['errors']}")
-        logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
+        except Exception as e:
+            logger.error(f"Scraping error: {e}")
 
-        # Print filter statistics
-        for filter_ in filter_chain.filters:
-            logger.info(f"{filter_.name} stats:")
-            logger.info(f"- Passed: {filter_.stats.passed_urls}")
-            logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
-
-        # Print scorer statistics
-        logger.info("Scoring statistics:")
-        logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
-        logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
+        finally:
+            # Print final statistics
+            logger.info("Scraping completed:")
+            logger.info(f"- URLs processed: {stats['processed']}")
+            logger.info(f"- Errors: {stats['errors']}")
+            logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
+
+            # Print filter statistics
+            for filter_ in filter_chain.filters:
+                logger.info(f"{filter_.name} stats:")
+                logger.info(f"- Passed: {filter_.stats.passed_urls}")
+                logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
+
+            # Print scorer statistics
+            logger.info("Scoring statistics:")
+            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
+            logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
 
 if __name__ == "__main__":
     import asyncio
 
     # Run basic example
-    print("Running basic scraper example...")
-    asyncio.run(basic_scraper_example())
-
-    # print("\nRunning advanced scraper example...")
-    # asyncio.run(advanced_scraper_example())
\ No newline at end of file
+    # print("Running basic scraper example...")
+    # asyncio.run(basic_scraper_example())
+
+    # Run advanced example
+    print("\nRunning advanced scraper example...")
+    asyncio.run(advanced_scraper_example())
\ No newline at end of file