fixed the final scraper_quickstart.py example

Aravind Karnam
2024-11-26 17:05:54 +05:30
parent 155c756238
commit 9530ded83a


@@ -19,7 +19,7 @@ async def basic_scraper_example():
     # Create a simple filter chain
     filter_chain = FilterChain([
         # Only crawl pages within the blog section
-        # URLPatternFilter("*/tutorial/*"),
+        URLPatternFilter("*/tutorial/*"),
         # Only process HTML pages
         ContentTypeFilter(["text/html"])
     ])
@@ -29,7 +29,8 @@ async def basic_scraper_example():
         max_depth=2,  # Only go 2 levels deep
         filter_chain=filter_chain,
         url_scorer=None,  # Use default scoring
-        max_concurrent=3  # Limit concurrent requests
+        max_concurrent=3,  # Limit concurrent requests
+        process_external_links=True
     )

     # Create the crawler and scraper
@@ -79,8 +80,8 @@ async def advanced_scraper_example():
     filter_chain = FilterChain([
         # Domain control
         DomainFilter(
-            allowed_domains=["example.com", "blog.example.com"],
-            blocked_domains=["ads.example.com", "tracker.example.com"]
+            allowed_domains=["techcrunch.com"],
+            blocked_domains=["login.techcrunch.com","legal.yahoo.com"]
         ),
         # URL patterns
         URLPatternFilter([
@@ -114,7 +115,7 @@ async def advanced_scraper_example():
     # Initialize strategy with advanced configuration
     strategy = BFSScraperStrategy(
-        max_depth=4,
+        max_depth=2,
         filter_chain=filter_chain,
         url_scorer=scorer,
         max_concurrent=5,
@@ -122,63 +123,61 @@ async def advanced_scraper_example():
     )
     # Create crawler and scraper
-    crawler = AsyncWebCrawler()
-    scraper = AsyncWebScraper(crawler, strategy)
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        scraper = AsyncWebScraper(crawler, strategy)

-    # Track statistics
-    stats = {
-        'processed': 0,
-        'errors': 0,
-        'total_size': 0
-    }
+        # Track statistics
+        stats = {
+            'processed': 0,
+            'errors': 0,
+            'total_size': 0
+        }

-    try:
-        # Use streaming mode
-        async for result in scraper.ascrape("https://example.com/news/", stream=True):
-            stats['processed'] += 1
-            if result.success:
-                stats['total_size'] += len(result.html)
-                logger.info(f"Processed: {result.url}")
-                # Print scoring information
-                for scorer_name, score in result.scores.items():
-                    logger.debug(f"{scorer_name}: {score:.2f}")
-            else:
-                stats['errors'] += 1
-                logger.error(f"Failed to process {result.url}: {result.error_message}")
-            # Log progress regularly
-            if stats['processed'] % 10 == 0:
-                logger.info(f"Progress: {stats['processed']} URLs processed")
-    except Exception as e:
-        logger.error(f"Scraping error: {e}")
-    finally:
-        # Print final statistics
-        logger.info("Scraping completed:")
-        logger.info(f"- URLs processed: {stats['processed']}")
-        logger.info(f"- Errors: {stats['errors']}")
-        logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
-        # Print filter statistics
-        for filter_ in filter_chain.filters:
-            logger.info(f"{filter_.name} stats:")
-            logger.info(f"- Passed: {filter_.stats.passed_urls}")
-            logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
-        # Print scorer statistics
-        logger.info("Scoring statistics:")
-        logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
-        logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
+        try:
+            # Use streaming mode
+            result_generator = await scraper.ascrape("https://techcrunch.com", parallel_processing=True, stream=True)
+            async for result in result_generator:
+                stats['processed'] += 1
+                if result.success:
+                    stats['total_size'] += len(result.html)
+                    logger.info(f"Processed: {result.url}")
+                else:
+                    stats['errors'] += 1
+                    logger.error(f"Failed to process {result.url}: {result.error_message}")
+                # Log progress regularly
+                if stats['processed'] % 10 == 0:
+                    logger.info(f"Progress: {stats['processed']} URLs processed")
+        except Exception as e:
+            logger.error(f"Scraping error: {e}")
+        finally:
+            # Print final statistics
+            logger.info("Scraping completed:")
+            logger.info(f"- URLs processed: {stats['processed']}")
+            logger.info(f"- Errors: {stats['errors']}")
+            logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
+            # Print filter statistics
+            for filter_ in filter_chain.filters:
+                logger.info(f"{filter_.name} stats:")
+                logger.info(f"- Passed: {filter_.stats.passed_urls}")
+                logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
+            # Print scorer statistics
+            logger.info("Scoring statistics:")
+            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
+            logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")

 if __name__ == "__main__":
     import asyncio
     # Run basic example
-    print("Running basic scraper example...")
-    asyncio.run(basic_scraper_example())
-    # print("\nRunning advanced scraper example...")
-    # asyncio.run(advanced_scraper_example())
+    # print("Running basic scraper example...")
+    # asyncio.run(basic_scraper_example())
+    # Run advanced example
+    print("\nRunning advanced scraper example...")
+    asyncio.run(advanced_scraper_example())
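
For reference, a condensed, self-contained sketch of how the pieces touched by this diff fit together when run on their own. The constructor arguments, ascrape() call, and result fields mirror the diff above; the crawl4ai import paths and the logging setup are assumptions for illustration and may need adjusting to the actual package layout. The scorer is left as None (default scoring), as the basic example allows.

# Minimal sketch of the updated advanced example.
# NOTE: the import paths below are assumptions; adjust to the real package layout.
import asyncio
import logging

from crawl4ai import AsyncWebCrawler                # assumed import path
from crawl4ai.scraper import (                      # assumed import path
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    DomainFilter,
    ContentTypeFilter,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def main():
    # Same filter setup the diff switches to: stay on techcrunch.com,
    # skip the login/legal subdomains, and only process HTML pages.
    filter_chain = FilterChain([
        DomainFilter(
            allowed_domains=["techcrunch.com"],
            blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
        ),
        ContentTypeFilter(["text/html"]),
    ])

    strategy = BFSScraperStrategy(
        max_depth=2,          # the diff lowers this from 4 to 2
        filter_chain=filter_chain,
        url_scorer=None,      # default scoring, as in the basic example
        max_concurrent=5,
    )

    # The diff moves the crawler into an async context manager.
    async with AsyncWebCrawler(verbose=True) as crawler:
        scraper = AsyncWebScraper(crawler, strategy)
        result_generator = await scraper.ascrape(
            "https://techcrunch.com", parallel_processing=True, stream=True
        )
        async for result in result_generator:
            if result.success:
                logger.info(f"Processed: {result.url} ({len(result.html)} bytes)")
            else:
                logger.error(f"Failed: {result.url}: {result.error_message}")


if __name__ == "__main__":
    asyncio.run(main())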