Refactor: Removed all scheduling logic from scraper. From now scraper expects arun_many to handle all scheduling. Scraper will only do traversal, validations, compliance checks, URL filtering and scoring etc. Reformatted some of the scraper files with Black code formatter

2025-01-21 17:49:51 +05:30
parent 26d78d8512
commit 67fa06c09b
9 changed files with 316 additions and 243 deletions
--- a/docs/scraper/scraper_quickstart.py
+++ b/docs/scraper/scraper_quickstart.py
@@ -4,13 +4,14 @@ from crawl4ai.scraper import (
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
-    ContentTypeFilter
+    ContentTypeFilter,
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
 import re

 browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)

+
 async def basic_scraper_example():
    """
    Basic example: Scrape a blog site for articles
@@ -19,37 +20,39 @@ async def basic_scraper_example():
    - Collects all results at once
    """
    # Create a simple filter chain
-    filter_chain = FilterChain([
-        # Only crawl pages within the blog section
-        URLPatternFilter("*/tutorial/*"),
-        # Only process HTML pages
-        ContentTypeFilter(["text/html"])
-    ])
+    filter_chain = FilterChain(
+        [
+            # Only crawl pages within the blog section
+            URLPatternFilter("*/tutorial/*"),
+            # Only process HTML pages
+            ContentTypeFilter(["text/html"]),
+        ]
+    )

    # Initialize the strategy with basic configuration
    strategy = BFSScraperStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
-        max_concurrent=3,  # Limit concurrent requests
-        process_external_links=True
+        process_external_links=True,
    )

    # Create the crawler and scraper
-    async with AsyncWebCrawler(config=browser_config,verbose=True) as crawler:
+    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
        scraper = AsyncWebScraper(crawler, strategy)
        # Start scraping
        try:
            result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
-            
+
            # Process results
            print(f"Crawled {len(result.crawled_urls)} pages:")
            for url, data in result.extracted_data.items():
                print(f"- {url}: {len(data.html)} bytes")
-            
+
        except Exception as e:
            print(f"Error during scraping: {e}")

+
 # advanced_scraper_example.py
 import logging
 from crawl4ai.scraper import (
@@ -62,10 +65,11 @@ from crawl4ai.scraper import (
    KeywordRelevanceScorer,
    PathDepthScorer,
    FreshnessScorer,
-    CompositeScorer
+    CompositeScorer,
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler

+
 async def advanced_scraper_example():
    """
    Advanced example: Intelligent news site scraping
@@ -79,49 +83,44 @@ async def advanced_scraper_example():
    logger = logging.getLogger("advanced_scraper")

    # Create sophisticated filter chain
-    filter_chain = FilterChain([
-        # Domain control
-        DomainFilter(
-            allowed_domains=["techcrunch.com"],
-            blocked_domains=["login.techcrunch.com","legal.yahoo.com"]
-        ),
-        # URL patterns
-        URLPatternFilter([
-            "*/article/*",
-            "*/news/*",
-            "*/blog/*",
-            re.compile(r"\d{4}/\d{2}/.*")  # Date-based URLs
-        ]),
-        # Content types
-        ContentTypeFilter([
-            "text/html",
-            "application/xhtml+xml"
-        ])
-    ])
+    filter_chain = FilterChain(
+        [
+            # Domain control
+            DomainFilter(
+                allowed_domains=["techcrunch.com"],
+                blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
+            ),
+            # URL patterns
+            URLPatternFilter(
+                [
+                    "*/article/*",
+                    "*/news/*",
+                    "*/blog/*",
+                    re.compile(r"\d{4}/\d{2}/.*"),  # Date-based URLs
+                ]
+            ),
+            # Content types
+            ContentTypeFilter(["text/html", "application/xhtml+xml"]),
+        ]
+    )

    # Create composite scorer
-    scorer = CompositeScorer([
-        # Prioritize by keywords
-        KeywordRelevanceScorer(
-            keywords=["news", "breaking", "update", "latest"],
-            weight=1.0
-        ),
-        # Prefer optimal URL structure
-        PathDepthScorer(
-            optimal_depth=3,
-            weight=0.7
-        ),
-        # Prioritize fresh content
-        FreshnessScorer(weight=0.9)
-    ])
+    scorer = CompositeScorer(
+        [
+            # Prioritize by keywords
+            KeywordRelevanceScorer(
+                keywords=["news", "breaking", "update", "latest"], weight=1.0
+            ),
+            # Prefer optimal URL structure
+            PathDepthScorer(optimal_depth=3, weight=0.7),
+            # Prioritize fresh content
+            FreshnessScorer(weight=0.9),
+        ]
+    )

    # Initialize strategy with advanced configuration
    strategy = BFSScraperStrategy(
-        max_depth=2,
-        filter_chain=filter_chain,
-        url_scorer=scorer,
-        max_concurrent=2,
-        min_crawl_delay=1
+        max_depth=2, filter_chain=filter_chain, url_scorer=scorer
    )

    # Create crawler and scraper
@@ -129,57 +128,60 @@ async def advanced_scraper_example():
        scraper = AsyncWebScraper(crawler, strategy)

        # Track statistics
-        stats = {
-            'processed': 0,
-            'errors': 0,
-            'total_size': 0
-        }
+        stats = {"processed": 0, "errors": 0, "total_size": 0}

        try:
            # Use streaming mode
-            result_generator = await scraper.ascrape("https://techcrunch.com", parallel_processing=True, stream=True)
+            result_generator = await scraper.ascrape(
+                "https://techcrunch.com", stream=True
+            )
            async for result in result_generator:
-                stats['processed'] += 1
-                
+                stats["processed"] += 1
+
                if result.success:
-                    stats['total_size'] += len(result.html)
+                    stats["total_size"] += len(result.html)
                    logger.info(f"Processed: {result.url}")
                else:
-                    stats['errors'] += 1
-                    logger.error(f"Failed to process {result.url}: {result.error_message}")
+                    stats["errors"] += 1
+                    logger.error(
+                        f"Failed to process {result.url}: {result.error_message}"
+                    )

                # Log progress regularly
-                if stats['processed'] % 10 == 0:
+                if stats["processed"] % 10 == 0:
                    logger.info(f"Progress: {stats['processed']} URLs processed")

        except Exception as e:
            logger.error(f"Scraping error: {e}")
-        
+
        finally:
            # Print final statistics
            logger.info("Scraping completed:")
            logger.info(f"- URLs processed: {stats['processed']}")
            logger.info(f"- Errors: {stats['errors']}")
            logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
-            
+
            # Print filter statistics
            for filter_ in filter_chain.filters:
                logger.info(f"{filter_.name} stats:")
                logger.info(f"- Passed: {filter_.stats.passed_urls}")
                logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
-            
+
            # Print scorer statistics
            logger.info("Scoring statistics:")
            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
-            logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
+            logger.info(
+                f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
+            )
+

 if __name__ == "__main__":
    import asyncio
-    
+
    # Run basic example
    print("Running basic scraper example...")
    asyncio.run(basic_scraper_example())

    # Run advanced example
-    print("\nRunning advanced scraper example...")
-    asyncio.run(advanced_scraper_example())
+    # print("\nRunning advanced scraper example...")
+    # asyncio.run(advanced_scraper_example())