Implement comprehensive URL filtering and scoring capabilities.

Filters:
- Add URLPatternFilter with glob/regex support
- Implement ContentTypeFilter with MIME type checking
- Add DomainFilter for domain control
- Create FilterChain with stats tracking

Scorers:
- Complete KeywordRelevanceScorer implementation
- Add PathDepthScorer for URL structure scoring
- Implement ContentTypeScorer for file type priorities
- Add FreshnessScorer for date-based scoring
- Add DomainAuthorityScorer for domain weighting
- Create CompositeScorer for combined strategies

Features:
- Add statistics tracking for both filters and scorers
- Implement logging support throughout
- Add resource cleanup methods
- Create comprehensive documentation
- Include performance optimizations
- Add a Quick Start (the two runnable examples below)

Tests and docs included.

Note: Review URL normalization overlap with recent crawler changes.
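For orientation before the examples, here is a minimal, self-contained sketch of the chain idea the filter items above describe: a chain passes a URL only if every filter accepts it, and each filter keeps pass/reject counts for the stats tracking mentioned above. The class and method names here (SimpleURLPatternFilter, SimpleFilterChain, apply) are illustrative assumptions, not the library's actual interfaces.

# filter_chain_sketch.py -- illustrative sketch only, not crawl4ai internals
from fnmatch import fnmatch


class SimpleURLPatternFilter:
    """Hypothetical glob filter: accepts URLs matching the pattern."""

    def __init__(self, pattern: str):
        self.pattern = pattern
        self.passed_urls = 0
        self.rejected_urls = 0

    def apply(self, url: str) -> bool:
        ok = fnmatch(url, self.pattern)
        if ok:
            self.passed_urls += 1
        else:
            self.rejected_urls += 1
        return ok


class SimpleFilterChain:
    """A URL passes the chain only if every filter accepts it."""

    def __init__(self, filters):
        self.filters = filters

    def apply(self, url: str) -> bool:
        return all(f.apply(url) for f in self.filters)


if __name__ == "__main__":
    chain = SimpleFilterChain([SimpleURLPatternFilter("*/blog/*")])
    print(chain.apply("https://example.com/blog/post-1"))  # True
    print(chain.apply("https://example.com/shop/item-9"))  # False
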
# basic_scraper_example.py
from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter
)
from crawl4ai.async_webcrawler import AsyncWebCrawler


async def basic_scraper_example():
    """
    Basic example: Scrape a blog site for articles
    - Crawls only HTML pages
    - Stays within the blog section
    - Collects all results at once
    """
    # Create a simple filter chain
    filter_chain = FilterChain([
        # Only crawl pages within the blog section
        URLPatternFilter("*/blog/*"),
        # Only process HTML pages
        ContentTypeFilter(["text/html"])
    ])

    # Initialize the strategy with basic configuration
    strategy = BFSScraperStrategy(
        max_depth=2,        # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,    # Use default scoring
        max_concurrent=3    # Limit concurrent requests
    )

    # Create the crawler and scraper
    crawler = AsyncWebCrawler()
    scraper = AsyncWebScraper(crawler, strategy)

    # Start scraping
    try:
        result = await scraper.ascrape("https://example.com/blog/")

        # Process results
        print(f"Crawled {len(result.crawled_urls)} pages:")
        for url, data in result.extracted_data.items():
            print(f"- {url}: {len(data.html)} bytes")

    except Exception as e:
        print(f"Error during scraping: {e}")

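The advanced example below layers scorers on top of the filters. As a mental model for what a keyword scorer might reduce to, here is a hypothetical sketch; SimpleKeywordScorer and its score method are assumptions for illustration, not the actual KeywordRelevanceScorer implementation.

# keyword_scorer_sketch.py -- illustrative sketch only, not crawl4ai internals
class SimpleKeywordScorer:
    """Hypothetical scorer: fraction of keywords found in the URL, times a weight."""

    def __init__(self, keywords, weight=1.0):
        self.keywords = [k.lower() for k in keywords]
        self.weight = weight

    def score(self, url: str) -> float:
        url_lower = url.lower()
        hits = sum(1 for k in self.keywords if k in url_lower)
        return self.weight * hits / len(self.keywords)


if __name__ == "__main__":
    scorer = SimpleKeywordScorer(["news", "breaking"], weight=1.0)
    print(scorer.score("https://example.com/news/breaking-story"))  # 1.0
    print(scorer.score("https://example.com/about"))                # 0.0
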
# advanced_scraper_example.py
import logging
import re  # needed for the compiled regex pattern below

from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter,
    DomainFilter,
    KeywordRelevanceScorer,
    PathDepthScorer,
    FreshnessScorer,
    CompositeScorer
)
from crawl4ai.async_webcrawler import AsyncWebCrawler

async def advanced_scraper_example():
    """
    Advanced example: Intelligent news site scraping
    - Uses all filter types
    - Implements sophisticated scoring
    - Streams results
    - Includes monitoring and logging
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("advanced_scraper")

    # Create sophisticated filter chain
    filter_chain = FilterChain([
        # Domain control
        DomainFilter(
            allowed_domains=["example.com", "blog.example.com"],
            blocked_domains=["ads.example.com", "tracker.example.com"]
        ),
        # URL patterns
        URLPatternFilter([
            "*/article/*",
            "*/news/*",
            "*/blog/*",
            re.compile(r"\d{4}/\d{2}/.*")  # Date-based URLs
        ]),
        # Content types
        ContentTypeFilter([
            "text/html",
            "application/xhtml+xml"
        ])
    ])

    # Create composite scorer
    scorer = CompositeScorer([
        # Prioritize by keywords
        KeywordRelevanceScorer(
            keywords=["news", "breaking", "update", "latest"],
            weight=1.0
        ),
        # Prefer optimal URL structure
        PathDepthScorer(
            optimal_depth=3,
            weight=0.7
        ),
        # Prioritize fresh content
        FreshnessScorer(weight=0.9)
    ])
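    # Assumed combination rule (illustrative, not verified against the library):
    # if CompositeScorer computes a weighted sum, then
    #   score(url) = 1.0 * keyword + 0.7 * path_depth + 0.9 * freshness,
    # so a URL scoring 1.0 on every sub-scorer would total 2.6.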

    # Initialize strategy with advanced configuration
    strategy = BFSScraperStrategy(
        max_depth=4,
        filter_chain=filter_chain,
        url_scorer=scorer,
        max_concurrent=5,
        min_crawl_delay=1
    )
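    # Assumed BFS semantics (not verified here): all pages at depth N are
    # fetched before any at depth N+1, so max_depth=4 bounds the crawl to four
    # link-hops from the start URL; min_crawl_delay spaces successive requests.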

    # Create crawler and scraper
    crawler = AsyncWebCrawler()
    scraper = AsyncWebScraper(crawler, strategy)

    # Track statistics
    stats = {
        'processed': 0,
        'errors': 0,
        'total_size': 0
    }

    try:
        # Use streaming mode
        async for result in scraper.ascrape("https://example.com/news/", stream=True):
            stats['processed'] += 1

            if result.success:
                stats['total_size'] += len(result.html)
                logger.info(f"Processed: {result.url}")

                # Print scoring information
                for scorer_name, score in result.scores.items():
                    logger.debug(f"{scorer_name}: {score:.2f}")
            else:
                stats['errors'] += 1
                logger.error(f"Failed to process {result.url}: {result.error_message}")

            # Log progress regularly
            if stats['processed'] % 10 == 0:
                logger.info(f"Progress: {stats['processed']} URLs processed")

    except Exception as e:
        logger.error(f"Scraping error: {e}")

    finally:
        # Print final statistics
        logger.info("Scraping completed:")
        logger.info(f"- URLs processed: {stats['processed']}")
        logger.info(f"- Errors: {stats['errors']}")
        logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")

        # Print filter statistics
        for filter_ in filter_chain.filters:
            logger.info(f"{filter_.name} stats:")
            logger.info(f"- Passed: {filter_.stats.passed_urls}")
            logger.info(f"- Rejected: {filter_.stats.rejected_urls}")

        # Print scorer statistics
        logger.info("Scoring statistics:")
        logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
        logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")


if __name__ == "__main__":
    import asyncio

    # Run basic example
    print("Running basic scraper example...")
    asyncio.run(basic_scraper_example())

    print("\nRunning advanced scraper example...")
    asyncio.run(advanced_scraper_example())