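"""
Deep crawl examples for Crawl4AI.

Demonstrates BFSDeepCrawlStrategy four ways: a basic single-seed crawl,
an advanced crawl with filtering, scoring, and streaming, and batch
crawls via arun_many with and without streaming.
"""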
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import (
    BFSDeepCrawlStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter,
    DomainFilter,
    KeywordRelevanceScorer,
    PathDepthScorer,
    FreshnessScorer,
    CompositeScorer,
)
from crawl4ai.async_webcrawler import AsyncWebCrawler
import asyncio
import re
import time
import logging

# Shared browser configuration: headless browser with a small viewport.
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)


async def basic_example():
    """
    Basic example: deep crawl a documentation site
    - Follows only URLs matching */basic/*
    - Processes only HTML pages
    - Collects all results at once
    """
    # Create a simple filter chain
    filter_chain = FilterChain(
        [
            # Only crawl pages within the matching section
            URLPatternFilter("*/basic/*"),
            # Only process HTML pages
            ContentTypeFilter(["text/html"]),
        ]
    )

    # Initialize the strategy with basic configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
        include_external=True,
    )

    # Create the crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Start crawling
        try:
            results = await crawler.arun(
                "https://crawl4ai.com/mkdocs",
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
            )
            # Process results
            print(f"Crawled {len(results)} pages:")
            for result in results:
                print(f"- {result.url}: {len(result.html)} bytes")

        except Exception as e:
            print(f"Error during crawling: {e}")


async def advanced_example():
    """
    Advanced example: intelligent news site crawling
    - Uses all filter types
    - Implements sophisticated scoring
    - Streams results
    - Includes monitoring and logging
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("advanced_deep_crawler")

    # Create sophisticated filter chain
    filter_chain = FilterChain(
        [
            # Domain control
            DomainFilter(
                allowed_domains=["techcrunch.com"],
                blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
            ),
            # URL patterns
            URLPatternFilter(
                [
                    "*/article/*",
                    "*/news/*",
                    "*/blog/*",
                    re.compile(r"\d{4}/\d{2}/.*"),  # Date-based URLs
                ]
            ),
            # Content types
            ContentTypeFilter(["text/html", "application/xhtml+xml"]),
        ]
    )
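    # Note: the chain uses AND semantics, so a URL is enqueued only if it
    # passes every filter above.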

    # Create composite scorer
    scorer = CompositeScorer(
        [
            # Prioritize by keywords
            KeywordRelevanceScorer(
                keywords=["news", "breaking", "update", "latest"], weight=1.0
            ),
            # Prefer optimal URL structure
            PathDepthScorer(optimal_depth=3, weight=0.7),
            # Prioritize fresh content
            FreshnessScorer(weight=0.9),
        ]
    )
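    # Each scorer contributes its weighted score to the composite, so keyword
    # relevance (weight 1.0) counts most, then freshness (0.9), then path
    # depth (0.7).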

    # Initialize strategy with advanced configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2, filter_chain=filter_chain, url_scorer=scorer
    )

    # Create crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Track statistics
        stats = {"processed": 0, "errors": 0, "total_size": 0}

        try:
            # Use streaming mode
            results = []
            result_generator = await crawler.arun(
                "https://techcrunch.com",
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy, stream=True),
            )
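            # With stream=True, arun() hands back an async generator that
            # yields each result as soon as its page finishes, rather than
            # buffering the entire crawl.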
            async for result in result_generator:
                stats["processed"] += 1

                if result.success:
                    # Deep crawl strategies record depth and score in result.metadata
                    depth = result.metadata.get("depth", 0)
                    score = result.metadata.get("score", 0)
                    stats["total_size"] += len(result.html)
                    logger.info(
                        f"Processed at depth {depth} with score {score:.3f}:\n {result.url}"
                    )
                    results.append(result)
                else:
                    stats["errors"] += 1
                    logger.error(
                        f"Failed to process {result.url}: {result.error_message}"
                    )

                # Log progress regularly
                if stats["processed"] % 10 == 0:
                    logger.info(f"Progress: {stats['processed']} URLs processed")

        except Exception as e:
            logger.error(f"Crawling error: {e}")

        finally:
            # Print final statistics
            logger.info("Crawl completed:")
            logger.info(f"- URLs processed: {stats['processed']}")
            logger.info(f"- Errors: {stats['errors']}")
            logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")

            # Print filter statistics
            for filter_ in filter_chain.filters:
                logger.info(f"{filter_.name} stats:")
                logger.info(f"- Passed: {filter_.stats.passed_urls}")
                logger.info(f"- Rejected: {filter_.stats.rejected_urls}")

            # Print scorer statistics
            logger.info("Scoring statistics:")
            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
            logger.info(
                f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
            )


async def basic_example_many_urls():
    """
    Batch example: run the same deep crawl over several seed URLs
    with arun_many, collecting all results at once.
    """
    filter_chain = FilterChain(
        [
            URLPatternFilter("*/basic/*"),
            ContentTypeFilter(["text/html"]),
        ]
    )
    # Initialize the strategy with basic configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
        include_external=False,
    )

    # Create the crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Start crawling
        try:
            results = await crawler.arun_many(
                urls=["https://crawl4ai.com/mkdocs", "https://aravindkarnam.com"],
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
            )
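            # With a deep crawl strategy, arun_many returns one list of
            # results per seed URL, hence the nested loop below.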
            # Process results
            print(f"Crawled {sum(len(r) for r in results)} pages:")
            for url_results in results:
                for result in url_results:
                    print(f"- {result.url}: {len(result.html)} bytes")

        except Exception as e:
            print(f"Error during crawling: {e}")


async def basic_example_many_urls_stream():
    """
    Streaming batch example: the same multi-seed deep crawl, consuming
    results as they arrive instead of waiting for the full batch.
    """
    filter_chain = FilterChain(
        [
            URLPatternFilter("*/basic/*"),
            ContentTypeFilter(["text/html"]),
        ]
    )
    # Initialize the strategy with basic configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
        include_external=False,
    )

    # Create the crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Start crawling
        try:
            # With stream=True, arun_many yields results from all seed URLs
            # as soon as each page completes.
            async for result in await crawler.arun_many(
                urls=["https://crawl4ai.com/mkdocs", "https://aravindkarnam.com"],
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy, stream=True),
            ):
                print(f"- {result.url}: {len(result.html)} bytes")
        except Exception as e:
            print(f"Error during crawling: {e}")


if __name__ == "__main__":
    # Run basic example
    start_time = time.perf_counter()
    print("Running basic deep crawl example...")
    asyncio.run(basic_example())
    end_time = time.perf_counter()
    print(f"Basic deep crawl example completed in {end_time - start_time:.2f} seconds")

    # Run advanced example
    print("\nRunning advanced deep crawl example...")
    asyncio.run(advanced_example())

    # Run batch examples
    print("\nRunning basic deep crawl example with arun_many...")
    asyncio.run(basic_example_many_urls())

    print("\nRunning basic deep crawl example with arun_many streaming enabled...")
    asyncio.run(basic_example_many_urls_stream())