diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 44c83262..d5a935a9 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -10,6 +10,7 @@ from .config import (
 from .user_agent_generator import UserAgentGenerator, UAGen, ValidUAGenerator, OnlineUAGenerator
 from .extraction_strategy import ExtractionStrategy
 from .chunking_strategy import ChunkingStrategy, RegexChunking
+from .deep_crawl import DeepCrawlStrategy
 from .markdown_generation_strategy import MarkdownGenerationStrategy
 from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter
 from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
@@ -395,6 +396,7 @@ class CrawlerRunConfig:
         word_count_threshold: int = MIN_WORD_THRESHOLD,
         extraction_strategy: ExtractionStrategy = None,
         chunking_strategy: ChunkingStrategy = RegexChunking(),
+        deep_crawl_strategy: DeepCrawlStrategy = None,
         markdown_generator: MarkdownGenerationStrategy = None,
         content_filter : RelevantContentFilter = None,
         only_text: bool = False,
@@ -468,6 +470,7 @@ class CrawlerRunConfig:
         self.word_count_threshold = word_count_threshold
         self.extraction_strategy = extraction_strategy
         self.chunking_strategy = chunking_strategy
+        self.deep_crawl_strategy = deep_crawl_strategy
         self.markdown_generator = markdown_generator
         self.content_filter = content_filter
         self.only_text = only_text
@@ -555,6 +558,14 @@ class CrawlerRunConfig:
             raise ValueError(
                 "extraction_strategy must be an instance of ExtractionStrategy"
             )
+
+        if self.deep_crawl_strategy is not None and not isinstance(
+            self.deep_crawl_strategy, DeepCrawlStrategy
+        ):
+            raise ValueError(
+                "deep_crawl_strategy must be an instance of DeepCrawlStrategy"
+            )
+
         if self.chunking_strategy is not None and not isinstance(
             self.chunking_strategy, ChunkingStrategy
         ):
@@ -573,6 +584,7 @@ class CrawlerRunConfig:
             word_count_threshold=kwargs.get("word_count_threshold", 200),
             extraction_strategy=kwargs.get("extraction_strategy"),
             chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
+            deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
             markdown_generator=kwargs.get("markdown_generator"),
             content_filter=kwargs.get("content_filter"),
             only_text=kwargs.get("only_text", False),
@@ -656,6 +668,7 @@ class CrawlerRunConfig:
             "word_count_threshold": self.word_count_threshold,
             "extraction_strategy": self.extraction_strategy,
             "chunking_strategy": self.chunking_strategy,
+            "deep_crawl_strategy": self.deep_crawl_strategy,
             "markdown_generator": self.markdown_generator,
             "content_filter": self.content_filter,
             "only_text": self.only_text,
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 7b83d2d5..8ef93381 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -38,7 +38,7 @@ from .async_logger import AsyncLogger
 from .async_configs import BrowserConfig, CrawlerRunConfig
 from .async_dispatcher import * # noqa: F403
 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
-from .traversal import TraversalStrategy
+from .deep_crawl import DeepCrawlStrategy
 from .config import MIN_WORD_THRESHOLD
 
 from .utils import (
@@ -53,11 +53,17 @@ from .utils import (
 from typing import Union, AsyncGenerator, List, TypeVar
 from collections.abc import AsyncGenerator
 
-CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
-RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
 
 from .__version__ import __version__ as crawl4ai_version
 
+CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
+RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
+
+DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
+DeepCrawlManyReturn = Union[
+    List[List[CrawlResultT]],
+    AsyncGenerator[CrawlResultT, None],
+]
 
 class AsyncWebCrawler:
     """
@@ -289,7 +295,7 @@ class AsyncWebCrawler:
         user_agent: str = None,
         verbose=True,
         **kwargs,
-    ) -> CrawlResult:
+    ) -> Union[CrawlResult, DeepCrawlSingleReturn]:
         """
         Runs the crawler for a single source: URL (web, local file, or raw HTML).
 
@@ -391,6 +397,23 @@ class AsyncWebCrawler:
             extracted_content = None
             start_time = time.perf_counter()
 
+            if crawler_config.deep_crawl_strategy:
+                if crawler_config.stream:
+                    return crawler_config.deep_crawl_strategy.arun(
+                        start_url=url,
+                        crawler=self,
+                        crawler_run_config=crawler_config,
+                    )
+                else:
+                    results = []
+                    async for result in crawler_config.deep_crawl_strategy.arun(
+                        start_url=url,
+                        crawler=self,
+                        crawler_run_config=crawler_config,
+                    ):
+                        results.append(result)
+                    return results
+
             # Try to get cached result if appropriate
             if cache_context.should_read():
                 cached_result = await async_db_manager.aget_cached_url(url)
@@ -743,7 +766,7 @@ class AsyncWebCrawler:
         user_agent: str = None,
         verbose=True,
         **kwargs,
-    ) -> RunManyReturn:
+    ) -> Union[RunManyReturn, DeepCrawlManyReturn]:
         """
         Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
 
@@ -830,7 +853,7 @@ class AsyncWebCrawler:
     async def adeep_crawl(
         self,
         url: str,
-        strategy: TraversalStrategy,
+        strategy: DeepCrawlStrategy,
         crawler_run_config: Optional[CrawlerRunConfig] = None,
         stream: Optional[bool] = False,
     ) -> Union[AsyncGenerator[CrawlResult,None],List[CrawlResult]]:
diff --git a/crawl4ai/traversal/__init__.py b/crawl4ai/deep_crawl/__init__.py
similarity index 73%
rename from crawl4ai/traversal/__init__.py
rename to crawl4ai/deep_crawl/__init__.py
index cf2fc0d4..53a30bf1 100644
--- a/crawl4ai/traversal/__init__.py
+++ b/crawl4ai/deep_crawl/__init__.py
@@ -1,4 +1,4 @@
-from .bfs_traversal_strategy import BFSTraversalStrategy
+from .bfs_deep_crawl_strategy import BFSDeepCrawlStrategy
 from .filters import (
     URLFilter,
     FilterChain,
@@ -12,10 +12,10 @@ from .scorers import (
     FreshnessScorer,
     CompositeScorer,
 )
-from .traversal_strategy import TraversalStrategy
+from .deep_crawl_strategty import DeepCrawlStrategy
 
 __all__ = [
-    "BFSTraversalStrategy",
+    "BFSDeepCrawlStrategy",
     "FilterChain",
     "URLFilter",
     "URLPatternFilter",
@@ -25,5 +25,5 @@ __all__ = [
     "PathDepthScorer",
     "FreshnessScorer",
     "CompositeScorer",
-    "TraversalStrategy",
+    "DeepCrawlStrategy",
 ]
diff --git a/crawl4ai/traversal/bfs_traversal_strategy.py b/crawl4ai/deep_crawl/bfs_deep_crawl_strategy.py
similarity index 91%
rename from crawl4ai/traversal/bfs_traversal_strategy.py
rename to crawl4ai/deep_crawl/bfs_deep_crawl_strategy.py
index f613033e..da70c6d9 100644
--- a/crawl4ai/traversal/bfs_traversal_strategy.py
+++ b/crawl4ai/deep_crawl/bfs_deep_crawl_strategy.py
@@ -3,15 +3,14 @@ from datetime import datetime
 import asyncio
 import logging
 from urllib.parse import urlparse
-from ..async_configs import CrawlerRunConfig
 from ..models import CrawlResult, TraversalStats
 from .filters import FilterChain
 from .scorers import URLScorer
-from .traversal_strategy import TraversalStrategy
+from .deep_crawl_strategty import DeepCrawlStrategy
 from ..config import DEEP_CRAWL_BATCH_SIZE
 
 
-class BFSTraversalStrategy(TraversalStrategy):
+class BFSDeepCrawlStrategy(DeepCrawlStrategy):
     """Best-First Search traversal strategy with filtering and scoring."""
 
     def __init__(
@@ -98,11 +97,11 @@ class BFSTraversalStrategy(TraversalStrategy):
                 self.stats.total_depth_reached, next_depth
             )
 
-    async def deep_crawl(
+    async def arun(
         self,
         start_url: str,
         crawler: "AsyncWebCrawler",
-        crawler_run_config: Optional[CrawlerRunConfig] = None,
+        crawler_run_config: Optional["CrawlerRunConfig"] = None,
     ) -> AsyncGenerator[CrawlResult, None]:
         """Implement BFS traversal strategy"""
 
@@ -136,7 +135,9 @@ class BFSTraversalStrategy(TraversalStrategy):
             """
             # Collect batch of URLs into active_crawls to process
             async with active_crawls_lock:
-                while len(active_crawls) < DEEP_CRAWL_BATCH_SIZE and not queue.empty():
+                while (
+                    len(active_crawls) < DEEP_CRAWL_BATCH_SIZE and not queue.empty()
+                ):
                     score, depth, url, parent_url = await queue.get()
                     active_crawls[url] = {
                         "depth": depth,
@@ -151,14 +152,14 @@ class BFSTraversalStrategy(TraversalStrategy):
                 continue
             # Process batch
             try:
-                stream_config = (
-                    crawler_run_config.clone(stream=True)
-                    if crawler_run_config
-                    else CrawlerRunConfig(stream=True)
-                )
+                # This is very important to ensure recursively you don't deep_crawl down the children.
+                if crawler_run_config:
+                    crawler_run_config = crawler_run_config.clone(
+                        deep_crawl_strategy=None, stream=True
+                    )
                 async for result in await crawler.arun_many(
                     urls=list(active_crawls.keys()),
-                    config=stream_config,
+                    config=crawler_run_config
                 ):
                     async with active_crawls_lock:
                         crawl_info = active_crawls.pop(result.url, None)
diff --git a/crawl4ai/traversal/traversal_strategy.py b/crawl4ai/deep_crawl/deep_crawl_strategty.py
similarity index 80%
rename from crawl4ai/traversal/traversal_strategy.py
rename to crawl4ai/deep_crawl/deep_crawl_strategty.py
index dc067317..a9135d25 100644
--- a/crawl4ai/traversal/traversal_strategy.py
+++ b/crawl4ai/deep_crawl/deep_crawl_strategty.py
@@ -1,17 +1,16 @@
 from abc import ABC, abstractmethod
-from typing import AsyncGenerator
+from typing import AsyncGenerator, Optional
 
-from ..async_configs import CrawlerRunConfig
 from ..models import CrawlResult
 
 
-class TraversalStrategy(ABC):
+class DeepCrawlStrategy(ABC):
     @abstractmethod
-    async def deep_crawl(
+    async def arun(
         self,
         url: str,
         crawler: "AsyncWebCrawler",
-        crawler_run_config: CrawlerRunConfig = None,
+        crawler_run_config: Optional["CrawlerRunConfig"] = None,
     ) -> AsyncGenerator[CrawlResult, None]:
         """Traverse the given URL using the specified crawler.
 
diff --git a/crawl4ai/traversal/filters.py b/crawl4ai/deep_crawl/filters.py
similarity index 100%
rename from crawl4ai/traversal/filters.py
rename to crawl4ai/deep_crawl/filters.py
diff --git a/crawl4ai/traversal/scorers.py b/crawl4ai/deep_crawl/scorers.py
similarity index 100%
rename from crawl4ai/traversal/scorers.py
rename to crawl4ai/deep_crawl/scorers.py
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index 1da85582..2b49973e 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -140,7 +140,6 @@ class CrawlResult(BaseModel):
     class Config:
         arbitrary_types_allowed = True
 
-
 class AsyncCrawlResponse(BaseModel):
     html: str
     response_headers: Dict[str, str]
diff --git a/docs/deep_crawl/deep_crawl_quickstart.py b/docs/deep_crawl/deep_crawl_quickstart.py
index 63a0bf9b..7fe604e3 100644
--- a/docs/deep_crawl/deep_crawl_quickstart.py
+++ b/docs/deep_crawl/deep_crawl_quickstart.py
@@ -1,18 +1,25 @@
 # basic_scraper_example.py
 from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
 from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
-from crawl4ai.traversal import (
-    BFSTraversalStrategy,
+from crawl4ai.deep_crawl import (
+    BFSDeepCrawlStrategy,
     FilterChain,
     URLPatternFilter,
     ContentTypeFilter,
+    DomainFilter,
+    KeywordRelevanceScorer,
+    PathDepthScorer,
+    FreshnessScorer,
+    CompositeScorer,
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler
 import re
 import time
+import logging
 
 browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
 
+
 async def basic_scraper_example():
     """
     Basic example: Scrape a blog site for articles
@@ -31,7 +38,7 @@ async def basic_scraper_example():
     )
 
     # Initialize the strategy with basic configuration
-    bfs_strategy = BFSTraversalStrategy(
+    bfs_strategy = BFSDeepCrawlStrategy(
         max_depth=2, # Only go 2 levels deep
         filter_chain=filter_chain,
         url_scorer=None, # Use default scoring
@@ -44,8 +51,8 @@ async def basic_scraper_example():
     ) as crawler:
         # Start scraping
         try:
-            results = await crawler.adeep_crawl(
-                "https://crawl4ai.com/mkdocs", strategy=bfs_strategy
+            results = await crawler.arun(
+                "https://crawl4ai.com/mkdocs", CrawlerRunConfig(deep_crawl_strategy=bfs_strategy)
             )
             # Process results
             print(f"Crawled {len(results)} pages:")
@@ -55,23 +62,6 @@ async def basic_scraper_example():
         except Exception as e:
             print(f"Error during scraping: {e}")
 
-
-# advanced_scraper_example.py
-import logging
-
-from crawl4ai.traversal import (
-    BFSTraversalStrategy,
-    FilterChain,
-    URLPatternFilter,
-    ContentTypeFilter,
-    DomainFilter,
-    KeywordRelevanceScorer,
-    PathDepthScorer,
-    FreshnessScorer,
-    CompositeScorer,
-)
-
-
 async def advanced_scraper_example():
     """
     Advanced example: Intelligent news site scraping
@@ -121,7 +111,7 @@ async def advanced_scraper_example():
     )
 
     # Initialize strategy with advanced configuration
-    bfs_strategy = BFSTraversalStrategy(
+    bfs_strategy = BFSDeepCrawlStrategy(
         max_depth=2, filter_chain=filter_chain, url_scorer=scorer
     )
 
@@ -136,13 +126,10 @@ async def advanced_scraper_example():
         try:
             # Use streaming mode
             results = []
-            result_generator = await crawler.adeep_crawl(
+            result_generator = await crawler.arun(
                 "https://techcrunch.com",
-                strategy=bfs_strategy,
-                crawler_run_config=CrawlerRunConfig(
-                    scraping_strategy=LXMLWebScrapingStrategy()
-                ),
-                stream=True,
+                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy,
+                stream=True)
             )
             async for result in result_generator:
                 stats["processed"] += 1