From d02474963323e2e7c8d1ab4ac181c30e2d68b79a Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Mon, 3 Mar 2025 21:51:11 +0800
Subject: [PATCH] refactor(deep-crawl): add max_pages limit and improve crawl
 control

Add max_pages parameter to all deep crawling strategies to limit total
pages crawled. Add score_threshold parameter to BFS/DFS strategies for
quality control. Remove legacy parameter handling in AsyncWebCrawler.
Improve error handling and logging in crawl strategies.

BREAKING CHANGE: Removed support for legacy parameters in AsyncWebCrawler.arun() and arun_many()
---
 CHANGELOG.md                                  |  21 ++++
 crawl4ai/async_webcrawler.py                  | 103 ++++++++--------
 crawl4ai/deep_crawling/bff_strategy.py        |  51 ++++++--
 crawl4ai/deep_crawling/bfs_strategy.py        |  77 ++++++++++--
 crawl4ai/deep_crawling/dfs_strategy.py        |  36 ++++--
 .../{deepcrawl.py => deepcrawl_example.py}    | 111 +++++++++++++++++-
 docs/md_v2/core/deep-crawling.md              |  64 +++++++++-
 7 files changed, 372 insertions(+), 91 deletions(-)
 rename docs/examples/{deepcrawl.py => deepcrawl_example.py} (76%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1dd26623..cebaab96 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,27 @@ All notable changes to Crawl4AI will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## Version 0.5.0 (2025-03-02)
+
+### Added
+
+- *(profiles)* Add BrowserProfiler class for dedicated browser profile management
+- *(cli)* Add interactive profile management to CLI with rich UI
+- *(profiles)* Add ability to crawl directly from profile management interface
+- *(browser)* Support identity-based browsing with persistent profiles
+
+### Changed
+
+- *(browser)* Refactor profile management from ManagedBrowser to BrowserProfiler class
+- *(cli)* Enhance CLI with profile selection and status display for crawling
+- *(examples)* Update identity-based browsing example to use BrowserProfiler class
+- *(docs)* Update identity-based crawling documentation
+
+### Fixed
+
+- *(browser)* Fix profile detection and management on different platforms
+- *(cli)* Fix CLI command structure for better user experience
+
 ## Version 0.5.0 (2025-02-21)
 
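At a glance, the new controls look like this in user code (a minimal sketch based on the strategy signatures introduced in this patch; the URL and keywords are illustrative, and the import paths follow the project's docs, so adjust them if your version exports these names elsewhere):

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

async def main():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            url_scorer=KeywordRelevanceScorer(keywords=["crawl", "async"]),
            max_pages=10,         # new: hard cap on successful page crawls
            score_threshold=0.3,  # new: skip links scoring below this (BFS/DFS only)
        ),
    )
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun(url="https://example.com", config=config)
        for result in results:
            print(result.url, "| depth:", result.metadata.get("depth"))

asyncio.run(main())
```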
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index c294eebd..b5a646c9 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -224,22 +224,22 @@ class AsyncWebCrawler:
         url: str,
         config: CrawlerRunConfig = None,
         # Legacy parameters maintained for backwards compatibility
-        word_count_threshold=MIN_WORD_THRESHOLD,
-        extraction_strategy: ExtractionStrategy = None,
-        chunking_strategy: ChunkingStrategy = RegexChunking(),
-        content_filter: RelevantContentFilter = None,
-        cache_mode: Optional[CacheMode] = None,
+        # word_count_threshold=MIN_WORD_THRESHOLD,
+        # extraction_strategy: ExtractionStrategy = None,
+        # chunking_strategy: ChunkingStrategy = RegexChunking(),
+        # content_filter: RelevantContentFilter = None,
+        # cache_mode: Optional[CacheMode] = None,
         # Deprecated cache parameters
-        bypass_cache: bool = False,
-        disable_cache: bool = False,
-        no_cache_read: bool = False,
-        no_cache_write: bool = False,
+        # bypass_cache: bool = False,
+        # disable_cache: bool = False,
+        # no_cache_read: bool = False,
+        # no_cache_write: bool = False,
         # Other legacy parameters
-        css_selector: str = None,
-        screenshot: bool = False,
-        pdf: bool = False,
-        user_agent: str = None,
-        verbose=True,
+        # css_selector: str = None,
+        # screenshot: bool = False,
+        # pdf: bool = False,
+        # user_agent: str = None,
+        # verbose=True,
         **kwargs,
     ) -> RunManyReturn:
         """
@@ -276,39 +276,41 @@ class AsyncWebCrawler:
 
         async with self._lock or self.nullcontext():
             try:
                 # Handle configuration
                 if crawler_config is not None:
+                    self.logger.verbose = crawler_config.verbose
                     config = crawler_config
                 else:
                     # Merge all parameters into a single kwargs dict for config creation
-                    config_kwargs = {
-                        "word_count_threshold": word_count_threshold,
-                        "extraction_strategy": extraction_strategy,
-                        "chunking_strategy": chunking_strategy,
-                        "content_filter": content_filter,
-                        "cache_mode": cache_mode,
-                        "bypass_cache": bypass_cache,
-                        "disable_cache": disable_cache,
-                        "no_cache_read": no_cache_read,
-                        "no_cache_write": no_cache_write,
-                        "css_selector": css_selector,
-                        "screenshot": screenshot,
-                        "pdf": pdf,
-                        "verbose": verbose,
-                        **kwargs,
-                    }
-                    config = CrawlerRunConfig.from_kwargs(config_kwargs)
+                    # config_kwargs = {
+                    #     "word_count_threshold": word_count_threshold,
+                    #     "extraction_strategy": extraction_strategy,
+                    #     "chunking_strategy": chunking_strategy,
+                    #     "content_filter": content_filter,
+                    #     "cache_mode": cache_mode,
+                    #     "bypass_cache": bypass_cache,
+                    #     "disable_cache": disable_cache,
+                    #     "no_cache_read": no_cache_read,
+                    #     "no_cache_write": no_cache_write,
+                    #     "css_selector": css_selector,
+                    #     "screenshot": screenshot,
+                    #     "pdf": pdf,
+                    #     "verbose": verbose,
+                    #     **kwargs,
+                    # }
+                    # config = CrawlerRunConfig.from_kwargs(config_kwargs)
+                    # Legacy kwargs are no longer translated; fall back to defaults
+                    config = config or CrawlerRunConfig()
 
                 # Handle deprecated cache parameters
-                if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
-                    # Convert legacy parameters if cache_mode not provided
-                    if config.cache_mode is None:
-                        config.cache_mode = _legacy_to_cache_mode(
-                            disable_cache=disable_cache,
-                            bypass_cache=bypass_cache,
-                            no_cache_read=no_cache_read,
-                            no_cache_write=no_cache_write,
-                        )
+                # if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
+                #     # Convert legacy parameters if cache_mode not provided
+                #     if config.cache_mode is None:
+                #         config.cache_mode = _legacy_to_cache_mode(
+                #             disable_cache=disable_cache,
+                #             bypass_cache=bypass_cache,
+                #             no_cache_read=no_cache_read,
+                #             no_cache_write=no_cache_write,
+                #         )
 
                 # Default to ENABLED if no cache mode specified
                 if config.cache_mode is None:
@@ -344,7 +346,11 @@ class AsyncWebCrawler:
                     # If screenshot is requested but its not in cache, then set cache_result to None
                     screenshot_data = cached_result.screenshot
                     pdf_data = cached_result.pdf
-                    if config.screenshot and not screenshot or config.pdf and not pdf:
+                    # if config.screenshot and not screenshot or config.pdf and not pdf:
+                    if config.screenshot and not screenshot_data:
+                        cached_result = None
+
+                    if config.pdf and not pdf_data:
                         cached_result = None
 
                     self.logger.url_status(
@@ -358,12 +364,11 @@ class AsyncWebCrawler:
                 if config and config.proxy_rotation_strategy:
                     next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
                     if next_proxy:
-                        if verbose:
-                            self.logger.info(
-                                message="Switch proxy: {proxy}",
-                                tag="PROXY",
-                                params={"proxy": next_proxy.server},
-                            )
+                        self.logger.info(
+                            message="Switch proxy: {proxy}",
+                            tag="PROXY",
+                            params={"proxy": next_proxy.server},
+                        )
                         config.proxy_config = next_proxy
                         # config = config.clone(proxy_config=next_proxy)
 
@@ -371,8 +376,8 @@ class AsyncWebCrawler:
             if not cached_result or not html:
                 t1 = time.perf_counter()
 
-                if user_agent:
-                    self.crawler_strategy.update_user_agent(user_agent)
+                if config.user_agent:
+                    self.crawler_strategy.update_user_agent(config.user_agent)
 
                 # Check robots.txt if enabled
                 if config and config.check_robots_txt:
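For callers migrating off the removed keyword arguments, a before/after sketch (field names taken from the legacy-to-config mapping that this patch comments out above):

```python
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

# Before (legacy kwargs, no longer accepted):
#   result = await crawler.arun(url, screenshot=True, bypass_cache=True,
#                               css_selector="main", word_count_threshold=10)

# After: pass everything through CrawlerRunConfig instead
config = CrawlerRunConfig(
    screenshot=True,
    cache_mode=CacheMode.BYPASS,  # replaces bypass_cache=True
    css_selector="main",
    word_count_threshold=10,
)

async def fetch(crawler: AsyncWebCrawler, url: str):
    return await crawler.arun(url=url, config=config)
```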
diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py
index f1e871ee..fc2047d2 100644
--- a/crawl4ai/deep_crawling/bff_strategy.py
+++ b/crawl4ai/deep_crawling/bff_strategy.py
@@ -37,15 +37,18 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
         filter_chain: FilterChain = FilterChain(),
         url_scorer: Optional[URLScorer] = None,
         include_external: bool = False,
+        max_pages: int = float('inf'),
         logger: Optional[logging.Logger] = None,
     ):
         self.max_depth = max_depth
         self.filter_chain = filter_chain
         self.url_scorer = url_scorer
         self.include_external = include_external
+        self.max_pages = max_pages
         self.logger = logger or logging.getLogger(__name__)
         self.stats = TraversalStats(start_time=datetime.now())
         self._cancel_event = asyncio.Event()
+        self._pages_crawled = 0
 
     async def can_process_url(self, url: str, depth: int) -> bool:
         """
@@ -86,12 +89,20 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
         new_depth = current_depth + 1
         if new_depth > self.max_depth:
             return
+
+        # If we've reached the max pages limit, don't discover new links
+        remaining_capacity = self.max_pages - self._pages_crawled
+        if remaining_capacity <= 0:
+            self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery")
+            return
 
         # Retrieve internal links; include external links if enabled.
         links = result.links.get("internal", [])
         if self.include_external:
             links += result.links.get("external", [])
 
+        # If we have more links than remaining capacity, limit how many we'll process
+        valid_links = []
         for link in links:
             url = link.get("href")
             if url in visited:
                 continue
             if not await self.can_process_url(url, new_depth):
                 self.stats.urls_skipped += 1
                 continue
-
-            # Record the new depth.
+
+            valid_links.append(url)
+
+        # If we have more valid links than capacity, limit them
+        if len(valid_links) > remaining_capacity:
+            valid_links = valid_links[:remaining_capacity]
+            self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
+
+        # Record the new depths and add to next_links
+        for url in valid_links:
             depths[url] = new_depth
             next_links.append((url, source_url))
 
@@ -123,6 +142,11 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
         depths: Dict[str, int] = {start_url: 0}
 
         while not queue.empty() and not self._cancel_event.is_set():
+            # Stop if we've reached the max pages limit
+            if self._pages_crawled >= self.max_pages:
+                self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
+                break
+
             batch: List[Tuple[float, int, str, Optional[str]]] = []
             # Retrieve up to BATCH_SIZE items from the priority queue.
             for _ in range(BATCH_SIZE):
@@ -153,14 +177,23 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                     result.metadata["depth"] = depth
                     result.metadata["parent_url"] = parent_url
                     result.metadata["score"] = score
+
+                    # Count only successful crawls toward max_pages limit
+                    if result.success:
+                        self._pages_crawled += 1
+
                     yield result
-                    # Discover new links from this result.
- new_links: List[Tuple[str, Optional[str]]] = [] - await self.link_discovery(result, result_url, depth, visited, new_links, depths) - for new_url, new_parent in new_links: - new_depth = depths.get(new_url, depth + 1) - new_score = self.url_scorer.score(new_url) if self.url_scorer else 0 - await queue.put((new_score, new_depth, new_url, new_parent)) + + # Only discover links from successful crawls + if result.success: + # Discover new links from this result + new_links: List[Tuple[str, Optional[str]]] = [] + await self.link_discovery(result, result_url, depth, visited, new_links, depths) + + for new_url, new_parent in new_links: + new_depth = depths.get(new_url, depth + 1) + new_score = self.url_scorer.score(new_url) if self.url_scorer else 0 + await queue.put((new_score, new_depth, new_url, new_parent)) # End of crawl. diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index 48c0c240..dcdee30c 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -24,17 +24,22 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): self, max_depth: int, filter_chain: FilterChain = FilterChain(), - url_scorer: Optional[URLScorer] = None, + url_scorer: Optional[URLScorer] = None, include_external: bool = False, + score_threshold: float = float('-inf'), + max_pages: int = float('inf'), logger: Optional[logging.Logger] = None, ): self.max_depth = max_depth self.filter_chain = filter_chain self.url_scorer = url_scorer self.include_external = include_external + self.score_threshold = score_threshold + self.max_pages = max_pages self.logger = logger or logging.getLogger(__name__) self.stats = TraversalStats(start_time=datetime.now()) self._cancel_event = asyncio.Event() + self._pages_crawled = 0 async def can_process_url(self, url: str, depth: int) -> bool: """ @@ -72,16 +77,25 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): prepares the next level of URLs. Each valid URL is appended to next_level as a tuple (url, parent_url) and its depth is tracked. - """ + """ next_depth = current_depth + 1 if next_depth > self.max_depth: return + # If we've reached the max pages limit, don't discover new links + remaining_capacity = self.max_pages - self._pages_crawled + if remaining_capacity <= 0: + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery") + return + # Get internal links and, if enabled, external links. links = result.links.get("internal", []) if self.include_external: links += result.links.get("external", []) + valid_links = [] + + # First collect all valid links for link in links: url = link.get("href") if url in visited: @@ -90,10 +104,29 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): self.stats.urls_skipped += 1 continue - # Score the URL if a scorer is provided. In this simple BFS - # the score is not used for ordering. + # Score the URL if a scorer is provided score = self.url_scorer.score(url) if self.url_scorer else 0 - # attach the score to metadata if needed. 
+
+            # Skip URLs with scores below the threshold
+            if score < self.score_threshold:
+                self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
+                self.stats.urls_skipped += 1
+                continue
+
+            valid_links.append((url, score))
+
+        # If we have more valid links than capacity, sort by score and take the top ones
+        if len(valid_links) > remaining_capacity:
+            if self.url_scorer:
+                # Sort by score in descending order
+                valid_links.sort(key=lambda x: x[1], reverse=True)
+            # Take only as many as we have capacity for
+            valid_links = valid_links[:remaining_capacity]
+            self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
+
+        # Process the final selected links
+        for url, score in valid_links:
+            # attach the score to metadata if needed
             if score:
                 result.metadata = result.metadata or {}
                 result.metadata["score"] = score
@@ -125,7 +158,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
             # Clone the config to disable deep crawling recursion and enforce batch mode.
             batch_config = config.clone(deep_crawl_strategy=None, stream=False)
             batch_results = await crawler.arun_many(urls=urls, config=batch_config)
-
+
+            # Update pages crawled counter - count only successful crawls
+            successful_results = [r for r in batch_results if r.success]
+            self._pages_crawled += len(successful_results)
+
             for result in batch_results:
                 url = result.url
                 depth = depths.get(url, 0)
@@ -134,7 +171,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                 parent_url = next((parent for (u, parent) in current_level if u == url), None)
                 result.metadata["parent_url"] = parent_url
                 results.append(result)
-                await self.link_discovery(result, url, depth, visited, next_level, depths)
+
+                # Only discover links from successful crawls
+                if result.success:
+                    # Link discovery will handle the max pages limit internally
+                    await self.link_discovery(result, url, depth, visited, next_level, depths)
 
             current_level = next_level
 
@@ -161,6 +202,9 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
             stream_config = config.clone(deep_crawl_strategy=None, stream=True)
             stream_gen = await crawler.arun_many(urls=urls, config=stream_config)
+
+            # Keep track of processed results for this batch
+            results_count = 0
             async for result in stream_gen:
                 url = result.url
                 depth = depths.get(url, 0)
                 result.metadata = result.metadata or {}
                 result.metadata["depth"] = depth
                 parent_url = next((parent for (u, parent) in current_level if u == url), None)
                 result.metadata["parent_url"] = parent_url
+
+                # Count only successful crawls
+                if result.success:
+                    self._pages_crawled += 1
+
+                results_count += 1
                 yield result
-                await self.link_discovery(result, url, depth, visited, next_level, depths)
-
+
+                # Only discover links from successful crawls
+                if result.success:
+                    # Link discovery will handle the max pages limit internally
+                    await self.link_discovery(result, url, depth, visited, next_level, depths)
+
+            # If we didn't get results back (e.g. due to errors), avoid getting stuck in an infinite loop
+            # by considering these URLs as visited but not counting them toward the max_pages limit
+            if results_count == 0 and urls:
+                self.logger.warning(f"No results returned for {len(urls)} URLs, marking as visited")
+
             current_level = next_level
 
     async def shutdown(self) -> None:
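The BFS changes above score each candidate link, drop those below `score_threshold`, and keep only the highest-scoring links when capacity under `max_pages` runs low. A toy model of that selection rule (plain Python, independent of the crawler):

```python
# Toy model of the BFS selection above: score each link, drop those below
# the threshold, then keep the top-N by score when capacity runs low.
links = {"a.html": 0.9, "b.html": 0.4, "c.html": 0.1}
score_threshold = 0.3   # mirrors BFSDeepCrawlStrategy(score_threshold=...)
remaining_capacity = 1  # mirrors max_pages minus pages already crawled

valid = [(url, s) for url, s in links.items() if s >= score_threshold]
if len(valid) > remaining_capacity:
    valid.sort(key=lambda x: x[1], reverse=True)  # highest scores first
    valid = valid[:remaining_capacity]

print(valid)  # [('a.html', 0.9)]
```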
diff --git a/crawl4ai/deep_crawling/dfs_strategy.py b/crawl4ai/deep_crawling/dfs_strategy.py
index 423315f0..f79f9628 100644
--- a/crawl4ai/deep_crawling/dfs_strategy.py
+++ b/crawl4ai/deep_crawling/dfs_strategy.py
@@ -37,6 +37,7 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
             # Clone config to disable recursive deep crawling.
             batch_config = config.clone(deep_crawl_strategy=None, stream=False)
             url_results = await crawler.arun_many(urls=[url], config=batch_config)
+
             for result in url_results:
                 result.metadata = result.metadata or {}
                 result.metadata["depth"] = depth
@@ -44,13 +45,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                 if self.url_scorer:
                     result.metadata["score"] = self.url_scorer.score(url)
                 results.append(result)
-
-                new_links: List[Tuple[str, Optional[str]]] = []
-                await self.link_discovery(result, url, depth, visited, new_links, depths)
-                # Push new links in reverse order so the first discovered is processed next.
-                for new_url, new_parent in reversed(new_links):
-                    new_depth = depths.get(new_url, depth + 1)
-                    stack.append((new_url, new_parent, new_depth))
+
+                # Count only successful crawls toward max_pages limit
+                if result.success:
+                    self._pages_crawled += 1
+
+                    # Only discover links from successful crawls
+                    new_links: List[Tuple[str, Optional[str]]] = []
+                    await self.link_discovery(result, url, depth, visited, new_links, depths)
+
+                    # Push new links in reverse order so the first discovered is processed next.
+ for new_url, new_parent in reversed(new_links): + new_depth = depths.get(new_url, depth + 1) + stack.append((new_url, new_parent, new_depth)) return results async def _arun_stream( @@ -83,8 +90,13 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): result.metadata["score"] = self.url_scorer.score(url) yield result - new_links: List[Tuple[str, Optional[str]]] = [] - await self.link_discovery(result, url, depth, visited, new_links, depths) - for new_url, new_parent in reversed(new_links): - new_depth = depths.get(new_url, depth + 1) - stack.append((new_url, new_parent, new_depth)) + # Only count successful crawls toward max_pages limit + # and only discover links from successful crawls + if result.success: + self._pages_crawled += 1 + + new_links: List[Tuple[str, Optional[str]]] = [] + await self.link_discovery(result, url, depth, visited, new_links, depths) + for new_url, new_parent in reversed(new_links): + new_depth = depths.get(new_url, depth + 1) + stack.append((new_url, new_parent, new_depth)) diff --git a/docs/examples/deepcrawl.py b/docs/examples/deepcrawl_example.py similarity index 76% rename from docs/examples/deepcrawl.py rename to docs/examples/deepcrawl_example.py index 6df716af..6b11179c 100644 --- a/docs/examples/deepcrawl.py +++ b/docs/examples/deepcrawl_example.py @@ -80,7 +80,7 @@ async def stream_vs_nonstream(): base_config = CrawlerRunConfig( deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False), scraping_strategy=LXMLWebScrapingStrategy(), - verbose=True, + verbose=False, ) async with AsyncWebCrawler() as crawler: @@ -212,11 +212,11 @@ async def filters_and_scorers(): # Create a keyword relevance scorer keyword_scorer = KeywordRelevanceScorer( - keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=0.3 + keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1 ) config = CrawlerRunConfig( - deep_crawl_strategy=BestFirstCrawlingStrategy( # Note: Changed to BestFirst + deep_crawl_strategy=BestFirstCrawlingStrategy( max_depth=1, include_external=False, url_scorer=keyword_scorer ), scraping_strategy=LXMLWebScrapingStrategy(), @@ -373,6 +373,104 @@ async def advanced_filters(): # Main function to run the entire tutorial +async def max_pages_and_thresholds(): + """ + PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies. 
+
+    This function shows:
+    - How to limit the number of pages crawled
+    - How to set score thresholds for more targeted crawling
+    - Comparing BFS, DFS, and Best-First strategies with these parameters
+    """
+    print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")
+
+    from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
+
+    async with AsyncWebCrawler() as crawler:
+        # Define a common keyword scorer for all examples
+        keyword_scorer = KeywordRelevanceScorer(
+            keywords=["browser", "crawler", "web", "automation"],
+            weight=1.0
+        )
+
+        # EXAMPLE 1: BFS WITH MAX PAGES
+        print("\nšŸ“Š EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
+        print("   Limit the crawler to a maximum of 5 pages")
+
+        bfs_config = CrawlerRunConfig(
+            deep_crawl_strategy=BFSDeepCrawlStrategy(
+                max_depth=2,
+                include_external=False,
+                url_scorer=keyword_scorer,
+                max_pages=5  # Crawl at most 5 pages
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+        )
+
+        results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config)
+
+        print(f"   āœ… Crawled {len(results)} pages (capped at 5 by max_pages)")
+        for result in results:
+            depth = result.metadata.get("depth", 0)
+            print(f"   → Depth: {depth} | {result.url}")
+
+        # EXAMPLE 2: DFS WITH SCORE THRESHOLD
+        print("\nšŸ“Š EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
+        print("   Only crawl pages with a relevance score above 0.7")
+
+        dfs_config = CrawlerRunConfig(
+            deep_crawl_strategy=DFSDeepCrawlStrategy(
+                max_depth=2,
+                include_external=False,
+                url_scorer=keyword_scorer,
+                score_threshold=0.7,  # Only process URLs with scores above 0.7
+                max_pages=10
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+        )
+
+        results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config)
+
+        print(f"   āœ… Crawled {len(results)} pages with scores above threshold")
+        for result in results:
+            score = result.metadata.get("score", 0)
+            depth = result.metadata.get("depth", 0)
+            print(f"   → Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+        # EXAMPLE 3: BEST-FIRST WITH MAX PAGES
+        print("\nšŸ“Š EXAMPLE 3: BEST-FIRST STRATEGY WITH MAX PAGES LIMIT")
+        print("   Limit to 7 pages, prioritizing the highest-scoring links first")
+
+        bf_config = CrawlerRunConfig(
+            deep_crawl_strategy=BestFirstCrawlingStrategy(
+                max_depth=2,
+                include_external=False,
+                url_scorer=keyword_scorer,
+                max_pages=7,  # Limit to 7 pages total
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+            stream=True,
+        )
+
+        results = []
+        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config):
+            results.append(result)
+            score = result.metadata.get("score", 0)
+            depth = result.metadata.get("depth", 0)
+            print(f"   → Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+        print(f"   āœ… Crawled {len(results)} high-value pages (highest scores first)")
+        if results:
+            avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
+            print(f"   āœ… Average score: {avg_score:.2f}")
+        print("   šŸ” Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
+
+
 async def run_tutorial():
     """
     Executes all tutorial sections in sequence.
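To exercise only this new section, the coroutine can also be run on its own (a sketch; it assumes the imports already present at the top of deepcrawl_example.py):

```python
import asyncio

# Run only the new PART 6 section from deepcrawl_example.py
if __name__ == "__main__":
    asyncio.run(max_pages_and_thresholds())
```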
@@ -384,9 +482,10 @@ async def run_tutorial(): # Define sections - uncomment to run specific parts during development tutorial_sections = [ - basic_deep_crawl, - stream_vs_nonstream, - filters_and_scorers, + # basic_deep_crawl, + # stream_vs_nonstream, + # filters_and_scorers, + max_pages_and_thresholds, # Added new section wrap_up, advanced_filters, ] diff --git a/docs/md_v2/core/deep-crawling.md b/docs/md_v2/core/deep-crawling.md index 9766208a..00834787 100644 --- a/docs/md_v2/core/deep-crawling.md +++ b/docs/md_v2/core/deep-crawling.md @@ -73,12 +73,18 @@ from crawl4ai.deep_crawling import BFSDeepCrawlStrategy strategy = BFSDeepCrawlStrategy( max_depth=2, # Crawl initial page + 2 levels deep include_external=False, # Stay within the same domain + max_pages=50, # Maximum number of pages to crawl (optional) + score_threshold=0.3, # Minimum score for URLs to be crawled (optional) ) ``` **Key parameters:** - **`max_depth`**: Number of levels to crawl beyond the starting page - **`include_external`**: Whether to follow links to other domains +- **`max_pages`**: Maximum number of pages to crawl (default: infinite) +- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf) +- **`filter_chain`**: FilterChain instance for URL filtering +- **`url_scorer`**: Scorer instance for evaluating URLs ### 2.2 DFSDeepCrawlStrategy (Depth-First Search) @@ -91,12 +97,18 @@ from crawl4ai.deep_crawling import DFSDeepCrawlStrategy strategy = DFSDeepCrawlStrategy( max_depth=2, # Crawl initial page + 2 levels deep include_external=False, # Stay within the same domain + max_pages=30, # Maximum number of pages to crawl (optional) + score_threshold=0.5, # Minimum score for URLs to be crawled (optional) ) ``` **Key parameters:** - **`max_depth`**: Number of levels to crawl beyond the starting page - **`include_external`**: Whether to follow links to other domains +- **`max_pages`**: Maximum number of pages to crawl (default: infinite) +- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf) +- **`filter_chain`**: FilterChain instance for URL filtering +- **`url_scorer`**: Scorer instance for evaluating URLs ### 2.3 BestFirstCrawlingStrategy (ā­ļø - Recommended Deep crawl strategy) @@ -116,7 +128,8 @@ scorer = KeywordRelevanceScorer( strategy = BestFirstCrawlingStrategy( max_depth=2, include_external=False, - url_scorer=scorer + url_scorer=scorer, + max_pages=25, # Maximum number of pages to crawl (optional) ) ``` @@ -124,6 +137,8 @@ This crawling approach: - Evaluates each discovered URL based on scorer criteria - Visits higher-scoring pages first - Helps focus crawl resources on the most relevant content +- Can limit total pages crawled with `max_pages` +- Does not need `score_threshold` as it naturally prioritizes by score --- @@ -410,27 +425,64 @@ if __name__ == "__main__": --- -## 8. Common Pitfalls & Tips +## 8. Limiting and Controlling Crawl Size -1.**Set realistic depth limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size. 
+### 8.1 Using max_pages
+
+You can limit the total number of pages crawled with the `max_pages` parameter:
+
+```python
+# Crawl at most 20 pages regardless of depth
+strategy = BFSDeepCrawlStrategy(
+    max_depth=3,
+    max_pages=20
+)
+```
+
+This feature is useful for:
+- Controlling API costs
+- Setting predictable execution times
+- Focusing on the most important content
+- Testing crawl configurations before full execution
+
+### 8.2 Using score_threshold
+
+For BFS and DFS strategies, you can set a minimum score threshold to only crawl high-quality pages:
+
+```python
+# Only follow links with scores above 0.4
+strategy = DFSDeepCrawlStrategy(
+    max_depth=2,
+    url_scorer=KeywordRelevanceScorer(keywords=["api", "guide", "reference"]),
+    score_threshold=0.4  # Skip URLs with scores below this value
+)
+```
+
+Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pages are already processed in order of highest score first.
+
+## 9. Common Pitfalls & Tips
+
+1.**Set realistic limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size. Use `max_pages` to set hard limits.
 
 2.**Don't neglect the scoring component.** BestFirstCrawling works best with well-tuned scorers. Experiment with keyword weights for optimal prioritization.
 
 3.**Be a good web citizen.** Respect robots.txt. (disabled by default)
+4.**Handle page errors gracefully.** Not all pages will be accessible. Check `result.success` and, on failure, `result.error_message` when processing results.
 
-4.**Handle page errors gracefully.** Not all pages will be accessible. Check `result.success` and `result.error_message` when processing results.
+5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
 
 ---
 
-## 9. Summary & Next Steps
+## 10. Summary & Next Steps
 
 In this **Deep Crawling with Crawl4AI** tutorial, you learned to:
 
-- Configure **BFSDeepCrawlStrategy** and **BestFirstCrawlingStrategy**
+- Configure **BFSDeepCrawlStrategy**, **DFSDeepCrawlStrategy**, and **BestFirstCrawlingStrategy**
 - Process results in streaming or non-streaming mode
 - Apply filters to target specific content
 - Use scorers to prioritize the most relevant pages
+- Limit crawls with `max_pages` and `score_threshold` parameters
 - Build a complete advanced crawler with combined techniques
 
 With these tools, you can efficiently extract structured data from websites at scale, focusing precisely on the content you need for your specific use case.
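Putting the documented pieces together, one possible end-to-end sketch (the URL, keywords, and `max_pages` value are illustrative; the streaming call pattern matches the examples above, and the import paths follow the docs, so adjust them if your version differs):

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

async def main():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            url_scorer=KeywordRelevanceScorer(keywords=["api", "guide"]),
            max_pages=15,  # hard cap on successful pages
        ),
        cache_mode=CacheMode.BYPASS,
        stream=True,  # yield results as they arrive
    )
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=config):
            if result.success:
                print(f"{result.metadata.get('score', 0):.2f}  {result.url}")

asyncio.run(main())
```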