From 2327db6fdc3b49b122dbfd2dc85010adb378ca6c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 5 Mar 2025 22:23:08 +0800 Subject: [PATCH 1/4] refactor(crawler): introduce CrawlResultContainer and simplify interfaces Introduces a new generic CrawlResultContainer class to standardize return types and improve type safety. Removes legacy parameter handling and simplifies method signatures. This change makes the API more consistent and easier to maintain. BREAKING CHANGE: Synchronous crawler methods now always return CrawlResultContainer instead of raw CrawlResult or List[CrawlResult]. Legacy parameters have been removed from method signatures. --- crawl4ai/async_webcrawler.py | 161 +++++++++++++++-------------------- 1 file changed, 71 insertions(+), 90 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index b5a646c9..dd777a36 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -4,7 +4,7 @@ import sys import time from colorama import Fore from pathlib import Path -from typing import Optional, List +from typing import Optional, List, Generic, TypeVar import json import asyncio @@ -23,7 +23,7 @@ from .async_crawler_strategy import ( AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse, ) -from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode +from .cache_context import CacheMode, CacheContext from .markdown_generation_strategy import ( DefaultMarkdownGenerator, MarkdownGenerationStrategy, @@ -44,17 +44,46 @@ from .utils import ( RobotsParser, ) -from typing import Union, AsyncGenerator, TypeVar +from typing import Union, AsyncGenerator CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult) -RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] +# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] -DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] -DeepCrawlManyReturn = Union[ - List[List[CrawlResultT]], - AsyncGenerator[CrawlResultT, None], +class CrawlResultContainer(Generic[CrawlResultT]): + def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]): + # Normalize to a list + if isinstance(results, list): + self._results = results + else: + self._results = [results] + + def __iter__(self): + return iter(self._results) + + def __getitem__(self, index): + return self._results[index] + + def __len__(self): + return len(self._results) + + def __getattr__(self, attr): + # Delegate attribute access to the first element. + if self._results: + return getattr(self._results[0], attr) + raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'") + + def __repr__(self): + return f"{self.__class__.__name__}({self._results!r})" + +# Redefine the union type. Now synchronous calls always return a container, +# while stream mode is handled with an AsyncGenerator. +RunManyReturn = Union[ + CrawlResultContainer[CrawlResultT], + AsyncGenerator[CrawlResultT, None] ] + + class AsyncWebCrawler: """ Asynchronous web crawler with flexible caching capabilities. @@ -223,23 +252,6 @@ class AsyncWebCrawler: self, url: str, config: CrawlerRunConfig = None, - # Legacy parameters maintained for backwards compatibility - # word_count_threshold=MIN_WORD_THRESHOLD, - # extraction_strategy: ExtractionStrategy = None, - # chunking_strategy: ChunkingStrategy = RegexChunking(), - # content_filter: RelevantContentFilter = None, - # cache_mode: Optional[CacheMode] = None, - # Deprecated cache parameters - # bypass_cache: bool = False, - # disable_cache: bool = False, - # no_cache_read: bool = False, - # no_cache_write: bool = False, - # Other legacy parameters - # css_selector: str = None, - # screenshot: bool = False, - # pdf: bool = False, - # user_agent: str = None, - # verbose=True, **kwargs, ) -> RunManyReturn: """ @@ -270,47 +282,13 @@ class AsyncWebCrawler: Returns: CrawlResult: The result of crawling and processing """ - crawler_config = config or CrawlerRunConfig() + config = config or CrawlerRunConfig() if not isinstance(url, str) or not url: raise ValueError("Invalid URL, make sure the URL is a non-empty string") async with self._lock or self.nullcontext(): try: - self.logger.verbose = crawler_config.verbose - # Handle configuration - if crawler_config is not None: - config = crawler_config - else: - # Merge all parameters into a single kwargs dict for config creation - # config_kwargs = { - # "word_count_threshold": word_count_threshold, - # "extraction_strategy": extraction_strategy, - # "chunking_strategy": chunking_strategy, - # "content_filter": content_filter, - # "cache_mode": cache_mode, - # "bypass_cache": bypass_cache, - # "disable_cache": disable_cache, - # "no_cache_read": no_cache_read, - # "no_cache_write": no_cache_write, - # "css_selector": css_selector, - # "screenshot": screenshot, - # "pdf": pdf, - # "verbose": verbose, - # **kwargs, - # } - # config = CrawlerRunConfig.from_kwargs(config_kwargs) - pass - - # Handle deprecated cache parameters - # if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): - # # Convert legacy parameters if cache_mode not provided - # if config.cache_mode is None: - # config.cache_mode = _legacy_to_cache_mode( - # disable_cache=disable_cache, - # bypass_cache=bypass_cache, - # no_cache_read=no_cache_read, - # no_cache_write=no_cache_write, - # ) + self.logger.verbose = config.verbose # Default to ENABLED if no cache mode specified if config.cache_mode is None: @@ -457,7 +435,7 @@ class AsyncWebCrawler: if cache_context.should_write() and not bool(cached_result): await async_db_manager.acache_url(crawl_result) - return crawl_result + return CrawlResultContainer(crawl_result) else: self.logger.success( @@ -474,7 +452,7 @@ class AsyncWebCrawler: cached_result.success = bool(html) cached_result.session_id = getattr(config, "session_id", None) cached_result.redirected_url = cached_result.redirected_url or url - return cached_result + return CrawlResultContainer(cached_result) except Exception as e: error_context = get_error_context(sys.exc_info()) @@ -492,8 +470,10 @@ class AsyncWebCrawler: tag="ERROR", ) - return CrawlResult( - url=url, html="", success=False, error_message=error_message + return CrawlResultContainer( + CrawlResult( + url=url, html="", success=False, error_message=error_message + ) ) async def aprocess_html( @@ -669,17 +649,17 @@ class AsyncWebCrawler: config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, # Legacy parameters maintained for backwards compatibility - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - content_filter: RelevantContentFilter = None, - cache_mode: Optional[CacheMode] = None, - bypass_cache: bool = False, - css_selector: str = None, - screenshot: bool = False, - pdf: bool = False, - user_agent: str = None, - verbose=True, + # word_count_threshold=MIN_WORD_THRESHOLD, + # extraction_strategy: ExtractionStrategy = None, + # chunking_strategy: ChunkingStrategy = RegexChunking(), + # content_filter: RelevantContentFilter = None, + # cache_mode: Optional[CacheMode] = None, + # bypass_cache: bool = False, + # css_selector: str = None, + # screenshot: bool = False, + # pdf: bool = False, + # user_agent: str = None, + # verbose=True, **kwargs ) -> RunManyReturn: """ @@ -712,20 +692,21 @@ class AsyncWebCrawler: ): print(f"Processed {result.url}: {len(result.markdown)} chars") """ - if config is None: - config = CrawlerRunConfig( - word_count_threshold=word_count_threshold, - extraction_strategy=extraction_strategy, - chunking_strategy=chunking_strategy, - content_filter=content_filter, - cache_mode=cache_mode, - bypass_cache=bypass_cache, - css_selector=css_selector, - screenshot=screenshot, - pdf=pdf, - verbose=verbose, - **kwargs, - ) + config = config or CrawlerRunConfig() + # if config is None: + # config = CrawlerRunConfig( + # word_count_threshold=word_count_threshold, + # extraction_strategy=extraction_strategy, + # chunking_strategy=chunking_strategy, + # content_filter=content_filter, + # cache_mode=cache_mode, + # bypass_cache=bypass_cache, + # css_selector=css_selector, + # screenshot=screenshot, + # pdf=pdf, + # verbose=verbose, + # **kwargs, + # ) if dispatcher is None: dispatcher = MemoryAdaptiveDispatcher( From 29f7915b795418bbc8dec9218fa8e9acae167885 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 6 Mar 2025 20:30:57 +0800 Subject: [PATCH 2/4] fix(models): support float timestamps in CrawlStats Modify CrawlStats class to handle both datetime and float timestamp formats for start_time and end_time fields. This change improves compatibility with different time formats while maintaining existing functionality. Other minor changes: - Add datetime import in async_dispatcher - Update JsonElementExtractionStrategy kwargs handling No breaking changes. --- crawl4ai/async_dispatcher.py | 2 +- crawl4ai/extraction_strategy.py | 5 +++-- crawl4ai/models.py | 38 +++++++++++++++++++++++++++++---- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py index 69d276fb..b587d011 100644 --- a/crawl4ai/async_dispatcher.py +++ b/crawl4ai/async_dispatcher.py @@ -13,7 +13,7 @@ from rich.live import Live from rich.table import Table from rich.console import Console from rich import box -from datetime import timedelta +from datetime import timedelta, datetime from collections.abc import AsyncGenerator import time import psutil diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index e0e49d99..3b708421 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -1064,7 +1064,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy): api_token (str): Legacy Parameter. API token for LLM provider llm_config (LLMConfig): LLM configuration object prompt (str, optional): Custom prompt template to use - **kwargs: Additional args passed to perform_completion_with_backoff + **kwargs: Additional args passed to LLM processor Returns: dict: Generated schema following the JsonElementExtractionStrategy format @@ -1130,7 +1130,8 @@ In this scenario, use your best judgment to generate the schema. Try to maximize prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]), json_response = True, api_token=llm_config.api_token, - **kwargs + base_url=llm_config.base_url, + extra_args=kwargs ) # Extract and return schema diff --git a/crawl4ai/models.py b/crawl4ai/models.py index ef9efc06..c1caff94 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -37,13 +37,33 @@ class CrawlStatus(Enum): FAILED = "FAILED" +# @dataclass +# class CrawlStats: +# task_id: str +# url: str +# status: CrawlStatus +# start_time: Optional[datetime] = None +# end_time: Optional[datetime] = None +# memory_usage: float = 0.0 +# peak_memory: float = 0.0 +# error_message: str = "" + +# @property +# def duration(self) -> str: +# if not self.start_time: +# return "0:00" +# end = self.end_time or datetime.now() +# duration = end - self.start_time +# return str(timedelta(seconds=int(duration.total_seconds()))) + + @dataclass class CrawlStats: task_id: str url: str status: CrawlStatus - start_time: Optional[datetime] = None - end_time: Optional[datetime] = None + start_time: Optional[Union[datetime, float]] = None + end_time: Optional[Union[datetime, float]] = None memory_usage: float = 0.0 peak_memory: float = 0.0 error_message: str = "" @@ -52,11 +72,21 @@ class CrawlStats: def duration(self) -> str: if not self.start_time: return "0:00" + + # Convert start_time to datetime if it's a float + start = self.start_time + if isinstance(start, float): + start = datetime.fromtimestamp(start) + + # Get end time or use current time end = self.end_time or datetime.now() - duration = end - self.start_time + # Convert end_time to datetime if it's a float + if isinstance(end, float): + end = datetime.fromtimestamp(end) + + duration = end - start return str(timedelta(seconds=int(duration.total_seconds()))) - class DisplayMode(Enum): DETAILED = "DETAILED" AGGREGATED = "AGGREGATED" From 1b72880007ade6c4658551e61c337a438f498086 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 6 Mar 2025 20:32:32 +0800 Subject: [PATCH 3/4] chore(version): bump version to 0.5.0.post3 --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index e56feb58..1f2ef59b 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.5.0.post2" +__version__ = "0.5.0.post3" From f78c46446ba647f92175329b55373987ec843e2a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 6 Mar 2025 22:45:57 +0800 Subject: [PATCH 4/4] feat(deep-crawling): improve URL normalization and domain filtering Enhance URL handling in deep crawling with: - New URL normalization functions for consistent URL formats - Improved domain filtering with subdomain support - Added URLPatternFilter to public API - Better URL deduplication in BFS strategy These changes improve crawling accuracy and reduce duplicate visits. --- crawl4ai/__init__.py | 4 +- crawl4ai/__version__.py | 2 +- crawl4ai/deep_crawling/bfs_strategy.py | 10 +++- crawl4ai/deep_crawling/filters.py | 27 ++++++--- crawl4ai/utils.py | 79 +++++++++++++++++++++++++- docs/snippets/deep_crawl/intro.py | 78 +++++++++++++++++++++++++ 6 files changed, 186 insertions(+), 14 deletions(-) create mode 100644 docs/snippets/deep_crawl/intro.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 565bf93d..03cce871 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -48,8 +48,9 @@ from .deep_crawling import ( DeepCrawlStrategy, BFSDeepCrawlStrategy, FilterChain, - ContentTypeFilter, + URLPatternFilter, DomainFilter, + ContentTypeFilter, URLFilter, FilterStats, SEOFilter, @@ -75,6 +76,7 @@ __all__ = [ "BestFirstCrawlingStrategy", "DFSDeepCrawlStrategy", "FilterChain", + "URLPatternFilter", "ContentTypeFilter", "DomainFilter", "FilterStats", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 1f2ef59b..9477177b 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.5.0.post3" +__version__ = "0.5.0.post4" diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index 1ae4c4b9..54b72ea3 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -10,6 +10,7 @@ from .filters import FilterChain from .scorers import URLScorer from . import DeepCrawlStrategy from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult +from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl from math import inf as infinity class BFSDeepCrawlStrategy(DeepCrawlStrategy): @@ -99,14 +100,17 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): # First collect all valid links for link in links: url = link.get("href") - if url in visited: + # Strip URL fragments to avoid duplicate crawling + # base_url = url.split('#')[0] if url else url + base_url = normalize_url_for_deep_crawl(url, source_url) + if base_url in visited: continue if not await self.can_process_url(url, next_depth): self.stats.urls_skipped += 1 continue # Score the URL if a scorer is provided - score = self.url_scorer.score(url) if self.url_scorer else 0 + score = self.url_scorer.score(base_url) if self.url_scorer else 0 # Skip URLs with scores below the threshold if score < self.score_threshold: @@ -114,7 +118,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): self.stats.urls_skipped += 1 continue - valid_links.append((url, score)) + valid_links.append((base_url, score)) # If we have more valid links than capacity, sort by score and take the top ones if len(valid_links) > remaining_capacity: diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py index c8af3022..9fd8a72a 100644 --- a/crawl4ai/deep_crawling/filters.py +++ b/crawl4ai/deep_crawling/filters.py @@ -427,6 +427,11 @@ class DomainFilter(URLFilter): if isinstance(domains, str): return {domains.lower()} return {d.lower() for d in domains} + + @staticmethod + def _is_subdomain(domain: str, parent_domain: str) -> bool: + """Check if domain is a subdomain of parent_domain""" + return domain == parent_domain or domain.endswith(f".{parent_domain}") @staticmethod @lru_cache(maxsize=10000) @@ -444,20 +449,26 @@ class DomainFilter(URLFilter): domain = self._extract_domain(url) - # Early return for blocked domains - if domain in self._blocked_domains: - self._update_stats(False) - return False + # Check for blocked domains, including subdomains + for blocked in self._blocked_domains: + if self._is_subdomain(domain, blocked): + self._update_stats(False) + return False # If no allowed domains specified, accept all non-blocked if self._allowed_domains is None: self._update_stats(True) return True - # Final allowed domains check - result = domain in self._allowed_domains - self._update_stats(result) - return result + # Check if domain matches any allowed domain (including subdomains) + for allowed in self._allowed_domains: + if self._is_subdomain(domain, allowed): + self._update_stats(True) + return True + + # No matches found + self._update_stats(False) + return False class ContentRelevanceFilter(URLFilter): diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index b5a50eab..146ce06c 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1,5 +1,4 @@ import time -from urllib.parse import urlparse from concurrent.futures import ThreadPoolExecutor, as_completed from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString import json @@ -33,6 +32,8 @@ import hashlib from urllib.robotparser import RobotFileParser import aiohttp +from urllib.parse import urlparse, urlunparse +from functools import lru_cache from packaging import version from . import __version__ @@ -1962,6 +1963,82 @@ def normalize_url(href, base_url): return normalized +def normalize_url_for_deep_crawl(href, base_url): + """Normalize URLs to ensure consistent format""" + from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode + + # Handle None or empty values + if not href: + return None + + # Use urljoin to handle relative URLs + full_url = urljoin(base_url, href.strip()) + + # Parse the URL for normalization + parsed = urlparse(full_url) + + # Convert hostname to lowercase + netloc = parsed.netloc.lower() + + # Remove fragment entirely + fragment = '' + + # Normalize query parameters if needed + query = parsed.query + if query: + # Parse query parameters + params = parse_qs(query) + + # Remove tracking parameters (example - customize as needed) + tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid'] + for param in tracking_params: + if param in params: + del params[param] + + # Rebuild query string, sorted for consistency + query = urlencode(params, doseq=True) if params else '' + + # Build normalized URL + normalized = urlunparse(( + parsed.scheme, + netloc, + parsed.path.rstrip('/') or '/', # Normalize trailing slash + parsed.params, + query, + fragment + )) + + return normalized + +@lru_cache(maxsize=10000) +def efficient_normalize_url_for_deep_crawl(href, base_url): + """Efficient URL normalization with proper parsing""" + from urllib.parse import urljoin + + if not href: + return None + + # Resolve relative URLs + full_url = urljoin(base_url, href.strip()) + + # Use proper URL parsing + parsed = urlparse(full_url) + + # Only perform the most critical normalizations + # 1. Lowercase hostname + # 2. Remove fragment + normalized = urlunparse(( + parsed.scheme, + parsed.netloc.lower(), + parsed.path, + parsed.params, + parsed.query, + '' # Remove fragment + )) + + return normalized + + def normalize_url_tmp(href, base_url): """Normalize URLs to ensure consistent format""" # Extract protocol and domain from base URL diff --git a/docs/snippets/deep_crawl/intro.py b/docs/snippets/deep_crawl/intro.py new file mode 100644 index 00000000..d8fd2f94 --- /dev/null +++ b/docs/snippets/deep_crawl/intro.py @@ -0,0 +1,78 @@ +import asyncio +from typing import List + +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + BFSDeepCrawlStrategy, + CrawlResult, + FilterChain, + DomainFilter, + URLPatternFilter, +) + +# Import necessary classes from crawl4ai library: +# - AsyncWebCrawler: The main class for web crawling. +# - CrawlerRunConfig: Configuration class for crawler behavior. +# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy. +# - CrawlResult: Data model for individual crawl results. +# - FilterChain: Used to chain multiple URL filters. +# - URLPatternFilter: Filter URLs based on patterns. +# You had from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, which is also correct, +# but for simplicity and consistency, we will use the direct import from crawl4ai in this example, as it is re-exported in __init__.py + +async def basic_deep_crawl(): + """ + Performs a basic deep crawl starting from a seed URL, demonstrating: + - Breadth-First Search (BFS) deep crawling strategy. + - Filtering URLs based on URL patterns. + - Accessing crawl results and metadata. + """ + + # 1. Define URL Filters: + # Create a URLPatternFilter to include only URLs containing "text". + # This filter will be used to restrict crawling to URLs that are likely to contain textual content. + url_filter = URLPatternFilter( + patterns=[ + "*text*", # Include URLs that contain "text" in their path or URL + ] + ) + + # Create a DomainFilter to allow only URLs from the "groq.com" domain and block URLs from the "example.com" domain. + # This filter will be used to restrict crawling to URLs within the "groq.com" domain. + domain_filter = DomainFilter( + allowed_domains=["groq.com"], + blocked_domains=["example.com"], + ) + + # 2. Configure CrawlerRunConfig for Deep Crawling: + # Configure CrawlerRunConfig to use BFSDeepCrawlStrategy for deep crawling. + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, # Set the maximum depth of crawling to 2 levels from the start URL + max_pages=10, # Limit the total number of pages to crawl to 10, to prevent excessive crawling + include_external=False, # Set to False to only crawl URLs within the same domain as the start URL + filter_chain=FilterChain(filters=[url_filter, domain_filter]), # Apply the URLPatternFilter and DomainFilter to filter URLs during deep crawl + ), + verbose=True, # Enable verbose logging to see detailed output during crawling + ) + + # 3. Initialize and Run AsyncWebCrawler: + # Use AsyncWebCrawler as a context manager for automatic start and close. + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun( + # url="https://docs.crawl4ai.com", # Uncomment to use crawl4ai documentation as start URL + url="https://console.groq.com/docs", # Set the start URL for deep crawling to Groq documentation + config=config, # Pass the configured CrawlerRunConfig to arun method + ) + + # 4. Process and Print Crawl Results: + # Iterate through the list of CrawlResult objects returned by the deep crawl. + for result in results: + # Print the URL and its crawl depth from the metadata for each crawled URL. + print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}") + + +if __name__ == "__main__": + import asyncio + asyncio.run(basic_deep_crawl())