diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index a6374e89..98111e4b 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -10,12 +10,17 @@ import asyncio
 # from contextlib import nullcontext, asynccontextmanager
 from contextlib import asynccontextmanager

-from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult
+from .models import (
+    CrawlResult,
+    MarkdownGenerationResult,
+    DispatchResult,
+    ScrapingResult,
+)
 from .async_database import async_db_manager
 from .chunking_strategy import *  # noqa: F403
 from .chunking_strategy import IdentityChunking
 from .content_filter_strategy import *  # noqa: F403
-from .extraction_strategy import * # noqa: F403
+from .extraction_strategy import *  # noqa: F403
 from .extraction_strategy import NoExtractionStrategy
 from .async_crawler_strategy import (
     AsyncCrawlerStrategy,
@@ -30,7 +35,7 @@ from .markdown_generation_strategy import (
 from .deep_crawling import DeepCrawlDecorator
 from .async_logger import AsyncLogger, AsyncLoggerBase
 from .async_configs import BrowserConfig, CrawlerRunConfig
-from .async_dispatcher import * # noqa: F403
+from .async_dispatcher import *  # noqa: F403
 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter

 from .utils import (
@@ -44,9 +49,10 @@ from .utils import (

 from typing import Union, AsyncGenerator

-CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
+CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
 # RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]

+
 class CrawlResultContainer(Generic[CrawlResultT]):
     def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
         # Normalize to a list
@@ -68,20 +74,21 @@ class CrawlResultContainer(Generic[CrawlResultT]):
         # Delegate attribute access to the first element.
         if self._results:
             return getattr(self._results[0], attr)
-        raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
+        raise AttributeError(
+            f"{self.__class__.__name__} object has no attribute '{attr}'"
+        )

     def __repr__(self):
         return f"{self.__class__.__name__}({self._results!r})"

+
 # Redefine the union type. Now synchronous calls always return a container,
 # while stream mode is handled with an AsyncGenerator.
 RunManyReturn = Union[
-    CrawlResultContainer[CrawlResultT],
-    AsyncGenerator[CrawlResultT, None]
+    CrawlResultContainer[CrawlResultT], AsyncGenerator[CrawlResultT, None]
 ]

-
 class AsyncWebCrawler:
     """
     Asynchronous web crawler with flexible caching capabilities.
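
Reviewer note: the new CrawlResultContainer above is what keeps the revised
RunManyReturn type backward compatible. Attribute access on a container is
delegated to the first CrawlResult, and __init__ normalizes the payload to a
list, so existing single-result code and list-iterating code both keep
working. A minimal usage sketch, assuming the elided container methods
include __iter__ (which the list normalization suggests) and that crawl4ai
exports AsyncWebCrawler at the top level; the URL is illustrative:

    import asyncio

    from crawl4ai import AsyncWebCrawler

    async def main():
        async with AsyncWebCrawler() as crawler:
            # arun() now wraps its single CrawlResult in a CrawlResultContainer;
            # __getattr__ delegates to the first element, so legacy attribute
            # reads such as result.success keep working unchanged.
            result = await crawler.arun("https://example.com")
            print(result.success)

            # The container normalizes its payload to a list, so it can also be
            # iterated like the plain list arun_many() used to return.
            for r in result:
                print(r.url)

    asyncio.run(main())
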
@@ -193,7 +200,7 @@ class AsyncWebCrawler:

         # Decorate arun method with deep crawling capabilities
         self._deep_handler = DeepCrawlDecorator(self)
-        self.arun = self._deep_handler(self.arun)        
+        self.arun = self._deep_handler(self.arun)

     async def start(self):
         """
@@ -210,26 +217,39 @@ class AsyncWebCrawler:
             AsyncWebCrawler: The initialized crawler instance
         """
         # Check for builtin browser if requested
-        if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url:
+        if (
+            self.browser_config.browser_mode == "builtin"
+            and not self.browser_config.cdp_url
+        ):
             # Import here to avoid circular imports
             from .browser_profiler import BrowserProfiler
+
             profiler = BrowserProfiler(logger=self.logger)
-            
+
             # Get builtin browser info or launch if needed
             browser_info = profiler.get_builtin_browser_info()
             if not browser_info:
-                self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER")
+                self.logger.info(
+                    "Builtin browser not found, launching new instance...",
+                    tag="BROWSER",
+                )
                 cdp_url = await profiler.launch_builtin_browser()
                 if not cdp_url:
-                    self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER")
+                    self.logger.warning(
+                        "Failed to launch builtin browser, falling back to dedicated browser",
+                        tag="BROWSER",
+                    )
                 else:
                     self.browser_config.cdp_url = cdp_url
                     self.browser_config.use_managed_browser = True
             else:
-                self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER")
-                self.browser_config.cdp_url = browser_info.get('cdp_url')
+                self.logger.info(
+                    f"Using existing builtin browser at {browser_info.get('cdp_url')}",
+                    tag="BROWSER",
+                )
+                self.browser_config.cdp_url = browser_info.get("cdp_url")
                 self.browser_config.use_managed_browser = True
-        
+
         await self.crawler_strategy.__aenter__()
         await self.awarmup()
         return self
@@ -305,7 +325,7 @@ class AsyncWebCrawler:

             # Auto-start if not ready
             if not self.ready:
                 await self.start()
-            
+
             config = config or CrawlerRunConfig()
             if not isinstance(url, str) or not url:
                 raise ValueError("Invalid URL, make sure the URL is a non-empty string")
@@ -319,9 +339,7 @@ class AsyncWebCrawler:
                 config.cache_mode = CacheMode.ENABLED

             # Create cache context
-            cache_context = CacheContext(
-                url, config.cache_mode, False
-            )
+            cache_context = CacheContext(url, config.cache_mode, False)

             # Initialize processing variables
             async_response: AsyncCrawlResponse = None
@@ -351,7 +369,7 @@ class AsyncWebCrawler:
                 # if config.screenshot and not screenshot or config.pdf and not pdf:
                 if config.screenshot and not screenshot_data:
                     cached_result = None
-                
+
                 if config.pdf and not pdf_data:
                     cached_result = None
@@ -383,14 +401,18 @@ class AsyncWebCrawler:

                 # Check robots.txt if enabled
                 if config and config.check_robots_txt:
-                    if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
+                    if not await self.robots_parser.can_fetch(
+                        url, self.browser_config.user_agent
+                    ):
                         return CrawlResult(
                             url=url,
                             html="",
                             success=False,
                             status_code=403,
                             error_message="Access denied by robots.txt",
-                            response_headers={"X-Robots-Status": "Blocked by robots.txt"}
+                            response_headers={
+                                "X-Robots-Status": "Blocked by robots.txt"
+                            },
                         )

                 ##############################
@@ -417,7 +439,7 @@ class AsyncWebCrawler:
                 ###############################################################
                 # Process the HTML content, Call CrawlerStrategy.process_html #
                 ###############################################################
-                crawl_result : CrawlResult = await self.aprocess_html(
+                crawl_result: CrawlResult = await self.aprocess_html(
                     url=url,
                     html=html,
                     extracted_content=extracted_content,
@@ -441,18 +463,11 @@ class AsyncWebCrawler:
                 crawl_result.success = bool(html)
                 crawl_result.session_id = getattr(config, "session_id", None)

-                self.logger.success(
-                    message="{url:.50}... | Status: {status} | Total: {timing}",
+                self.logger.url_status(
+                    url=cache_context.display_url,
+                    success=crawl_result.success,
+                    timing=time.perf_counter() - start_time,
                     tag="COMPLETE",
-                    params={
-                        "url": cache_context.display_url,
-                        "status": crawl_result.success,
-                        "timing": f"{time.perf_counter() - start_time:.2f}s",
-                    },
-                    colors={
-                        "status": Fore.GREEN if crawl_result.success else Fore.RED,
-                        "timing": Fore.YELLOW,
-                    },
                 )

                 # Update cache if appropriate
@@ -462,17 +477,12 @@ class AsyncWebCrawler:
                 return CrawlResultContainer(crawl_result)

             else:
-                self.logger.success(
-                    message="{url:.50}... | Status: {status} | Total: {timing}",
-                    tag="COMPLETE",
-                    params={
-                        "url": cache_context.display_url,
-                        "status": True,
-                        "timing": f"{time.perf_counter() - start_time:.2f}s",
-                    },
-                    colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
+                self.logger.url_status(
+                    url=cache_context.display_url,
+                    success=True,
+                    timing=time.perf_counter() - start_time,
+                    tag="COMPLETE"
                 )
-
                 cached_result.success = bool(html)
                 cached_result.session_id = getattr(config, "session_id", None)
                 cached_result.redirected_url = cached_result.redirected_url or url
@@ -494,7 +504,7 @@ class AsyncWebCrawler:
                     tag="ERROR",
                 )

-            return CrawlResultContainer(            
+            return CrawlResultContainer(
                 CrawlResult(
                     url=url, html="", success=False, error_message=error_message
                 )
@@ -539,15 +549,14 @@ class AsyncWebCrawler:

         # Process HTML content
         params = config.__dict__.copy()
-        params.pop("url", None)        
+        params.pop("url", None)
         # add keys from kwargs to params that doesn't exist in params
         params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
-
         ################################
         # Scraping Strategy Execution #
         ################################
-        result : ScrapingResult = scraping_strategy.scrap(url, html, **params)
+        result: ScrapingResult = scraping_strategy.scrap(url, html, **params)

         if result is None:
             raise ValueError(
@@ -593,11 +602,17 @@ class AsyncWebCrawler:
             )

         # Log processing completion
-        self.logger.info(
-            message="{url:.50}... | Time: {timing}s",
+        self.logger.url_status(
+            url=_url,
+            success=True,
+            timing=int((time.perf_counter() - t1) * 1000) / 1000,
             tag="SCRAPE",
-            params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
         )
+        # self.logger.info(
+        #     message="{url:.50}... | Time: {timing}s",
+        #     tag="SCRAPE",
+        #     params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
+        # )

         ################################
         # Structured Content Extraction #
         ################################
@@ -667,7 +682,7 @@ class AsyncWebCrawler:
     async def arun_many(
         self,
         urls: List[str],
-        config: Optional[CrawlerRunConfig] = None,        
+        config: Optional[CrawlerRunConfig] = None,
         dispatcher: Optional[BaseDispatcher] = None,
         # Legacy parameters maintained for backwards compatibility
         # word_count_threshold=MIN_WORD_THRESHOLD,
@@ -681,8 +696,8 @@ class AsyncWebCrawler:
         # pdf: bool = False,
         # user_agent: str = None,
         # verbose=True,
-        **kwargs
-    ) -> RunManyReturn: 
+        **kwargs,
+    ) -> RunManyReturn:
         """
         Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
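
Reviewer note: the logging changes in the hunks above replace hand-formatted
logger.success()/logger.info() calls (with their params/colors dictionaries)
by a single logger.url_status() helper. Its definition is not part of this
diff; inferred purely from the call sites, it takes url, success, timing, and
tag. A hypothetical, self-contained stand-in for review context only:

    def url_status(url: str, success: bool, timing: float, tag: str = "COMPLETE") -> None:
        """Hypothetical stand-in for AsyncLogger.url_status(), reconstructed
        from the keyword arguments used at the call sites in this diff."""
        # Mirrors the old message format: truncated URL, status flag, elapsed time.
        print(f"[{tag}] {url[:50]}... | Status: {success} | Total: {timing:.2f}s")

    url_status("https://example.com", success=True, timing=1.234, tag="COMPLETE")
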
@@ -738,28 +753,35 @@ class AsyncWebCrawler:

         def transform_result(task_result):
             return (
-                setattr(task_result.result, 'dispatch_result',
-                    DispatchResult(
-                        task_id=task_result.task_id,
-                        memory_usage=task_result.memory_usage,
-                        peak_memory=task_result.peak_memory,
-                        start_time=task_result.start_time,
-                        end_time=task_result.end_time,
-                        error_message=task_result.error_message,
-                    )
-                ) or task_result.result
-            )
+                setattr(
+                    task_result.result,
+                    "dispatch_result",
+                    DispatchResult(
+                        task_id=task_result.task_id,
+                        memory_usage=task_result.memory_usage,
+                        peak_memory=task_result.peak_memory,
+                        start_time=task_result.start_time,
+                        end_time=task_result.end_time,
+                        error_message=task_result.error_message,
+                    ),
+                )
+                or task_result.result
+            )

         stream = config.stream
-        
+
         if stream:
+
             async def result_transformer():
-                async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config):
+                async for task_result in dispatcher.run_urls_stream(
+                    crawler=self, urls=urls, config=config
+                ):
                     yield transform_result(task_result)
+
             return result_transformer()
         else:
             _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
-            return [transform_result(res) for res in _results]            
+            return [transform_result(res) for res in _results]

     async def aclear_cache(self):
         """Clear the cache database."""
diff --git a/deps.txt b/deps.txt
new file mode 100644
index 00000000..1d085f0f
--- /dev/null
+++ b/deps.txt
@@ -0,0 +1,115 @@
+aiofiles==24.1.0
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.11
+aiolimiter==1.2.1
+aiosignal==1.3.2
+aiosqlite==0.20.0
+annotated-types==0.7.0
+anyio==4.8.0
+attrs==24.3.0
+beautifulsoup4==4.12.3
+certifi==2024.12.14
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.4.1
+click==8.1.8
+colorama==0.4.6
+-e git+https://github.com/unclecode/crawl4ai.git@4359b1200377d86af3cd10fa98f91cf599b16d6a#egg=Crawl4AI
+cryptography==44.0.0
+cssselect==1.2.0
+Cython==3.0.12
+Deprecated==1.2.18
+distro==1.9.0
+dnspython==2.7.0
+email_validator==2.2.0
+fake-http-header==0.3.5
+fake-useragent==2.0.3
+fastapi==0.115.11
+faust-cchardet==2.1.19
+filelock==3.16.1
+frozenlist==1.5.0
+fsspec==2024.12.0
+ghp-import==2.1.0
+greenlet==3.1.1
+gunicorn==23.0.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.27.2
+huggingface-hub==0.27.1
+humanize==4.12.1
+idna==3.10
+importlib_metadata==8.5.0
+iniconfig==2.0.0
+Jinja2==3.1.5
+jiter==0.8.2
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jwt==1.3.1
+limits==4.2
+litellm==1.59.0
+lxml==5.3.0
+Markdown==3.7
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mergedeep==1.3.4
+mkdocs==1.6.1
+mkdocs-get-deps==0.2.0
+mkdocs-terminal==4.7.0
+mockito==1.5.3
+multidict==6.1.0
+nltk==3.9.1
+numpy==2.2.2
+openai==1.59.9
+packaging==24.2
+pathspec==0.12.1
+pdf2image==1.17.0
+pillow==10.4.0
+platformdirs==4.3.6
+playwright==1.49.1
+pluggy==1.5.0
+prometheus-fastapi-instrumentator==7.0.2
+prometheus_client==0.21.1
+propcache==0.2.1
+psutil==6.1.1
+pycparser==2.22
+pydantic==2.10.5
+pydantic_core==2.27.2
+pyee==12.0.0
+Pygments==2.19.1
+pymdown-extensions==10.14.3
+pyOpenSSL==25.0.0
+pytest==8.3.4
+pytest-mockito==0.0.4
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+PyYAML==6.0.2
+pyyaml_env_tag==0.1
+rank-bm25==0.2.2
+redis==5.2.1
+referencing==0.36.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.22.3
+six==1.17.0
+slowapi==0.1.9
+sniffio==1.3.1
+snowballstemmer==2.2.0
+soupsieve==2.6
+starlette==0.46.1
+tenacity==9.0.0
+tf-playwright-stealth==1.1.0
+tiktoken==0.8.0
+tokenizers==0.21.0
+tqdm==4.67.1
+typing_extensions==4.12.2
+urllib3==2.3.0
+uvicorn==0.34.0
+validators==0.34.0
+watchdog==6.0.0
+wrapt==1.17.2
+xxhash==3.5.0
+yarl==1.18.3
+zipp==3.21.0
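
Reviewer note: taken together, the arun_many() changes mean batch mode now
returns one transformed result per URL (each annotated with a dispatch_result
carrying the dispatcher's memory and timing bookkeeping), while
config.stream=True returns an async generator that yields results as each
crawl completes. A usage sketch, assuming the default dispatcher
(MemoryAdaptiveDispatcher) applies when none is passed; URLs are
illustrative:

    import asyncio

    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

    async def main():
        urls = ["https://example.com", "https://example.org"]
        async with AsyncWebCrawler() as crawler:
            # Batch mode: awaiting arun_many() yields one result per URL, each
            # annotated by transform_result() with a dispatch_result.
            results = await crawler.arun_many(urls, config=CrawlerRunConfig(stream=False))
            for r in results:
                print(r.url, r.success, r.dispatch_result.task_id)

            # Stream mode: the awaited call hands back an async generator, so
            # results can be consumed as soon as each crawl finishes.
            async for r in await crawler.arun_many(urls, config=CrawlerRunConfig(stream=True)):
                print(r.url, r.success)

    asyncio.run(main())
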