fix: streamline url status logging via a single entrypoint, i.e. logger.url_status

Aravind Karnam
2025-03-20 18:59:15 +05:30
parent eedda1ae5c
commit ac2f9ae533
2 changed files with 205 additions and 68 deletions
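
The commit routes per-URL completion logs through a single logger.url_status call instead of hand-built logger.success/logger.info messages. The method itself lives in async_logger.py and is not part of this diff; the sketch below is only an assumption of its shape, inferred from the call sites further down (url, success, timing, tag):

    # Hypothetical sketch of the url_status entrypoint, inferred from the call
    # sites in this diff; the real method is defined on AsyncLogger elsewhere.
    def url_status(url: str, success: bool, timing: float, tag: str = "FETCH") -> None:
        status = "OK" if success else "FAILED"
        print(f"[{tag}] {url[:50]}... | Status: {status} | Time: {timing:.2f}s")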


@@ -10,12 +10,17 @@ import asyncio
# from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult
from .models import (
CrawlResult,
MarkdownGenerationResult,
DispatchResult,
ScrapingResult,
)
from .async_database import async_db_manager
from .chunking_strategy import * # noqa: F403
from .chunking_strategy import IdentityChunking
from .content_filter_strategy import * # noqa: F403
from .extraction_strategy import * # noqa: F403
from .extraction_strategy import NoExtractionStrategy
from .async_crawler_strategy import (
AsyncCrawlerStrategy,
@@ -30,7 +35,7 @@ from .markdown_generation_strategy import (
from .deep_crawling import DeepCrawlDecorator
from .async_logger import AsyncLogger, AsyncLoggerBase
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_dispatcher import * # noqa: F403
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
from .utils import (
@@ -44,9 +49,10 @@ from .utils import (
from typing import Union, AsyncGenerator
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
class CrawlResultContainer(Generic[CrawlResultT]):
def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
# Normalize to a list
@@ -68,20 +74,21 @@ class CrawlResultContainer(Generic[CrawlResultT]):
# Delegate attribute access to the first element.
if self._results:
return getattr(self._results[0], attr)
raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
raise AttributeError(
f"{self.__class__.__name__} object has no attribute '{attr}'"
)
def __repr__(self):
return f"{self.__class__.__name__}({self._results!r})"
# Redefine the union type. Now synchronous calls always return a container,
# while stream mode is handled with an AsyncGenerator.
RunManyReturn = Union[
CrawlResultContainer[CrawlResultT],
AsyncGenerator[CrawlResultT, None]
CrawlResultContainer[CrawlResultT], AsyncGenerator[CrawlResultT, None]
]
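
For context, CrawlResultContainer wraps one or more CrawlResult objects and forwards unknown attribute access to the first element, so single-URL callers can keep treating the return value like a plain CrawlResult. A minimal usage sketch, assuming the container also supports iteration over its results (URL is illustrative):

    import asyncio
    from crawl4ai import AsyncWebCrawler

    async def main():
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://example.com")  # CrawlResultContainer
            print(result.success)   # delegated to the first CrawlResult via __getattr__
            for r in result:        # assumed: container iterates over its results
                print(r.url)

    asyncio.run(main())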
class AsyncWebCrawler:
"""
Asynchronous web crawler with flexible caching capabilities.
@@ -193,7 +200,7 @@ class AsyncWebCrawler:
# Decorate arun method with deep crawling capabilities
self._deep_handler = DeepCrawlDecorator(self)
self.arun = self._deep_handler(self.arun)
async def start(self):
"""
@@ -210,26 +217,39 @@ class AsyncWebCrawler:
AsyncWebCrawler: The initialized crawler instance
"""
# Check for builtin browser if requested
if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url:
if (
self.browser_config.browser_mode == "builtin"
and not self.browser_config.cdp_url
):
# Import here to avoid circular imports
from .browser_profiler import BrowserProfiler
profiler = BrowserProfiler(logger=self.logger)
# Get builtin browser info or launch if needed
browser_info = profiler.get_builtin_browser_info()
if not browser_info:
self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER")
self.logger.info(
"Builtin browser not found, launching new instance...",
tag="BROWSER",
)
cdp_url = await profiler.launch_builtin_browser()
if not cdp_url:
self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER")
self.logger.warning(
"Failed to launch builtin browser, falling back to dedicated browser",
tag="BROWSER",
)
else:
self.browser_config.cdp_url = cdp_url
self.browser_config.use_managed_browser = True
else:
self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER")
self.browser_config.cdp_url = browser_info.get('cdp_url')
self.logger.info(
f"Using existing builtin browser at {browser_info.get('cdp_url')}",
tag="BROWSER",
)
self.browser_config.cdp_url = browser_info.get("cdp_url")
self.browser_config.use_managed_browser = True
await self.crawler_strategy.__aenter__()
await self.awarmup()
return self
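
The builtin-browser branch above only runs when browser_mode is "builtin" and no cdp_url has been supplied; in that case a BrowserProfiler either reuses an existing builtin browser or launches one and writes its CDP URL back into the config. A minimal sketch of a configuration that would take this path, assuming BrowserConfig accepts browser_mode as a constructor argument:

    # Sketch: a config that exercises the builtin-browser startup path above.
    from crawl4ai import AsyncWebCrawler, BrowserConfig

    browser_cfg = BrowserConfig(browser_mode="builtin")  # no cdp_url supplied
    crawler = AsyncWebCrawler(config=browser_cfg)
    # await crawler.start()  # reuses or launches the builtin browser via BrowserProfiler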
@@ -305,7 +325,7 @@ class AsyncWebCrawler:
# Auto-start if not ready
if not self.ready:
await self.start()
config = config or CrawlerRunConfig()
if not isinstance(url, str) or not url:
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
@@ -319,9 +339,7 @@ class AsyncWebCrawler:
config.cache_mode = CacheMode.ENABLED
# Create cache context
cache_context = CacheContext(
url, config.cache_mode, False
)
cache_context = CacheContext(url, config.cache_mode, False)
# Initialize processing variables
async_response: AsyncCrawlResponse = None
@@ -351,7 +369,7 @@ class AsyncWebCrawler:
# if config.screenshot and not screenshot or config.pdf and not pdf:
if config.screenshot and not screenshot_data:
cached_result = None
if config.pdf and not pdf_data:
cached_result = None
@@ -383,14 +401,18 @@ class AsyncWebCrawler:
# Check robots.txt if enabled
if config and config.check_robots_txt:
if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
if not await self.robots_parser.can_fetch(
url, self.browser_config.user_agent
):
return CrawlResult(
url=url,
html="",
success=False,
status_code=403,
error_message="Access denied by robots.txt",
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
response_headers={
"X-Robots-Status": "Blocked by robots.txt"
},
)
##############################
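
The robots.txt guard is opt-in through CrawlerRunConfig.check_robots_txt; when the parser denies access, arun short-circuits with the 403 CrawlResult built above instead of fetching the page. A hedged sketch of enabling it (URL is illustrative):

    # Sketch: opting into the robots.txt check handled by the block above.
    from crawl4ai import CrawlerRunConfig

    run_cfg = CrawlerRunConfig(check_robots_txt=True)
    # result = await crawler.arun("https://example.com", config=run_cfg)
    # if not result.success and result.status_code == 403:
    #     print(result.error_message)  # "Access denied by robots.txt"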
@@ -417,7 +439,7 @@ class AsyncWebCrawler:
###############################################################
# Process the HTML content, Call CrawlerStrategy.process_html #
###############################################################
crawl_result : CrawlResult = await self.aprocess_html(
crawl_result: CrawlResult = await self.aprocess_html(
url=url,
html=html,
extracted_content=extracted_content,
@@ -441,18 +463,11 @@ class AsyncWebCrawler:
crawl_result.success = bool(html)
crawl_result.session_id = getattr(config, "session_id", None)
self.logger.success(
message="{url:.50}... | Status: {status} | Total: {timing}",
self.logger.url_status(
url=cache_context.display_url,
success=crawl_result.success,
timing=time.perf_counter() - start_time,
tag="COMPLETE",
params={
"url": cache_context.display_url,
"status": crawl_result.success,
"timing": f"{time.perf_counter() - start_time:.2f}s",
},
colors={
"status": Fore.GREEN if crawl_result.success else Fore.RED,
"timing": Fore.YELLOW,
},
)
# Update cache if appropriate
@@ -462,17 +477,12 @@ class AsyncWebCrawler:
return CrawlResultContainer(crawl_result)
else:
self.logger.success(
message="{url:.50}... | Status: {status} | Total: {timing}",
tag="COMPLETE",
params={
"url": cache_context.display_url,
"status": True,
"timing": f"{time.perf_counter() - start_time:.2f}s",
},
colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
self.logger.url_status(
url=cache_context.display_url,
success=True,
timing=time.perf_counter() - start_time,
tag="COMPLETE"
)
cached_result.success = bool(html)
cached_result.session_id = getattr(config, "session_id", None)
cached_result.redirected_url = cached_result.redirected_url or url
@@ -494,7 +504,7 @@ class AsyncWebCrawler:
tag="ERROR",
)
return CrawlResultContainer(
CrawlResult(
url=url, html="", success=False, error_message=error_message
)
@@ -539,15 +549,14 @@ class AsyncWebCrawler:
# Process HTML content
params = config.__dict__.copy()
params.pop("url", None)
# add keys from kwargs to params that don't already exist in params
params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
################################
# Scraping Strategy Execution #
################################
result : ScrapingResult = scraping_strategy.scrap(url, html, **params)
result: ScrapingResult = scraping_strategy.scrap(url, html, **params)
if result is None:
raise ValueError(
@@ -593,11 +602,17 @@ class AsyncWebCrawler:
)
# Log processing completion
self.logger.info(
message="{url:.50}... | Time: {timing}s",
self.logger.url_status(
url=_url,
success=True,
timing=int((time.perf_counter() - t1) * 1000) / 1000,
tag="SCRAPE",
params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
)
# self.logger.info(
# message="{url:.50}... | Time: {timing}s",
# tag="SCRAPE",
# params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
# )
################################
# Structured Content Extraction #
@@ -667,7 +682,7 @@ class AsyncWebCrawler:
async def arun_many(
self,
urls: List[str],
config: Optional[CrawlerRunConfig] = None,
dispatcher: Optional[BaseDispatcher] = None,
# Legacy parameters maintained for backwards compatibility
# word_count_threshold=MIN_WORD_THRESHOLD,
@@ -681,8 +696,8 @@ class AsyncWebCrawler:
# pdf: bool = False,
# user_agent: str = None,
# verbose=True,
**kwargs
) -> RunManyReturn:
**kwargs,
) -> RunManyReturn:
"""
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
@@ -738,28 +753,35 @@ class AsyncWebCrawler:
def transform_result(task_result):
return (
setattr(task_result.result, 'dispatch_result',
DispatchResult(
task_id=task_result.task_id,
memory_usage=task_result.memory_usage,
peak_memory=task_result.peak_memory,
start_time=task_result.start_time,
end_time=task_result.end_time,
error_message=task_result.error_message,
)
) or task_result.result
setattr(
task_result.result,
"dispatch_result",
DispatchResult(
task_id=task_result.task_id,
memory_usage=task_result.memory_usage,
peak_memory=task_result.peak_memory,
start_time=task_result.start_time,
end_time=task_result.end_time,
error_message=task_result.error_message,
),
)
or task_result.result
)
stream = config.stream
if stream:
async def result_transformer():
async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config):
async for task_result in dispatcher.run_urls_stream(
crawler=self, urls=urls, config=config
):
yield transform_result(task_result)
return result_transformer()
else:
_results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
return [transform_result(res) for res in _results]
async def aclear_cache(self):
"""Clear the cache database."""

deps.txt (new file, 115 lines)

@@ -0,0 +1,115 @@
aiofiles==24.1.0
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiolimiter==1.2.1
aiosignal==1.3.2
aiosqlite==0.20.0
annotated-types==0.7.0
anyio==4.8.0
attrs==24.3.0
beautifulsoup4==4.12.3
certifi==2024.12.14
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.1
click==8.1.8
colorama==0.4.6
-e git+https://github.com/unclecode/crawl4ai.git@4359b1200377d86af3cd10fa98f91cf599b16d6a#egg=Crawl4AI
cryptography==44.0.0
cssselect==1.2.0
Cython==3.0.12
Deprecated==1.2.18
distro==1.9.0
dnspython==2.7.0
email_validator==2.2.0
fake-http-header==0.3.5
fake-useragent==2.0.3
fastapi==0.115.11
faust-cchardet==2.1.19
filelock==3.16.1
frozenlist==1.5.0
fsspec==2024.12.0
ghp-import==2.1.0
greenlet==3.1.1
gunicorn==23.0.0
h11==0.14.0
httpcore==1.0.7
httpx==0.27.2
huggingface-hub==0.27.1
humanize==4.12.1
idna==3.10
importlib_metadata==8.5.0
iniconfig==2.0.0
Jinja2==3.1.5
jiter==0.8.2
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
jwt==1.3.1
limits==4.2
litellm==1.59.0
lxml==5.3.0
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
mergedeep==1.3.4
mkdocs==1.6.1
mkdocs-get-deps==0.2.0
mkdocs-terminal==4.7.0
mockito==1.5.3
multidict==6.1.0
nltk==3.9.1
numpy==2.2.2
openai==1.59.9
packaging==24.2
pathspec==0.12.1
pdf2image==1.17.0
pillow==10.4.0
platformdirs==4.3.6
playwright==1.49.1
pluggy==1.5.0
prometheus-fastapi-instrumentator==7.0.2
prometheus_client==0.21.1
propcache==0.2.1
psutil==6.1.1
pycparser==2.22
pydantic==2.10.5
pydantic_core==2.27.2
pyee==12.0.0
Pygments==2.19.1
pymdown-extensions==10.14.3
pyOpenSSL==25.0.0
pytest==8.3.4
pytest-mockito==0.0.4
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
PyYAML==6.0.2
pyyaml_env_tag==0.1
rank-bm25==0.2.2
redis==5.2.1
referencing==0.36.1
regex==2024.11.6
requests==2.32.3
rich==13.9.4
rpds-py==0.22.3
six==1.17.0
slowapi==0.1.9
sniffio==1.3.1
snowballstemmer==2.2.0
soupsieve==2.6
starlette==0.46.1
tenacity==9.0.0
tf-playwright-stealth==1.1.0
tiktoken==0.8.0
tokenizers==0.21.0
tqdm==4.67.1
typing_extensions==4.12.2
urllib3==2.3.0
uvicorn==0.34.0
validators==0.34.0
watchdog==6.0.0
wrapt==1.17.2
xxhash==3.5.0
yarl==1.18.3
zipp==3.21.0