fix: streamline URL status logging via a single entrypoint, logger.url_status
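For context: the new call sites in this diff pass url, success, timing, and a tag to logger.url_status, whose implementation lives in async_logger.py and is not part of this commit. The stand-in below is only a sketch of the kind of line such a single entrypoint centralizes; the _DemoLogger class and its exact formatting are assumptions modeled on the removed logger.success(...) calls (colorama's Fore colors and the 50-character URL truncation come from those old calls).

# Sketch only: a stand-in for the real AsyncLogger.url_status, which is not shown in this diff.
from colorama import Fore, Style


class _DemoLogger:
    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH") -> None:
        # One place that knows how a per-URL status line is formatted.
        status_color = Fore.GREEN if success else Fore.RED
        print(
            f"[{tag}] {url[:50]}... | "
            f"Status: {status_color}{success}{Style.RESET_ALL} | "
            f"Total: {Fore.YELLOW}{timing:.2f}s{Style.RESET_ALL}"
        )


if __name__ == "__main__":
    _DemoLogger().url_status(
        url="https://example.com/very/long/path/to/a/page",
        success=True,
        timing=1.42,
        tag="COMPLETE",
    )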
@@ -10,12 +10,17 @@ import asyncio
 # from contextlib import nullcontext, asynccontextmanager
 from contextlib import asynccontextmanager
-from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult
+from .models import (
+    CrawlResult,
+    MarkdownGenerationResult,
+    DispatchResult,
+    ScrapingResult,
+)
 from .async_database import async_db_manager
 from .chunking_strategy import * # noqa: F403
 from .chunking_strategy import IdentityChunking
 from .content_filter_strategy import * # noqa: F403
 from .extraction_strategy import * # noqa: F403
 from .extraction_strategy import NoExtractionStrategy
 from .async_crawler_strategy import (
     AsyncCrawlerStrategy,
@@ -30,7 +35,7 @@ from .markdown_generation_strategy import (
 from .deep_crawling import DeepCrawlDecorator
 from .async_logger import AsyncLogger, AsyncLoggerBase
 from .async_configs import BrowserConfig, CrawlerRunConfig
 from .async_dispatcher import * # noqa: F403
 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter

 from .utils import (
@@ -44,9 +49,10 @@ from .utils import (

 from typing import Union, AsyncGenerator

-CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
+CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
 # RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]

+
 class CrawlResultContainer(Generic[CrawlResultT]):
     def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
         # Normalize to a list
@@ -68,20 +74,21 @@ class CrawlResultContainer(Generic[CrawlResultT]):
         # Delegate attribute access to the first element.
         if self._results:
             return getattr(self._results[0], attr)
-        raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
+        raise AttributeError(
+            f"{self.__class__.__name__} object has no attribute '{attr}'"
+        )

     def __repr__(self):
         return f"{self.__class__.__name__}({self._results!r})"


 # Redefine the union type. Now synchronous calls always return a container,
 # while stream mode is handled with an AsyncGenerator.
 RunManyReturn = Union[
-    CrawlResultContainer[CrawlResultT],
-    AsyncGenerator[CrawlResultT, None]
+    CrawlResultContainer[CrawlResultT], AsyncGenerator[CrawlResultT, None]
 ]



 class AsyncWebCrawler:
     """
     Asynchronous web crawler with flexible caching capabilities.
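Aside: CrawlResultContainer above delegates attribute access to its first element, so single-URL callers can keep reading fields directly off the returned container. A minimal, self-contained illustration of that delegation follows; DummyResult is a made-up stand-in for CrawlResult, and the container body is trimmed to the behavior shown in this hunk.

# Sketch: attribute delegation on CrawlResultContainer, using a dummy result object.
from dataclasses import dataclass


@dataclass
class DummyResult:
    url: str
    success: bool


class CrawlResultContainer:
    def __init__(self, results):
        # Normalize to a list, as in the diff above.
        self._results = results if isinstance(results, list) else [results]

    def __getattr__(self, attr):
        # Only reached when normal lookup fails; forwards to the first result.
        if self._results:
            return getattr(self._results[0], attr)
        raise AttributeError(
            f"{self.__class__.__name__} object has no attribute '{attr}'"
        )


container = CrawlResultContainer(DummyResult(url="https://example.com", success=True))
print(container.url)      # delegated to the first (only) result
print(container.success)  # True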
@@ -210,24 +217,37 @@ class AsyncWebCrawler:
             AsyncWebCrawler: The initialized crawler instance
         """
         # Check for builtin browser if requested
-        if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url:
+        if (
+            self.browser_config.browser_mode == "builtin"
+            and not self.browser_config.cdp_url
+        ):
             # Import here to avoid circular imports
             from .browser_profiler import BrowserProfiler

             profiler = BrowserProfiler(logger=self.logger)

             # Get builtin browser info or launch if needed
             browser_info = profiler.get_builtin_browser_info()
             if not browser_info:
-                self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER")
+                self.logger.info(
+                    "Builtin browser not found, launching new instance...",
+                    tag="BROWSER",
+                )
                 cdp_url = await profiler.launch_builtin_browser()
                 if not cdp_url:
-                    self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER")
+                    self.logger.warning(
+                        "Failed to launch builtin browser, falling back to dedicated browser",
+                        tag="BROWSER",
+                    )
                 else:
                     self.browser_config.cdp_url = cdp_url
                     self.browser_config.use_managed_browser = True
             else:
-                self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER")
-                self.browser_config.cdp_url = browser_info.get('cdp_url')
+                self.logger.info(
+                    f"Using existing builtin browser at {browser_info.get('cdp_url')}",
+                    tag="BROWSER",
+                )
+                self.browser_config.cdp_url = browser_info.get("cdp_url")
                 self.browser_config.use_managed_browser = True

         await self.crawler_strategy.__aenter__()
@@ -319,9 +339,7 @@ class AsyncWebCrawler:
                     config.cache_mode = CacheMode.ENABLED

                 # Create cache context
-                cache_context = CacheContext(
-                    url, config.cache_mode, False
-                )
+                cache_context = CacheContext(url, config.cache_mode, False)

                 # Initialize processing variables
                 async_response: AsyncCrawlResponse = None
@@ -383,14 +401,18 @@ class AsyncWebCrawler:

                     # Check robots.txt if enabled
                     if config and config.check_robots_txt:
-                        if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
+                        if not await self.robots_parser.can_fetch(
+                            url, self.browser_config.user_agent
+                        ):
                             return CrawlResult(
                                 url=url,
                                 html="",
                                 success=False,
                                 status_code=403,
                                 error_message="Access denied by robots.txt",
-                                response_headers={"X-Robots-Status": "Blocked by robots.txt"}
+                                response_headers={
+                                    "X-Robots-Status": "Blocked by robots.txt"
+                                },
                             )

                     ##############################
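Aside: a rough usage sketch of the robots.txt gate above, assuming the public crawl4ai API (AsyncWebCrawler as an async context manager, arun, and CrawlerRunConfig(check_robots_txt=True)); the fields checked here are exactly those constructed in the CrawlResult of this hunk. A real network fetch and a genuinely disallowed URL are needed to hit the branch.

import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig


async def main() -> None:
    async with AsyncWebCrawler() as crawler:
        # check_robots_txt turns on the gate shown in the hunk above.
        result = await crawler.arun(
            "https://example.com/private",
            config=CrawlerRunConfig(check_robots_txt=True),
        )
        if not result.success and result.status_code == 403:
            # Values as constructed in this hunk when robots.txt denies the fetch.
            print(result.error_message)                            # Access denied by robots.txt
            print(result.response_headers.get("X-Robots-Status"))  # Blocked by robots.txt


asyncio.run(main())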
@@ -417,7 +439,7 @@ class AsyncWebCrawler:
                     ###############################################################
                     # Process the HTML content, Call CrawlerStrategy.process_html #
                     ###############################################################
-                    crawl_result : CrawlResult = await self.aprocess_html(
+                    crawl_result: CrawlResult = await self.aprocess_html(
                         url=url,
                         html=html,
                         extracted_content=extracted_content,
@@ -441,18 +463,11 @@ class AsyncWebCrawler:
                     crawl_result.success = bool(html)
                     crawl_result.session_id = getattr(config, "session_id", None)

-                    self.logger.success(
-                        message="{url:.50}... | Status: {status} | Total: {timing}",
+                    self.logger.url_status(
+                        url=cache_context.display_url,
+                        success=crawl_result.success,
+                        timing=time.perf_counter() - start_time,
                         tag="COMPLETE",
-                        params={
-                            "url": cache_context.display_url,
-                            "status": crawl_result.success,
-                            "timing": f"{time.perf_counter() - start_time:.2f}s",
-                        },
-                        colors={
-                            "status": Fore.GREEN if crawl_result.success else Fore.RED,
-                            "timing": Fore.YELLOW,
-                        },
                     )

                     # Update cache if appropriate
@@ -462,17 +477,12 @@ class AsyncWebCrawler:
                     return CrawlResultContainer(crawl_result)

                 else:
-                    self.logger.success(
-                        message="{url:.50}... | Status: {status} | Total: {timing}",
-                        tag="COMPLETE",
-                        params={
-                            "url": cache_context.display_url,
-                            "status": True,
-                            "timing": f"{time.perf_counter() - start_time:.2f}s",
-                        },
-                        colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
+                    self.logger.url_status(
+                        url=cache_context.display_url,
+                        success=True,
+                        timing=time.perf_counter() - start_time,
+                        tag="COMPLETE"
                     )

                     cached_result.success = bool(html)
                     cached_result.session_id = getattr(config, "session_id", None)
                     cached_result.redirected_url = cached_result.redirected_url or url
@@ -494,7 +504,7 @@ class AsyncWebCrawler:
                     tag="ERROR",
                 )

                 return CrawlResultContainer(
                     CrawlResult(
                         url=url, html="", success=False, error_message=error_message
                     )
@@ -543,11 +553,10 @@ class AsyncWebCrawler:
        # add keys from kwargs to params that doesn't exist in params
        params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

-
        ################################
        # Scraping Strategy Execution #
        ################################
-        result : ScrapingResult = scraping_strategy.scrap(url, html, **params)
+        result: ScrapingResult = scraping_strategy.scrap(url, html, **params)

        if result is None:
            raise ValueError(
@@ -593,11 +602,17 @@ class AsyncWebCrawler:
        )

        # Log processing completion
-        self.logger.info(
-            message="{url:.50}... | Time: {timing}s",
+        self.logger.url_status(
+            url=_url,
+            success=True,
+            timing=int((time.perf_counter() - t1) * 1000) / 1000,
             tag="SCRAPE",
-            params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
        )
+        # self.logger.info(
+        #     message="{url:.50}... | Time: {timing}s",
+        #     tag="SCRAPE",
+        #     params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
+        # )

        ################################
        # Structured Content Extraction #
@@ -681,8 +696,8 @@ class AsyncWebCrawler:
        # pdf: bool = False,
        # user_agent: str = None,
        # verbose=True,
-        **kwargs
+        **kwargs,
    ) -> RunManyReturn:
        """
        Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.

@@ -738,24 +753,31 @@ class AsyncWebCrawler:

        def transform_result(task_result):
            return (
-                setattr(task_result.result, 'dispatch_result',
-                    DispatchResult(
-                        task_id=task_result.task_id,
-                        memory_usage=task_result.memory_usage,
-                        peak_memory=task_result.peak_memory,
-                        start_time=task_result.start_time,
-                        end_time=task_result.end_time,
-                        error_message=task_result.error_message,
-                    )
-                ) or task_result.result
+                setattr(
+                    task_result.result,
+                    "dispatch_result",
+                    DispatchResult(
+                        task_id=task_result.task_id,
+                        memory_usage=task_result.memory_usage,
+                        peak_memory=task_result.peak_memory,
+                        start_time=task_result.start_time,
+                        end_time=task_result.end_time,
+                        error_message=task_result.error_message,
+                    ),
+                )
+                or task_result.result
            )

        stream = config.stream

        if stream:

            async def result_transformer():
-                async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config):
+                async for task_result in dispatcher.run_urls_stream(
+                    crawler=self, urls=urls, config=config
+                ):
                    yield transform_result(task_result)

            return result_transformer()
        else:
            _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
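Aside: the setattr(...) or task_result.result expression in transform_result works because setattr returns None, so the or falls through to the result object after dispatch_result has been attached. A tiny standalone illustration of the idiom (the Obj class is made up for the demo):

class Obj:
    pass


def attach_and_return(obj: Obj, value: str) -> Obj:
    # setattr(...) evaluates to None, so `or obj` yields obj itself;
    # the side effect (the new attribute) has already happened.
    return setattr(obj, "dispatch_result", value) or obj


o = attach_and_return(Obj(), "metadata")
print(o.dispatch_result)  # "metadata"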
deps.txt  (new file, 115 lines added)
@@ -0,0 +1,115 @@
+aiofiles==24.1.0
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.11
+aiolimiter==1.2.1
+aiosignal==1.3.2
+aiosqlite==0.20.0
+annotated-types==0.7.0
+anyio==4.8.0
+attrs==24.3.0
+beautifulsoup4==4.12.3
+certifi==2024.12.14
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.4.1
+click==8.1.8
+colorama==0.4.6
+-e git+https://github.com/unclecode/crawl4ai.git@4359b1200377d86af3cd10fa98f91cf599b16d6a#egg=Crawl4AI
+cryptography==44.0.0
+cssselect==1.2.0
+Cython==3.0.12
+Deprecated==1.2.18
+distro==1.9.0
+dnspython==2.7.0
+email_validator==2.2.0
+fake-http-header==0.3.5
+fake-useragent==2.0.3
+fastapi==0.115.11
+faust-cchardet==2.1.19
+filelock==3.16.1
+frozenlist==1.5.0
+fsspec==2024.12.0
+ghp-import==2.1.0
+greenlet==3.1.1
+gunicorn==23.0.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.27.2
+huggingface-hub==0.27.1
+humanize==4.12.1
+idna==3.10
+importlib_metadata==8.5.0
+iniconfig==2.0.0
+Jinja2==3.1.5
+jiter==0.8.2
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jwt==1.3.1
+limits==4.2
+litellm==1.59.0
+lxml==5.3.0
+Markdown==3.7
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mergedeep==1.3.4
+mkdocs==1.6.1
+mkdocs-get-deps==0.2.0
+mkdocs-terminal==4.7.0
+mockito==1.5.3
+multidict==6.1.0
+nltk==3.9.1
+numpy==2.2.2
+openai==1.59.9
+packaging==24.2
+pathspec==0.12.1
+pdf2image==1.17.0
+pillow==10.4.0
+platformdirs==4.3.6
+playwright==1.49.1
+pluggy==1.5.0
+prometheus-fastapi-instrumentator==7.0.2
+prometheus_client==0.21.1
+propcache==0.2.1
+psutil==6.1.1
+pycparser==2.22
+pydantic==2.10.5
+pydantic_core==2.27.2
+pyee==12.0.0
+Pygments==2.19.1
+pymdown-extensions==10.14.3
+pyOpenSSL==25.0.0
+pytest==8.3.4
+pytest-mockito==0.0.4
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+PyYAML==6.0.2
+pyyaml_env_tag==0.1
+rank-bm25==0.2.2
+redis==5.2.1
+referencing==0.36.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.22.3
+six==1.17.0
+slowapi==0.1.9
+sniffio==1.3.1
+snowballstemmer==2.2.0
+soupsieve==2.6
+starlette==0.46.1
+tenacity==9.0.0
+tf-playwright-stealth==1.1.0
+tiktoken==0.8.0
+tokenizers==0.21.0
+tqdm==4.67.1
+typing_extensions==4.12.2
+urllib3==2.3.0
+uvicorn==0.34.0
+validators==0.34.0
+watchdog==6.0.0
+wrapt==1.17.2
+xxhash==3.5.0
+yarl==1.18.3
+zipp==3.21.0