fix: streamline URL status logging via a single entrypoint, i.e. logger.url_status

This commit is contained in:
Aravind Karnam
2025-03-20 18:59:15 +05:30
parent eedda1ae5c
commit ac2f9ae533
2 changed files with 205 additions and 68 deletions

View File

@@ -10,12 +10,17 @@ import asyncio
# from contextlib import nullcontext, asynccontextmanager # from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult from .models import (
CrawlResult,
MarkdownGenerationResult,
DispatchResult,
ScrapingResult,
)
from .async_database import async_db_manager from .async_database import async_db_manager
from .chunking_strategy import * # noqa: F403 from .chunking_strategy import * # noqa: F403
from .chunking_strategy import IdentityChunking from .chunking_strategy import IdentityChunking
from .content_filter_strategy import * # noqa: F403 from .content_filter_strategy import * # noqa: F403
from .extraction_strategy import * # noqa: F403 from .extraction_strategy import * # noqa: F403
from .extraction_strategy import NoExtractionStrategy from .extraction_strategy import NoExtractionStrategy
from .async_crawler_strategy import ( from .async_crawler_strategy import (
AsyncCrawlerStrategy, AsyncCrawlerStrategy,
@@ -30,7 +35,7 @@ from .markdown_generation_strategy import (
from .deep_crawling import DeepCrawlDecorator from .deep_crawling import DeepCrawlDecorator
from .async_logger import AsyncLogger, AsyncLoggerBase from .async_logger import AsyncLogger, AsyncLoggerBase
from .async_configs import BrowserConfig, CrawlerRunConfig from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_dispatcher import * # noqa: F403 from .async_dispatcher import * # noqa: F403
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
from .utils import ( from .utils import (
@@ -44,9 +49,10 @@ from .utils import (
from typing import Union, AsyncGenerator from typing import Union, AsyncGenerator
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult) CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] # RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
class CrawlResultContainer(Generic[CrawlResultT]): class CrawlResultContainer(Generic[CrawlResultT]):
def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]): def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
# Normalize to a list # Normalize to a list
@@ -68,20 +74,21 @@ class CrawlResultContainer(Generic[CrawlResultT]):
# Delegate attribute access to the first element. # Delegate attribute access to the first element.
if self._results: if self._results:
return getattr(self._results[0], attr) return getattr(self._results[0], attr)
raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'") raise AttributeError(
f"{self.__class__.__name__} object has no attribute '{attr}'"
)
def __repr__(self): def __repr__(self):
return f"{self.__class__.__name__}({self._results!r})" return f"{self.__class__.__name__}({self._results!r})"
# Redefine the union type. Now synchronous calls always return a container, # Redefine the union type. Now synchronous calls always return a container,
# while stream mode is handled with an AsyncGenerator. # while stream mode is handled with an AsyncGenerator.
RunManyReturn = Union[ RunManyReturn = Union[
CrawlResultContainer[CrawlResultT], CrawlResultContainer[CrawlResultT], AsyncGenerator[CrawlResultT, None]
AsyncGenerator[CrawlResultT, None]
] ]
class AsyncWebCrawler: class AsyncWebCrawler:
""" """
Asynchronous web crawler with flexible caching capabilities. Asynchronous web crawler with flexible caching capabilities.
@@ -210,24 +217,37 @@ class AsyncWebCrawler:
AsyncWebCrawler: The initialized crawler instance AsyncWebCrawler: The initialized crawler instance
""" """
# Check for builtin browser if requested # Check for builtin browser if requested
if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url: if (
self.browser_config.browser_mode == "builtin"
and not self.browser_config.cdp_url
):
# Import here to avoid circular imports # Import here to avoid circular imports
from .browser_profiler import BrowserProfiler from .browser_profiler import BrowserProfiler
profiler = BrowserProfiler(logger=self.logger) profiler = BrowserProfiler(logger=self.logger)
# Get builtin browser info or launch if needed # Get builtin browser info or launch if needed
browser_info = profiler.get_builtin_browser_info() browser_info = profiler.get_builtin_browser_info()
if not browser_info: if not browser_info:
self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER") self.logger.info(
"Builtin browser not found, launching new instance...",
tag="BROWSER",
)
cdp_url = await profiler.launch_builtin_browser() cdp_url = await profiler.launch_builtin_browser()
if not cdp_url: if not cdp_url:
self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER") self.logger.warning(
"Failed to launch builtin browser, falling back to dedicated browser",
tag="BROWSER",
)
else: else:
self.browser_config.cdp_url = cdp_url self.browser_config.cdp_url = cdp_url
self.browser_config.use_managed_browser = True self.browser_config.use_managed_browser = True
else: else:
self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER") self.logger.info(
self.browser_config.cdp_url = browser_info.get('cdp_url') f"Using existing builtin browser at {browser_info.get('cdp_url')}",
tag="BROWSER",
)
self.browser_config.cdp_url = browser_info.get("cdp_url")
self.browser_config.use_managed_browser = True self.browser_config.use_managed_browser = True
await self.crawler_strategy.__aenter__() await self.crawler_strategy.__aenter__()
@@ -319,9 +339,7 @@ class AsyncWebCrawler:
config.cache_mode = CacheMode.ENABLED config.cache_mode = CacheMode.ENABLED
# Create cache context # Create cache context
cache_context = CacheContext( cache_context = CacheContext(url, config.cache_mode, False)
url, config.cache_mode, False
)
# Initialize processing variables # Initialize processing variables
async_response: AsyncCrawlResponse = None async_response: AsyncCrawlResponse = None
@@ -383,14 +401,18 @@ class AsyncWebCrawler:
# Check robots.txt if enabled # Check robots.txt if enabled
if config and config.check_robots_txt: if config and config.check_robots_txt:
if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent): if not await self.robots_parser.can_fetch(
url, self.browser_config.user_agent
):
return CrawlResult( return CrawlResult(
url=url, url=url,
html="", html="",
success=False, success=False,
status_code=403, status_code=403,
error_message="Access denied by robots.txt", error_message="Access denied by robots.txt",
response_headers={"X-Robots-Status": "Blocked by robots.txt"} response_headers={
"X-Robots-Status": "Blocked by robots.txt"
},
) )
############################## ##############################
@@ -417,7 +439,7 @@ class AsyncWebCrawler:
############################################################### ###############################################################
# Process the HTML content, Call CrawlerStrategy.process_html # # Process the HTML content, Call CrawlerStrategy.process_html #
############################################################### ###############################################################
crawl_result : CrawlResult = await self.aprocess_html( crawl_result: CrawlResult = await self.aprocess_html(
url=url, url=url,
html=html, html=html,
extracted_content=extracted_content, extracted_content=extracted_content,
@@ -441,18 +463,11 @@ class AsyncWebCrawler:
crawl_result.success = bool(html) crawl_result.success = bool(html)
crawl_result.session_id = getattr(config, "session_id", None) crawl_result.session_id = getattr(config, "session_id", None)
self.logger.success( self.logger.url_status(
message="{url:.50}... | Status: {status} | Total: {timing}", url=cache_context.display_url,
success=crawl_result.success,
timing=time.perf_counter() - start_time,
tag="COMPLETE", tag="COMPLETE",
params={
"url": cache_context.display_url,
"status": crawl_result.success,
"timing": f"{time.perf_counter() - start_time:.2f}s",
},
colors={
"status": Fore.GREEN if crawl_result.success else Fore.RED,
"timing": Fore.YELLOW,
},
) )
# Update cache if appropriate # Update cache if appropriate
@@ -462,17 +477,12 @@ class AsyncWebCrawler:
return CrawlResultContainer(crawl_result) return CrawlResultContainer(crawl_result)
else: else:
self.logger.success( self.logger.url_status(
message="{url:.50}... | Status: {status} | Total: {timing}", url=cache_context.display_url,
tag="COMPLETE", success=True,
params={ timing=time.perf_counter() - start_time,
"url": cache_context.display_url, tag="COMPLETE"
"status": True,
"timing": f"{time.perf_counter() - start_time:.2f}s",
},
colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
) )
cached_result.success = bool(html) cached_result.success = bool(html)
cached_result.session_id = getattr(config, "session_id", None) cached_result.session_id = getattr(config, "session_id", None)
cached_result.redirected_url = cached_result.redirected_url or url cached_result.redirected_url = cached_result.redirected_url or url
@@ -494,7 +504,7 @@ class AsyncWebCrawler:
tag="ERROR", tag="ERROR",
) )
return CrawlResultContainer( return CrawlResultContainer(
CrawlResult( CrawlResult(
url=url, html="", success=False, error_message=error_message url=url, html="", success=False, error_message=error_message
) )
@@ -543,11 +553,10 @@ class AsyncWebCrawler:
# add keys from kwargs to params that doesn't exist in params # add keys from kwargs to params that doesn't exist in params
params.update({k: v for k, v in kwargs.items() if k not in params.keys()}) params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
################################ ################################
# Scraping Strategy Execution # # Scraping Strategy Execution #
################################ ################################
result : ScrapingResult = scraping_strategy.scrap(url, html, **params) result: ScrapingResult = scraping_strategy.scrap(url, html, **params)
if result is None: if result is None:
raise ValueError( raise ValueError(
@@ -593,11 +602,17 @@ class AsyncWebCrawler:
) )
# Log processing completion # Log processing completion
self.logger.info( self.logger.url_status(
message="{url:.50}... | Time: {timing}s", url=_url,
success=True,
timing=int((time.perf_counter() - t1) * 1000) / 1000,
tag="SCRAPE", tag="SCRAPE",
params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
) )
# self.logger.info(
# message="{url:.50}... | Time: {timing}s",
# tag="SCRAPE",
# params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
# )
################################ ################################
# Structured Content Extraction # # Structured Content Extraction #
@@ -681,8 +696,8 @@ class AsyncWebCrawler:
# pdf: bool = False, # pdf: bool = False,
# user_agent: str = None, # user_agent: str = None,
# verbose=True, # verbose=True,
**kwargs **kwargs,
) -> RunManyReturn: ) -> RunManyReturn:
""" """
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy. Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
@@ -738,24 +753,31 @@ class AsyncWebCrawler:
def transform_result(task_result): def transform_result(task_result):
return ( return (
setattr(task_result.result, 'dispatch_result', setattr(
DispatchResult( task_result.result,
task_id=task_result.task_id, "dispatch_result",
memory_usage=task_result.memory_usage, DispatchResult(
peak_memory=task_result.peak_memory, task_id=task_result.task_id,
start_time=task_result.start_time, memory_usage=task_result.memory_usage,
end_time=task_result.end_time, peak_memory=task_result.peak_memory,
error_message=task_result.error_message, start_time=task_result.start_time,
) end_time=task_result.end_time,
) or task_result.result error_message=task_result.error_message,
),
) )
or task_result.result
)
stream = config.stream stream = config.stream
if stream: if stream:
async def result_transformer(): async def result_transformer():
async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config): async for task_result in dispatcher.run_urls_stream(
crawler=self, urls=urls, config=config
):
yield transform_result(task_result) yield transform_result(task_result)
return result_transformer() return result_transformer()
else: else:
_results = await dispatcher.run_urls(crawler=self, urls=urls, config=config) _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)

115
deps.txt Normal file
View File

@@ -0,0 +1,115 @@
aiofiles==24.1.0
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiolimiter==1.2.1
aiosignal==1.3.2
aiosqlite==0.20.0
annotated-types==0.7.0
anyio==4.8.0
attrs==24.3.0
beautifulsoup4==4.12.3
certifi==2024.12.14
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.1
click==8.1.8
colorama==0.4.6
-e git+https://github.com/unclecode/crawl4ai.git@4359b1200377d86af3cd10fa98f91cf599b16d6a#egg=Crawl4AI
cryptography==44.0.0
cssselect==1.2.0
Cython==3.0.12
Deprecated==1.2.18
distro==1.9.0
dnspython==2.7.0
email_validator==2.2.0
fake-http-header==0.3.5
fake-useragent==2.0.3
fastapi==0.115.11
faust-cchardet==2.1.19
filelock==3.16.1
frozenlist==1.5.0
fsspec==2024.12.0
ghp-import==2.1.0
greenlet==3.1.1
gunicorn==23.0.0
h11==0.14.0
httpcore==1.0.7
httpx==0.27.2
huggingface-hub==0.27.1
humanize==4.12.1
idna==3.10
importlib_metadata==8.5.0
iniconfig==2.0.0
Jinja2==3.1.5
jiter==0.8.2
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
jwt==1.3.1
limits==4.2
litellm==1.59.0
lxml==5.3.0
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
mergedeep==1.3.4
mkdocs==1.6.1
mkdocs-get-deps==0.2.0
mkdocs-terminal==4.7.0
mockito==1.5.3
multidict==6.1.0
nltk==3.9.1
numpy==2.2.2
openai==1.59.9
packaging==24.2
pathspec==0.12.1
pdf2image==1.17.0
pillow==10.4.0
platformdirs==4.3.6
playwright==1.49.1
pluggy==1.5.0
prometheus-fastapi-instrumentator==7.0.2
prometheus_client==0.21.1
propcache==0.2.1
psutil==6.1.1
pycparser==2.22
pydantic==2.10.5
pydantic_core==2.27.2
pyee==12.0.0
Pygments==2.19.1
pymdown-extensions==10.14.3
pyOpenSSL==25.0.0
pytest==8.3.4
pytest-mockito==0.0.4
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
PyYAML==6.0.2
pyyaml_env_tag==0.1
rank-bm25==0.2.2
redis==5.2.1
referencing==0.36.1
regex==2024.11.6
requests==2.32.3
rich==13.9.4
rpds-py==0.22.3
six==1.17.0
slowapi==0.1.9
sniffio==1.3.1
snowballstemmer==2.2.0
soupsieve==2.6
starlette==0.46.1
tenacity==9.0.0
tf-playwright-stealth==1.1.0
tiktoken==0.8.0
tokenizers==0.21.0
tqdm==4.67.1
typing_extensions==4.12.2
urllib3==2.3.0
uvicorn==0.34.0
validators==0.34.0
watchdog==6.0.0
wrapt==1.17.2
xxhash==3.5.0
yarl==1.18.3
zipp==3.21.0