fix: streamline url status logging via single entrypoint i.e. logger.url_status
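The logger.success() and logger.info() call sites below each rebuilt the same "URL | status | timing" line with their own message template, params, and colors dictionaries; they now route through the logger's url_status() entrypoint, so the truncation, coloring, and timing format live in one place (AsyncLogger, or any AsyncLoggerBase implementation). A minimal sketch of the new call shape, using only the keyword arguments visible in this diff; the AsyncLogger constructor argument is an assumption and may differ between versions:

# Sketch, not part of the diff: calling the single entrypoint directly.
import time

from crawl4ai.async_logger import AsyncLogger

logger = AsyncLogger(verbose=True)  # constructor arguments are assumed; check your version

start_time = time.perf_counter()
# ... fetch and process a URL here ...

logger.url_status(
    url="https://example.com/some/long/path",  # placeholder URL
    success=True,
    timing=time.perf_counter() - start_time,
    tag="COMPLETE",
)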
@@ -10,12 +10,17 @@ import asyncio
 # from contextlib import nullcontext, asynccontextmanager
 from contextlib import asynccontextmanager
-from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult
+from .models import (
+    CrawlResult,
+    MarkdownGenerationResult,
+    DispatchResult,
+    ScrapingResult,
+)
 from .async_database import async_db_manager
 from .chunking_strategy import *  # noqa: F403
 from .chunking_strategy import IdentityChunking
 from .content_filter_strategy import *  # noqa: F403
 from .extraction_strategy import *  # noqa: F403
 from .extraction_strategy import NoExtractionStrategy
 from .async_crawler_strategy import (
     AsyncCrawlerStrategy,
@@ -30,7 +35,7 @@ from .markdown_generation_strategy import (
 from .deep_crawling import DeepCrawlDecorator
 from .async_logger import AsyncLogger, AsyncLoggerBase
 from .async_configs import BrowserConfig, CrawlerRunConfig
 from .async_dispatcher import *  # noqa: F403
 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter

 from .utils import (
@@ -44,9 +49,10 @@ from .utils import (

 from typing import Union, AsyncGenerator

-CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
+CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
 # RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]


 class CrawlResultContainer(Generic[CrawlResultT]):
     def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
         # Normalize to a list
@@ -68,20 +74,21 @@ class CrawlResultContainer(Generic[CrawlResultT]):
         # Delegate attribute access to the first element.
         if self._results:
             return getattr(self._results[0], attr)
-        raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
+        raise AttributeError(
+            f"{self.__class__.__name__} object has no attribute '{attr}'"
+        )

     def __repr__(self):
         return f"{self.__class__.__name__}({self._results!r})"


 # Redefine the union type. Now synchronous calls always return a container,
 # while stream mode is handled with an AsyncGenerator.
 RunManyReturn = Union[
-    CrawlResultContainer[CrawlResultT],
-    AsyncGenerator[CrawlResultT, None]
+    CrawlResultContainer[CrawlResultT], AsyncGenerator[CrawlResultT, None]
 ]


 class AsyncWebCrawler:
     """
     Asynchronous web crawler with flexible caching capabilities.
@@ -193,7 +200,7 @@ class AsyncWebCrawler:

         # Decorate arun method with deep crawling capabilities
         self._deep_handler = DeepCrawlDecorator(self)
         self.arun = self._deep_handler(self.arun)

     async def start(self):
         """
@@ -210,26 +217,39 @@ class AsyncWebCrawler:
             AsyncWebCrawler: The initialized crawler instance
         """
         # Check for builtin browser if requested
-        if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url:
+        if (
+            self.browser_config.browser_mode == "builtin"
+            and not self.browser_config.cdp_url
+        ):
             # Import here to avoid circular imports
             from .browser_profiler import BrowserProfiler

             profiler = BrowserProfiler(logger=self.logger)

             # Get builtin browser info or launch if needed
             browser_info = profiler.get_builtin_browser_info()
             if not browser_info:
-                self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER")
+                self.logger.info(
+                    "Builtin browser not found, launching new instance...",
+                    tag="BROWSER",
+                )
                 cdp_url = await profiler.launch_builtin_browser()
                 if not cdp_url:
-                    self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER")
+                    self.logger.warning(
+                        "Failed to launch builtin browser, falling back to dedicated browser",
+                        tag="BROWSER",
+                    )
                 else:
                     self.browser_config.cdp_url = cdp_url
                     self.browser_config.use_managed_browser = True
             else:
-                self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER")
-                self.browser_config.cdp_url = browser_info.get('cdp_url')
+                self.logger.info(
+                    f"Using existing builtin browser at {browser_info.get('cdp_url')}",
+                    tag="BROWSER",
+                )
+                self.browser_config.cdp_url = browser_info.get("cdp_url")
                 self.browser_config.use_managed_browser = True

         await self.crawler_strategy.__aenter__()
         await self.awarmup()
         return self
@@ -305,7 +325,7 @@ class AsyncWebCrawler:
         # Auto-start if not ready
         if not self.ready:
             await self.start()

         config = config or CrawlerRunConfig()
         if not isinstance(url, str) or not url:
             raise ValueError("Invalid URL, make sure the URL is a non-empty string")
@@ -319,9 +339,7 @@ class AsyncWebCrawler:
             config.cache_mode = CacheMode.ENABLED

         # Create cache context
-        cache_context = CacheContext(
-            url, config.cache_mode, False
-        )
+        cache_context = CacheContext(url, config.cache_mode, False)

         # Initialize processing variables
         async_response: AsyncCrawlResponse = None
@@ -351,7 +369,7 @@ class AsyncWebCrawler:
                 # if config.screenshot and not screenshot or config.pdf and not pdf:
                 if config.screenshot and not screenshot_data:
                     cached_result = None

                 if config.pdf and not pdf_data:
                     cached_result = None
@@ -383,14 +401,18 @@ class AsyncWebCrawler:

             # Check robots.txt if enabled
             if config and config.check_robots_txt:
-                if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
+                if not await self.robots_parser.can_fetch(
+                    url, self.browser_config.user_agent
+                ):
                     return CrawlResult(
                         url=url,
                         html="",
                         success=False,
                         status_code=403,
                         error_message="Access denied by robots.txt",
-                        response_headers={"X-Robots-Status": "Blocked by robots.txt"}
+                        response_headers={
+                            "X-Robots-Status": "Blocked by robots.txt"
+                        },
                     )

             ##############################
@@ -417,7 +439,7 @@ class AsyncWebCrawler:
                 ###############################################################
                 # Process the HTML content, Call CrawlerStrategy.process_html #
                 ###############################################################
-                crawl_result : CrawlResult = await self.aprocess_html(
+                crawl_result: CrawlResult = await self.aprocess_html(
                     url=url,
                     html=html,
                     extracted_content=extracted_content,
@@ -441,18 +463,11 @@ class AsyncWebCrawler:
                 crawl_result.success = bool(html)
                 crawl_result.session_id = getattr(config, "session_id", None)

-                self.logger.success(
-                    message="{url:.50}... | Status: {status} | Total: {timing}",
-                    tag="COMPLETE",
-                    params={
-                        "url": cache_context.display_url,
-                        "status": crawl_result.success,
-                        "timing": f"{time.perf_counter() - start_time:.2f}s",
-                    },
-                    colors={
-                        "status": Fore.GREEN if crawl_result.success else Fore.RED,
-                        "timing": Fore.YELLOW,
-                    },
-                )
+                self.logger.url_status(
+                    url=cache_context.display_url,
+                    success=crawl_result.success,
+                    timing=time.perf_counter() - start_time,
+                    tag="COMPLETE",
+                )

                 # Update cache if appropriate
@@ -462,17 +477,12 @@ class AsyncWebCrawler:
                 return CrawlResultContainer(crawl_result)

             else:
-                self.logger.success(
-                    message="{url:.50}... | Status: {status} | Total: {timing}",
-                    tag="COMPLETE",
-                    params={
-                        "url": cache_context.display_url,
-                        "status": True,
-                        "timing": f"{time.perf_counter() - start_time:.2f}s",
-                    },
-                    colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
-                )
+                self.logger.url_status(
+                    url=cache_context.display_url,
+                    success=True,
+                    timing=time.perf_counter() - start_time,
+                    tag="COMPLETE"
+                )

                 cached_result.success = bool(html)
                 cached_result.session_id = getattr(config, "session_id", None)
                 cached_result.redirected_url = cached_result.redirected_url or url
@@ -494,7 +504,7 @@ class AsyncWebCrawler:
                 tag="ERROR",
             )

             return CrawlResultContainer(
                 CrawlResult(
                     url=url, html="", success=False, error_message=error_message
                 )
@@ -539,15 +549,14 @@ class AsyncWebCrawler:

         # Process HTML content
         params = config.__dict__.copy()
         params.pop("url", None)
         # add keys from kwargs to params that doesn't exist in params
         params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

         ################################
         # Scraping Strategy Execution #
         ################################
-        result : ScrapingResult = scraping_strategy.scrap(url, html, **params)
+        result: ScrapingResult = scraping_strategy.scrap(url, html, **params)

         if result is None:
             raise ValueError(
|
||||
)
|
||||
|
||||
# Log processing completion
|
||||
self.logger.info(
|
||||
message="{url:.50}... | Time: {timing}s",
|
||||
self.logger.url_status(
|
||||
url=_url,
|
||||
success=True,
|
||||
timing=int((time.perf_counter() - t1) * 1000) / 1000,
|
||||
tag="SCRAPE",
|
||||
params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
|
||||
)
|
||||
# self.logger.info(
|
||||
# message="{url:.50}... | Time: {timing}s",
|
||||
# tag="SCRAPE",
|
||||
# params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
|
||||
# )
|
||||
|
||||
################################
|
||||
# Structured Content Extraction #
|
||||
@@ -667,7 +682,7 @@ class AsyncWebCrawler:
     async def arun_many(
         self,
         urls: List[str],
         config: Optional[CrawlerRunConfig] = None,
         dispatcher: Optional[BaseDispatcher] = None,
         # Legacy parameters maintained for backwards compatibility
         # word_count_threshold=MIN_WORD_THRESHOLD,
@@ -681,8 +696,8 @@ class AsyncWebCrawler:
         # pdf: bool = False,
         # user_agent: str = None,
         # verbose=True,
-        **kwargs
+        **kwargs,
     ) -> RunManyReturn:
         """
         Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.

@@ -738,28 +753,35 @@ class AsyncWebCrawler:

         def transform_result(task_result):
             return (
-                setattr(task_result.result, 'dispatch_result',
-                    DispatchResult(
-                        task_id=task_result.task_id,
-                        memory_usage=task_result.memory_usage,
-                        peak_memory=task_result.peak_memory,
-                        start_time=task_result.start_time,
-                        end_time=task_result.end_time,
-                        error_message=task_result.error_message,
-                    )
-                ) or task_result.result
+                setattr(
+                    task_result.result,
+                    "dispatch_result",
+                    DispatchResult(
+                        task_id=task_result.task_id,
+                        memory_usage=task_result.memory_usage,
+                        peak_memory=task_result.peak_memory,
+                        start_time=task_result.start_time,
+                        end_time=task_result.end_time,
+                        error_message=task_result.error_message,
+                    ),
+                )
+                or task_result.result
             )

         stream = config.stream

         if stream:

            async def result_transformer():
-                async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config):
+                async for task_result in dispatcher.run_urls_stream(
+                    crawler=self, urls=urls, config=config
+                ):
                     yield transform_result(task_result)

             return result_transformer()
         else:
             _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
             return [transform_result(res) for res in _results]

     async def aclear_cache(self):
         """Clear the cache database."""
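For context on the RunManyReturn and transform_result changes above: a non-streaming arun_many() call now comes back as a CrawlResultContainer of CrawlResult objects, each with its DispatchResult attached under dispatch_result, while config.stream=True returns an async generator that yields results as they finish. A rough consumption sketch with placeholder URLs; it assumes the installed crawl4ai release re-exports AsyncWebCrawler and CrawlerRunConfig at the package top level:

# Sketch, not part of the diff: consuming arun_many() in both modes.
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig


async def main() -> None:
    urls = ["https://example.com", "https://example.org"]  # placeholder URLs

    async with AsyncWebCrawler() as crawler:
        # Batch mode: a container of CrawlResult objects is returned.
        results = await crawler.arun_many(urls, config=CrawlerRunConfig())
        for result in results:
            print(result.url, result.success, result.dispatch_result)

        # Stream mode: an async generator; results arrive as each URL completes.
        async for result in await crawler.arun_many(
            urls, config=CrawlerRunConfig(stream=True)
        ):
            print(result.url, result.success)


asyncio.run(main())

Both branches go through transform_result(), so each result carries its dispatcher bookkeeping (task id, memory usage, start/end times) without a separate lookup.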
deps.txt (new file, 115 lines)
@@ -0,0 +1,115 @@
+aiofiles==24.1.0
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.11
+aiolimiter==1.2.1
+aiosignal==1.3.2
+aiosqlite==0.20.0
+annotated-types==0.7.0
+anyio==4.8.0
+attrs==24.3.0
+beautifulsoup4==4.12.3
+certifi==2024.12.14
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.4.1
+click==8.1.8
+colorama==0.4.6
+-e git+https://github.com/unclecode/crawl4ai.git@4359b1200377d86af3cd10fa98f91cf599b16d6a#egg=Crawl4AI
+cryptography==44.0.0
+cssselect==1.2.0
+Cython==3.0.12
+Deprecated==1.2.18
+distro==1.9.0
+dnspython==2.7.0
+email_validator==2.2.0
+fake-http-header==0.3.5
+fake-useragent==2.0.3
+fastapi==0.115.11
+faust-cchardet==2.1.19
+filelock==3.16.1
+frozenlist==1.5.0
+fsspec==2024.12.0
+ghp-import==2.1.0
+greenlet==3.1.1
+gunicorn==23.0.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.27.2
+huggingface-hub==0.27.1
+humanize==4.12.1
+idna==3.10
+importlib_metadata==8.5.0
+iniconfig==2.0.0
+Jinja2==3.1.5
+jiter==0.8.2
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jwt==1.3.1
+limits==4.2
+litellm==1.59.0
+lxml==5.3.0
+Markdown==3.7
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mergedeep==1.3.4
+mkdocs==1.6.1
+mkdocs-get-deps==0.2.0
+mkdocs-terminal==4.7.0
+mockito==1.5.3
+multidict==6.1.0
+nltk==3.9.1
+numpy==2.2.2
+openai==1.59.9
+packaging==24.2
+pathspec==0.12.1
+pdf2image==1.17.0
+pillow==10.4.0
+platformdirs==4.3.6
+playwright==1.49.1
+pluggy==1.5.0
+prometheus-fastapi-instrumentator==7.0.2
+prometheus_client==0.21.1
+propcache==0.2.1
+psutil==6.1.1
+pycparser==2.22
+pydantic==2.10.5
+pydantic_core==2.27.2
+pyee==12.0.0
+Pygments==2.19.1
+pymdown-extensions==10.14.3
+pyOpenSSL==25.0.0
+pytest==8.3.4
+pytest-mockito==0.0.4
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+PyYAML==6.0.2
+pyyaml_env_tag==0.1
+rank-bm25==0.2.2
+redis==5.2.1
+referencing==0.36.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.22.3
+six==1.17.0
+slowapi==0.1.9
+sniffio==1.3.1
+snowballstemmer==2.2.0
+soupsieve==2.6
+starlette==0.46.1
+tenacity==9.0.0
+tf-playwright-stealth==1.1.0
+tiktoken==0.8.0
+tokenizers==0.21.0
+tqdm==4.67.1
+typing_extensions==4.12.2
+urllib3==2.3.0
+uvicorn==0.34.0
+validators==0.34.0
+watchdog==6.0.0
+wrapt==1.17.2
+xxhash==3.5.0
+yarl==1.18.3
+zipp==3.21.0