fix: streamline URL status logging via a single entrypoint, logger.url_status
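For context: the new call sites in this diff pass url, success, timing, and a tag to logger.url_status, whose implementation lives in async_logger.py and is not part of this commit. The stand-in below is only a sketch of the kind of line such a single entrypoint centralizes; the _DemoLogger class and its exact formatting are assumptions modeled on the removed logger.success(...) calls (colorama's Fore colors and the 50-character URL truncation come from those old calls).

# Sketch only: a stand-in for the real AsyncLogger.url_status, which is not shown in this diff.
from colorama import Fore, Style


class _DemoLogger:
    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH") -> None:
        # One place that knows how a per-URL status line is formatted.
        status_color = Fore.GREEN if success else Fore.RED
        print(
            f"[{tag}] {url[:50]}... | "
            f"Status: {status_color}{success}{Style.RESET_ALL} | "
            f"Total: {Fore.YELLOW}{timing:.2f}s{Style.RESET_ALL}"
        )


if __name__ == "__main__":
    _DemoLogger().url_status(
        url="https://example.com/very/long/path/to/a/page",
        success=True,
        timing=1.42,
        tag="COMPLETE",
    )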
@@ -10,12 +10,17 @@ import asyncio
 # from contextlib import nullcontext, asynccontextmanager
 from contextlib import asynccontextmanager
-from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult
+from .models import (
+    CrawlResult,
+    MarkdownGenerationResult,
+    DispatchResult,
+    ScrapingResult,
+)
 from .async_database import async_db_manager
 from .chunking_strategy import * # noqa: F403
 from .chunking_strategy import IdentityChunking
 from .content_filter_strategy import * # noqa: F403
 from .extraction_strategy import * # noqa: F403
 from .extraction_strategy import NoExtractionStrategy
 from .async_crawler_strategy import (
     AsyncCrawlerStrategy,
@@ -30,7 +35,7 @@ from .markdown_generation_strategy import (
 from .deep_crawling import DeepCrawlDecorator
 from .async_logger import AsyncLogger, AsyncLoggerBase
 from .async_configs import BrowserConfig, CrawlerRunConfig
 from .async_dispatcher import * # noqa: F403
 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter

 from .utils import (
@@ -44,9 +49,10 @@ from .utils import (

 from typing import Union, AsyncGenerator

-CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
+CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
 # RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]

+
 class CrawlResultContainer(Generic[CrawlResultT]):
     def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
         # Normalize to a list
@@ -68,20 +74,21 @@ class CrawlResultContainer(Generic[CrawlResultT]):
         # Delegate attribute access to the first element.
         if self._results:
             return getattr(self._results[0], attr)
-        raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
+        raise AttributeError(
+            f"{self.__class__.__name__} object has no attribute '{attr}'"
+        )

     def __repr__(self):
         return f"{self.__class__.__name__}({self._results!r})"


 # Redefine the union type. Now synchronous calls always return a container,
 # while stream mode is handled with an AsyncGenerator.
 RunManyReturn = Union[
-    CrawlResultContainer[CrawlResultT],
-    AsyncGenerator[CrawlResultT, None]
+    CrawlResultContainer[CrawlResultT], AsyncGenerator[CrawlResultT, None]
 ]



 class AsyncWebCrawler:
     """
     Asynchronous web crawler with flexible caching capabilities.
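Aside: CrawlResultContainer above delegates attribute access to its first element, so single-URL callers can keep reading fields directly off the returned container. A minimal, self-contained illustration of that delegation follows; DummyResult is a made-up stand-in for CrawlResult, and the container body is trimmed to the behavior shown in this hunk.

# Sketch: attribute delegation on CrawlResultContainer, using a dummy result object.
from dataclasses import dataclass


@dataclass
class DummyResult:
    url: str
    success: bool


class CrawlResultContainer:
    def __init__(self, results):
        # Normalize to a list, as in the diff above.
        self._results = results if isinstance(results, list) else [results]

    def __getattr__(self, attr):
        # Only reached when normal lookup fails; forwards to the first result.
        if self._results:
            return getattr(self._results[0], attr)
        raise AttributeError(
            f"{self.__class__.__name__} object has no attribute '{attr}'"
        )


container = CrawlResultContainer(DummyResult(url="https://example.com", success=True))
print(container.url)      # delegated to the first (only) result
print(container.success)  # True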
@@ -210,24 +217,37 @@ class AsyncWebCrawler:
             AsyncWebCrawler: The initialized crawler instance
         """
         # Check for builtin browser if requested
-        if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url:
+        if (
+            self.browser_config.browser_mode == "builtin"
+            and not self.browser_config.cdp_url
+        ):
             # Import here to avoid circular imports
             from .browser_profiler import BrowserProfiler

             profiler = BrowserProfiler(logger=self.logger)

             # Get builtin browser info or launch if needed
             browser_info = profiler.get_builtin_browser_info()
             if not browser_info:
-                self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER")
+                self.logger.info(
+                    "Builtin browser not found, launching new instance...",
+                    tag="BROWSER",
+                )
                 cdp_url = await profiler.launch_builtin_browser()
                 if not cdp_url:
-                    self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER")
+                    self.logger.warning(
+                        "Failed to launch builtin browser, falling back to dedicated browser",
+                        tag="BROWSER",
+                    )
                 else:
                     self.browser_config.cdp_url = cdp_url
                     self.browser_config.use_managed_browser = True
             else:
-                self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER")
-                self.browser_config.cdp_url = browser_info.get('cdp_url')
+                self.logger.info(
+                    f"Using existing builtin browser at {browser_info.get('cdp_url')}",
+                    tag="BROWSER",
+                )
+                self.browser_config.cdp_url = browser_info.get("cdp_url")
                 self.browser_config.use_managed_browser = True

         await self.crawler_strategy.__aenter__()
@@ -319,9 +339,7 @@ class AsyncWebCrawler:
                     config.cache_mode = CacheMode.ENABLED

                 # Create cache context
-                cache_context = CacheContext(
-                    url, config.cache_mode, False
-                )
+                cache_context = CacheContext(url, config.cache_mode, False)

                 # Initialize processing variables
                 async_response: AsyncCrawlResponse = None
@@ -383,14 +401,18 @@ class AsyncWebCrawler:

                     # Check robots.txt if enabled
                     if config and config.check_robots_txt:
-                        if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
+                        if not await self.robots_parser.can_fetch(
+                            url, self.browser_config.user_agent
+                        ):
                             return CrawlResult(
                                 url=url,
                                 html="",
                                 success=False,
                                 status_code=403,
                                 error_message="Access denied by robots.txt",
-                                response_headers={"X-Robots-Status": "Blocked by robots.txt"}
+                                response_headers={
+                                    "X-Robots-Status": "Blocked by robots.txt"
+                                },
                             )

                     ##############################
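Aside: a rough usage sketch of the robots.txt gate above, assuming the public crawl4ai API (AsyncWebCrawler as an async context manager, arun, and CrawlerRunConfig(check_robots_txt=True)); the fields checked here are exactly those constructed in the CrawlResult of this hunk. A real network fetch and a genuinely disallowed URL are needed to hit the branch.

import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig


async def main() -> None:
    async with AsyncWebCrawler() as crawler:
        # check_robots_txt turns on the gate shown in the hunk above.
        result = await crawler.arun(
            "https://example.com/private",
            config=CrawlerRunConfig(check_robots_txt=True),
        )
        if not result.success and result.status_code == 403:
            # Values as constructed in this hunk when robots.txt denies the fetch.
            print(result.error_message)                            # Access denied by robots.txt
            print(result.response_headers.get("X-Robots-Status"))  # Blocked by robots.txt


asyncio.run(main())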
@@ -417,7 +439,7 @@ class AsyncWebCrawler:
                     ###############################################################
                     # Process the HTML content, Call CrawlerStrategy.process_html #
                     ###############################################################
-                    crawl_result : CrawlResult = await self.aprocess_html(
+                    crawl_result: CrawlResult = await self.aprocess_html(
                         url=url,
                         html=html,
                         extracted_content=extracted_content,
@@ -441,18 +463,11 @@ class AsyncWebCrawler:
                     crawl_result.success = bool(html)
                     crawl_result.session_id = getattr(config, "session_id", None)

-                    self.logger.success(
-                        message="{url:.50}... | Status: {status} | Total: {timing}",
+                    self.logger.url_status(
+                        url=cache_context.display_url,
+                        success=crawl_result.success,
+                        timing=time.perf_counter() - start_time,
                         tag="COMPLETE",
-                        params={
-                            "url": cache_context.display_url,
-                            "status": crawl_result.success,
-                            "timing": f"{time.perf_counter() - start_time:.2f}s",
-                        },
-                        colors={
-                            "status": Fore.GREEN if crawl_result.success else Fore.RED,
-                            "timing": Fore.YELLOW,
-                        },
                     )

                     # Update cache if appropriate
@@ -462,17 +477,12 @@ class AsyncWebCrawler:
                     return CrawlResultContainer(crawl_result)

                 else:
-                    self.logger.success(
-                        message="{url:.50}... | Status: {status} | Total: {timing}",
-                        tag="COMPLETE",
-                        params={
-                            "url": cache_context.display_url,
-                            "status": True,
-                            "timing": f"{time.perf_counter() - start_time:.2f}s",
-                        },
-                        colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
+                    self.logger.url_status(
+                        url=cache_context.display_url,
+                        success=True,
+                        timing=time.perf_counter() - start_time,
+                        tag="COMPLETE"
                     )

                     cached_result.success = bool(html)
                     cached_result.session_id = getattr(config, "session_id", None)
                     cached_result.redirected_url = cached_result.redirected_url or url
@@ -494,7 +504,7 @@ class AsyncWebCrawler:
                     tag="ERROR",
                 )

                 return CrawlResultContainer(
                     CrawlResult(
                         url=url, html="", success=False, error_message=error_message
                     )
@@ -543,11 +553,10 @@ class AsyncWebCrawler:
        # add keys from kwargs to params that doesn't exist in params
        params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

-
        ################################
        # Scraping Strategy Execution #
        ################################
-        result : ScrapingResult = scraping_strategy.scrap(url, html, **params)
+        result: ScrapingResult = scraping_strategy.scrap(url, html, **params)

        if result is None:
            raise ValueError(
@@ -593,11 +602,17 @@ class AsyncWebCrawler:
        )

        # Log processing completion
-        self.logger.info(
-            message="{url:.50}... | Time: {timing}s",
+        self.logger.url_status(
+            url=_url,
+            success=True,
+            timing=int((time.perf_counter() - t1) * 1000) / 1000,
             tag="SCRAPE",
-            params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
        )
+        # self.logger.info(
+        #     message="{url:.50}... | Time: {timing}s",
+        #     tag="SCRAPE",
+        #     params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
+        # )

        ################################
        # Structured Content Extraction #
@@ -681,8 +696,8 @@ class AsyncWebCrawler:
        # pdf: bool = False,
        # user_agent: str = None,
        # verbose=True,
-        **kwargs
+        **kwargs,
    ) -> RunManyReturn:
        """
        Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.

@@ -738,24 +753,31 @@ class AsyncWebCrawler:

        def transform_result(task_result):
            return (
-                setattr(task_result.result, 'dispatch_result',
-                    DispatchResult(
-                        task_id=task_result.task_id,
-                        memory_usage=task_result.memory_usage,
-                        peak_memory=task_result.peak_memory,
-                        start_time=task_result.start_time,
-                        end_time=task_result.end_time,
-                        error_message=task_result.error_message,
-                    )
-                ) or task_result.result
+                setattr(
+                    task_result.result,
+                    "dispatch_result",
+                    DispatchResult(
+                        task_id=task_result.task_id,
+                        memory_usage=task_result.memory_usage,
+                        peak_memory=task_result.peak_memory,
+                        start_time=task_result.start_time,
+                        end_time=task_result.end_time,
+                        error_message=task_result.error_message,
+                    ),
+                )
+                or task_result.result
            )

        stream = config.stream

        if stream:

            async def result_transformer():
-                async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config):
+                async for task_result in dispatcher.run_urls_stream(
+                    crawler=self, urls=urls, config=config
+                ):
                    yield transform_result(task_result)

            return result_transformer()
        else:
            _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
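Aside: the setattr(...) or task_result.result expression in transform_result works because setattr returns None, so the or falls through to the result object after dispatch_result has been attached. A tiny standalone illustration of the idiom (the Obj class is made up for the demo):

class Obj:
    pass


def attach_and_return(obj: Obj, value: str) -> Obj:
    # setattr(...) evaluates to None, so `or obj` yields obj itself;
    # the side effect (the new attribute) has already happened.
    return setattr(obj, "dispatch_result", value) or obj


o = attach_and_return(Obj(), "metadata")
print(o.dispatch_result)  # "metadata"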
deps.txt  (new file, 115 lines added)
@@ -0,0 +1,115 @@
+aiofiles==24.1.0
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.11
+aiolimiter==1.2.1
+aiosignal==1.3.2
+aiosqlite==0.20.0
+annotated-types==0.7.0
+anyio==4.8.0
+attrs==24.3.0
+beautifulsoup4==4.12.3
+certifi==2024.12.14
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.4.1
+click==8.1.8
+colorama==0.4.6
+-e git+https://github.com/unclecode/crawl4ai.git@4359b1200377d86af3cd10fa98f91cf599b16d6a#egg=Crawl4AI
+cryptography==44.0.0
+cssselect==1.2.0
+Cython==3.0.12
+Deprecated==1.2.18
+distro==1.9.0
+dnspython==2.7.0
+email_validator==2.2.0
+fake-http-header==0.3.5
+fake-useragent==2.0.3
+fastapi==0.115.11
+faust-cchardet==2.1.19
+filelock==3.16.1
+frozenlist==1.5.0
+fsspec==2024.12.0
+ghp-import==2.1.0
+greenlet==3.1.1
+gunicorn==23.0.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.27.2
+huggingface-hub==0.27.1
+humanize==4.12.1
+idna==3.10
+importlib_metadata==8.5.0
+iniconfig==2.0.0
+Jinja2==3.1.5
+jiter==0.8.2
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jwt==1.3.1
+limits==4.2
+litellm==1.59.0
+lxml==5.3.0
+Markdown==3.7
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mergedeep==1.3.4
+mkdocs==1.6.1
+mkdocs-get-deps==0.2.0
+mkdocs-terminal==4.7.0
+mockito==1.5.3
+multidict==6.1.0
+nltk==3.9.1
+numpy==2.2.2
+openai==1.59.9
+packaging==24.2
+pathspec==0.12.1
+pdf2image==1.17.0
+pillow==10.4.0
+platformdirs==4.3.6
+playwright==1.49.1
+pluggy==1.5.0
+prometheus-fastapi-instrumentator==7.0.2
+prometheus_client==0.21.1
+propcache==0.2.1
+psutil==6.1.1
+pycparser==2.22
+pydantic==2.10.5
+pydantic_core==2.27.2
+pyee==12.0.0
+Pygments==2.19.1
+pymdown-extensions==10.14.3
+pyOpenSSL==25.0.0
+pytest==8.3.4
+pytest-mockito==0.0.4
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+PyYAML==6.0.2
+pyyaml_env_tag==0.1
+rank-bm25==0.2.2
+redis==5.2.1
+referencing==0.36.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.22.3
+six==1.17.0
+slowapi==0.1.9
+sniffio==1.3.1
+snowballstemmer==2.2.0
+soupsieve==2.6
+starlette==0.46.1
+tenacity==9.0.0
+tf-playwright-stealth==1.1.0
+tiktoken==0.8.0
+tokenizers==0.21.0
+tqdm==4.67.1
+typing_extensions==4.12.2
+urllib3==2.3.0
+uvicorn==0.34.0
+validators==0.34.0
+watchdog==6.0.0
+wrapt==1.17.2
+xxhash==3.5.0
+yarl==1.18.3
+zipp==3.21.0