Enhance crawler features and improve documentation

- Added detailed CrawlerRunConfig parameters documentation.
- Introduced plans for real-time event-driven crawling.
- Updated async logger default level to DEBUG for better insights.
- Improved structure and readability in the configuration file.
- Enhanced documentation on future capabilities in new blog entries.
This commit is contained in:
UncleCode
2024-12-16 18:52:51 +08:00
parent ed7bc1909c
commit a11d9646e3
6 changed files with 439 additions and 125 deletions

View File

@@ -21,6 +21,7 @@ from .utils import get_error_context
from .user_agent_generator import UserAgentGenerator
from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_logger import AsyncLogger
from playwright_stealth import StealthConfig, stealth_async
@@ -462,7 +463,7 @@ class AsyncCrawlerStrategy(ABC):
pass
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
def __init__(self, browser_config: BrowserConfig = None, logger = None, **kwargs):
def __init__(self, browser_config: BrowserConfig = None, logger : AsyncLogger = None, **kwargs):
"""
Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.
@@ -758,16 +759,22 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Set up console logging if requested
if config.log_console:
page.on("console", lambda msg: self.logger.debug(
message="Console: {msg}",
tag="CONSOLE",
params={"msg": msg.text}
))
page.on("pageerror", lambda exc: self.logger.error(
message="Page error: {exc}",
tag="ERROR",
params={"exc": exc}
))
def log_consol(msg, console_log_type="debug"):
    """Forward a browser console message or page error to ``self.logger``.

    Args:
        msg: The object delivered by the page event. Console events are
            read through ``msg.text``; the "pageerror" handler forwards
            the error object it receives, which presumably has no
            ``.text`` attribute (verify against the Playwright version
            in use), so ``str(msg)`` is used as a fallback.
        console_log_type: ``"debug"`` for regular console output or
            ``"error"`` for page errors. Any other value is ignored,
            matching the previous behavior.
    """
    # Resolve the printable text once, without assuming the event payload
    # type: an unconditional ``msg.text`` raises AttributeError for objects
    # that do not define it.
    text = getattr(msg, "text", None)
    if text is None:
        text = str(msg)

    # Pass a ``{msg}`` template plus ``params`` (the convention used by the
    # other logger calls in this file) instead of pre-formatting with an
    # f-string: the f-string embedded the raw event object rather than its
    # text and left ``params`` unused.
    if console_log_type == "error":
        self.logger.error(
            message="Console error: {msg}",
            tag="CONSOLE",
            params={"msg": text},
        )
    elif console_log_type == "debug":
        self.logger.debug(
            message="Console: {msg}",
            tag="CONSOLE",
            params={"msg": text},
        )
page.on("console", log_consol)
page.on("pageerror", lambda e: log_consol(e, "error"))
try:
# Set up download handling
@@ -956,12 +963,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Define delayed content getter
async def get_delayed_content(delay: float = 5.0) -> str:
if self.config.verbose:
self.logger.info(
message="Waiting for {delay} seconds before retrieving content for {url}",
tag="INFO",
params={"delay": delay, "url": url}
)
self.logger.info(
message="Waiting for {delay} seconds before retrieving content for {url}",
tag="INFO",
params={"delay": delay, "url": url}
)
await asyncio.sleep(delay)
return await page.content()