feat: Enhance AsyncPlaywrightCrawlerStrategy with text-only and light modes, dynamic viewport adjustment, and session management

### New Features:
- **Text-Only Mode**: Added support for text-only crawling by disabling images, JavaScript, GPU, and other non-essential features.
- **Light Mode**: Optimized browser settings to reduce resource usage and improve efficiency during crawling.
- **Dynamic Viewport Adjustment**: Automatically adjusts viewport dimensions based on content size, ensuring accurate rendering and scaling.
- **Full Page Scanning**: Introduced a feature to scroll and capture dynamic content for pages with infinite scroll or lazy-loading elements.
- **Session Management**: Added `create_session` method for creating and managing browser sessions with unique IDs.

### Improvements:
- Unified viewport handling across contexts by dynamically setting dimensions using `self.viewport_width` and `self.viewport_height`.
- Enhanced logging and error handling for viewport adjustments, page scanning, and content evaluation.
- Reduced resource usage with additional browser flags for both `light_mode` and `text_only` configurations.
- Improved handling of cookies, headers, and proxies in session creation.

### Refactoring:
- Removed hardcoded viewport dimensions and replaced them with dynamic configurations.
- Cleaned up unused and commented-out code for better readability and maintainability.
- Introduced defaults for frequently used parameters like `delay_before_return_html`.

### Fixes:
- Resolved inconsistencies in viewport handling by sourcing all viewport dimensions from the configured `viewport_width`/`viewport_height` instead of scattered hardcoded values (1920x1080, 1200x800).
- Improved robustness of content loading and dynamic adjustments to avoid failures and timeouts.

### Docs Update:
- Updated schema usage in `quickstart_async.py` example:
  - Changed `OpenAIModelFee.schema()` to `OpenAIModelFee.model_json_schema()` for compatibility.
- Enhanced LLM extraction instruction documentation.

This commit introduces significant enhancements to improve efficiency, flexibility, and reliability of the crawler strategy.
This commit is contained in:
UncleCode
2024-12-08 20:04:44 +08:00
parent 8c611dcb4b
commit c51e901f68
8 changed files with 440 additions and 49 deletions

View File

@@ -220,8 +220,22 @@ class AsyncCrawlerStrategy(ABC):
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs):
self.text_only = kwargs.get("text_only", False)
self.light_mode = kwargs.get("light_mode", False)
self.logger = logger
self.use_cached_html = use_cached_html
self.viewport_width = kwargs.get("viewport_width", 800 if self.text_only else 1920)
self.viewport_height = kwargs.get("viewport_height", 600 if self.text_only else 1080)
if self.text_only:
self.extra_args = kwargs.get("extra_args", []) + [
'--disable-images',
'--disable-javascript',
'--disable-gpu',
'--disable-software-rasterizer',
'--disable-dev-shm-usage'
]
self.user_agent = kwargs.get(
"user_agent",
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
@@ -300,7 +314,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
else:
# If no default context exists, create one
self.default_context = await self.browser.new_context(
viewport={"width": 1920, "height": 1080}
# viewport={"width": 1920, "height": 1080}
viewport={"width": self.viewport_width, "height": self.viewport_height}
)
# Set up the default context
@@ -334,10 +349,40 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"--ignore-certificate-errors",
"--ignore-certificate-errors-spki-list",
"--disable-blink-features=AutomationControlled",
"--window-position=400,0",
f"--window-size={self.viewport_width},{self.viewport_height}",
]
}
if self.light_mode:
browser_args["args"].extend([
# "--disable-background-networking",
"--disable-background-timer-throttling",
"--disable-backgrounding-occluded-windows",
"--disable-breakpad",
"--disable-client-side-phishing-detection",
"--disable-component-extensions-with-background-pages",
"--disable-default-apps",
"--disable-extensions",
"--disable-features=TranslateUI",
"--disable-hang-monitor",
"--disable-ipc-flooding-protection",
"--disable-popup-blocking",
"--disable-prompt-on-repost",
"--disable-sync",
"--force-color-profile=srgb",
"--metrics-recording-only",
"--no-first-run",
"--password-store=basic",
"--use-mock-keychain"
])
if self.text_only:
browser_args["args"].extend([
'--blink-settings=imagesEnabled=false',
'--disable-remote-fonts'
])
# Add channel if specified (try Chrome first)
if self.chrome_channel:
browser_args["channel"] = self.chrome_channel
@@ -367,6 +412,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if self.browser_type == "firefox":
self.browser = await self.playwright.firefox.launch(**browser_args)
elif self.browser_type == "webkit":
if "viewport" not in browser_args:
browser_args["viewport"] = {"width": self.viewport_width, "height": self.viewport_height}
self.browser = await self.playwright.webkit.launch(**browser_args)
else:
if self.use_persistent_context and self.user_data_dir:
@@ -576,6 +623,38 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Return the page object
return page
async def create_session(self, **kwargs) -> str:
    """Create (or reuse) a browser session and return its ID.

    Lazily starts the browser if it is not running yet, then registers a
    ``(context, page, created_at)`` triple in ``self.sessions`` under the
    returned ID.

    Keyword Args:
        session_id (str, optional): Explicit ID to assign. If the ID is
            already registered, the existing session is reused rather than
            overwritten. A random UUID4 is generated when omitted/falsy.
        user_agent (str, optional): Per-session user agent override;
            defaults to ``self.user_agent``.

    Returns:
        str: The session ID under which the context and page are stored.
    """
    if not self.browser:
        await self.start()

    session_id = kwargs.get('session_id') or str(uuid.uuid4())

    # Reuse an existing session instead of clobbering it: overwriting the
    # entry would orphan the previous context/page without closing them,
    # leaking browser resources.
    if session_id in self.sessions:
        return session_id

    if self.use_managed_browser:
        # Managed browsers share the single default context; only a new
        # page is created per session.
        page = await self.default_context.new_page()
        self.sessions[session_id] = (self.default_context, page, time.time())
    else:
        if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]:
            # With a persistent context, `self.browser` *is* the context,
            # so cookies/headers configured at launch already apply.
            context = self.browser
            page = await context.new_page()
        else:
            context = await self.browser.new_context(
                user_agent=kwargs.get("user_agent", self.user_agent),
                viewport={"width": self.viewport_width, "height": self.viewport_height},
                proxy={"server": self.proxy} if self.proxy else None,
                accept_downloads=self.accept_downloads,
                ignore_https_errors=True
            )
            if self.cookies:
                await context.add_cookies(self.cookies)
            await context.set_extra_http_headers(self.headers)
            page = await context.new_page()
        self.sessions[session_id] = (context, page, time.time())

    return session_id
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
"""
Crawls a given URL or processes raw HTML/local file content based on the URL prefix.
@@ -684,12 +763,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]:
# In persistent context, browser is the context
context = self.browser
page = await context.new_page()
else:
# Normal context creation for non-persistent or non-Chrome browsers
context = await self.browser.new_context(
user_agent=user_agent,
viewport={"width": 1200, "height": 800},
viewport={"width": self.viewport_width, "height": self.viewport_height},
proxy={"server": self.proxy} if self.proxy else None,
java_script_enabled=True,
accept_downloads=self.accept_downloads,
@@ -699,7 +777,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if self.cookies:
await context.add_cookies(self.cookies)
await context.set_extra_http_headers(self.headers)
page = await context.new_page()
page = await context.new_page()
self.sessions[session_id] = (context, page, time.time())
else:
if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]:
@@ -709,7 +788,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Normal context creation
context = await self.browser.new_context(
user_agent=user_agent,
viewport={"width": 1920, "height": 1080},
# viewport={"width": 1920, "height": 1080},
viewport={"width": self.viewport_width, "height": self.viewport_height},
proxy={"server": self.proxy} if self.proxy else None,
accept_downloads=self.accept_downloads,
ignore_https_errors=True # Add this line
@@ -763,9 +843,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if self.accept_downloads:
page.on("download", lambda download: asyncio.create_task(self._handle_download(download)))
# if self.verbose:
# print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
if self.use_cached_html:
cache_file_path = os.path.join(
os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
@@ -786,7 +863,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if not kwargs.get("js_only", False):
await self.execute_hook('before_goto', page, context = context)
try:
response = await page.goto(
@@ -798,9 +874,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
except Error as e:
raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}")
# response = await page.goto("about:blank")
# await page.evaluate(f"window.location.href = '{url}'")
await self.execute_hook('after_goto', page, context = context)
# Get status code and headers
@@ -853,7 +926,83 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
else:
raise Error(f"Body element is hidden: {visibility_info}")
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
# CONTENT LOADING ASSURANCE
if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)):
# Wait for network idle after initial load and images to load
await page.wait_for_load_state("networkidle")
await asyncio.sleep(0.1)
await page.wait_for_function("Array.from(document.images).every(img => img.complete)")
# After initial load, adjust viewport to content size
if not self.text_only and kwargs.get("adjust_viewport_to_content", False):
try:
# Get actual page dimensions
page_width = await page.evaluate("document.documentElement.scrollWidth")
page_height = await page.evaluate("document.documentElement.scrollHeight")
target_width = self.viewport_width
target_height = int(target_width * page_width / page_height * 0.95)
await page.set_viewport_size({"width": target_width, "height": target_height})
# Compute scale factor
# We want the entire page visible: the scale should make both width and height fit
scale = min(target_width / page_width, target_height / page_height)
# Now we call CDP to set metrics.
# We tell Chrome that the "device" is page_width x page_height in size,
# but we scale it down so everything fits within the real viewport.
cdp = await page.context.new_cdp_session(page)
await cdp.send('Emulation.setDeviceMetricsOverride', {
'width': page_width, # full page width
'height': page_height, # full page height
'deviceScaleFactor': 1, # keep normal DPR
'mobile': False,
'scale': scale # scale the entire rendered content
})
except Exception as e:
self.logger.warning(
message="Failed to adjust viewport to content: {error}",
tag="VIEWPORT",
params={"error": str(e)}
)
# After viewport adjustment, handle page scanning if requested
if kwargs.get("scan_full_page", False):
try:
viewport_height = page.viewport_size.get("height", self.viewport_height)
current_position = viewport_height # Start with one viewport height
scroll_delay = kwargs.get("scroll_delay", 0.2)
# Initial scroll
await page.evaluate(f"window.scrollTo(0, {current_position})")
await asyncio.sleep(scroll_delay)
# Get height after first scroll to account for any dynamic content
total_height = await page.evaluate("document.documentElement.scrollHeight")
while current_position < total_height:
current_position = min(current_position + viewport_height, total_height)
await page.evaluate(f"window.scrollTo(0, {current_position})")
await asyncio.sleep(scroll_delay)
# Check for dynamic content
new_height = await page.evaluate("document.documentElement.scrollHeight")
if new_height > total_height:
total_height = new_height
# Scroll back to top
await page.evaluate("window.scrollTo(0, 0)")
except Exception as e:
self.logger.warning(
message="Failed to perform full page scan: {error}",
tag="PAGE_SCAN",
params={"error": str(e)}
)
else:
# Scroll to the bottom of the page
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
if js_code:
@@ -887,7 +1036,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# await page.wait_for_load_state('networkidle', timeout=5000)
# Update image dimensions
update_image_dimensions_js = """
if not self.text_only:
update_image_dimensions_js = """
() => {
return new Promise((resolve) => {
const filterImage = (img) => {
@@ -944,26 +1094,26 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
}
"""
try:
try:
await page.wait_for_load_state(
# state="load",
state="domcontentloaded",
timeout=5
try:
await page.wait_for_load_state(
# state="load",
state="domcontentloaded",
timeout=5
)
except PlaywrightTimeoutError:
pass
await page.evaluate(update_image_dimensions_js)
except Exception as e:
self.logger.error(
message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}",
tag="ERROR",
params={"error": str(e)}
)
except PlaywrightTimeoutError:
pass
await page.evaluate(update_image_dimensions_js)
except Exception as e:
self.logger.error(
message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}",
tag="ERROR",
params={"error": str(e)}
)
# raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}")
# raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}")
# Wait a bit for any onload events to complete
await page.wait_for_timeout(100)
# await page.wait_for_timeout(100)
# Process iframes
if kwargs.get("process_iframes", False):
@@ -971,7 +1121,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
await self.execute_hook('before_retrieve_html', page, context = context)
# Check if delay_before_return_html is set then wait for that time
delay_before_return_html = kwargs.get("delay_before_return_html")
delay_before_return_html = kwargs.get("delay_before_return_html", 0.1)
if delay_before_return_html:
await asyncio.sleep(delay_before_return_html)