### New Features:
- **Text-Only Mode**: Added support for text-only crawling by disabling images, JavaScript, GPU, and other non-essential features.
- **Light Mode**: Optimized browser settings to reduce resource usage and improve efficiency during crawling.
- **Dynamic Viewport Adjustment**: Automatically adjusts viewport dimensions based on content size, ensuring accurate rendering and scaling.
- **Full Page Scanning**: Introduced a feature to scroll and capture dynamic content for pages with infinite scroll or lazy-loading elements.
- **Session Management**: Added `create_session` method for creating and managing browser sessions with unique IDs.

### Improvements:
- Unified viewport handling across contexts by dynamically setting dimensions using `self.viewport_width` and `self.viewport_height`.
- Enhanced logging and error handling for viewport adjustments, page scanning, and content evaluation.
- Reduced resource usage with additional browser flags for both `light_mode` and `text_only` configurations.
- Improved handling of cookies, headers, and proxies in session creation.

### Refactoring:
- Removed hardcoded viewport dimensions and replaced them with dynamic configurations.
- Cleaned up unused and commented-out code for better readability and maintainability.
- Introduced defaults for frequently used parameters like `delay_before_return_html`.

### Fixes:
- Resolved potential inconsistencies in viewport handling.
- Improved robustness of content loading and dynamic adjustments to avoid failures and timeouts.

### Docs Update:
- Updated schema usage in `quickstart_async.py` example:
  - Changed `OpenAIModelFee.schema()` to `OpenAIModelFee.model_json_schema()` for compatibility.
  - Enhanced LLM extraction instruction documentation.

This commit introduces significant enhancements to improve efficiency, flexibility, and reliability of the crawler strategy.
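
For orientation, the sketch below shows how these options might be combined. The names `text_only`, `light_mode`, `viewport_width`/`viewport_height`, `scan_full_page`, `scroll_delay`, `session_id`, and `create_session` come from this commit; the import path and target URL are illustrative assumptions:

```python
import asyncio
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

async def main():
    # text_only disables images/JS; light_mode trims background browser work
    strategy = AsyncPlaywrightCrawlerStrategy(
        text_only=True,
        light_mode=True,
        viewport_width=800,
        viewport_height=600,
    )
    async with strategy:
        session_id = await strategy.create_session()
        response = await strategy.crawl(
            "https://example.com",
            session_id=session_id,
            scan_full_page=True,   # scroll through lazy-loaded content
            scroll_delay=0.2,
        )
        print(response.status_code, len(response.html))

asyncio.run(main())
```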
1435 lines · 63 KiB · Python
import asyncio
import base64
import hashlib
import json
import os, sys, shutil
import tempfile, subprocess
import time
import uuid
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import Callable, Dict, Any, List, Optional, Awaitable

from PIL import Image, ImageDraw, ImageFont
from playwright.async_api import async_playwright, Page, Browser, Error, ProxySettings
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright_stealth import StealthConfig, stealth_async
from pydantic import BaseModel

from .models import AsyncCrawlResponse
from .utils import create_box_message
from .user_agent_generator import UserAgentGenerator

stealth_config = StealthConfig(
    webdriver=True,
    chrome_app=True,
    chrome_csi=True,
    chrome_load_times=True,
    chrome_runtime=True,
    navigator_languages=True,
    navigator_plugins=True,
    navigator_permissions=True,
    webgl_vendor=True,
    outerdimensions=True,
    navigator_hardware_concurrency=True,
    media_codecs=True,
)

class ManagedBrowser:
    def __init__(
        self,
        browser_type: str = "chromium",
        user_data_dir: Optional[str] = None,
        headless: bool = False,
        logger=None,
        host: str = "localhost",
        debugging_port: int = 9222,
    ):
        self.browser_type = browser_type
        self.user_data_dir = user_data_dir
        self.headless = headless
        self.browser_process = None
        self.temp_dir = None
        self.debugging_port = debugging_port
        self.host = host
        self.logger = logger
        self.shutting_down = False

    async def start(self) -> str:
        """
        Starts the browser process and returns the CDP endpoint URL.
        If user_data_dir is not provided, creates a temporary directory.
        """
        # Create temp dir if needed
        if not self.user_data_dir:
            self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-")
            self.user_data_dir = self.temp_dir

        # Get browser path and args based on OS and browser type
        browser_path = self._get_browser_path()
        args = self._get_browser_args()

        # Start browser process
        try:
            self.browser_process = subprocess.Popen(
                args,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            # Monitor browser process output for errors
            asyncio.create_task(self._monitor_browser_process())
            await asyncio.sleep(2)  # Give the browser time to start
            return f"http://{self.host}:{self.debugging_port}"
        except Exception as e:
            await self.cleanup()
            raise Exception(f"Failed to start browser: {e}")

    async def _monitor_browser_process(self):
        """Monitor the browser process for unexpected termination."""
        if self.browser_process:
            try:
                stdout, stderr = await asyncio.gather(
                    asyncio.to_thread(self.browser_process.stdout.read),
                    asyncio.to_thread(self.browser_process.stderr.read)
                )

                # Check shutting_down flag BEFORE logging anything
                if self.browser_process.poll() is not None:
                    if not self.shutting_down:
                        self.logger.error(
                            message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
                            tag="ERROR",
                            params={
                                "code": self.browser_process.returncode,
                                "stdout": stdout.decode(),
                                "stderr": stderr.decode()
                            }
                        )
                        await self.cleanup()
                    else:
                        self.logger.info(
                            message="Browser process terminated normally | Code: {code}",
                            tag="INFO",
                            params={"code": self.browser_process.returncode}
                        )
            except Exception as e:
                if not self.shutting_down:
                    self.logger.error(
                        message="Error monitoring browser process: {error}",
                        tag="ERROR",
                        params={"error": str(e)}
                    )

    def _get_browser_path(self) -> Optional[str]:
        """Returns the browser executable path based on OS and browser type."""
        if sys.platform == "darwin":  # macOS
            paths = {
                "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox",
                "webkit": "/Applications/Safari.app/Contents/MacOS/Safari"
            }
        elif sys.platform == "win32":  # Windows
            paths = {
                "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
                "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe",
                "webkit": None  # WebKit not supported on Windows
            }
        else:  # Linux
            paths = {
                "chromium": "google-chrome",
                "firefox": "firefox",
                "webkit": None  # WebKit not supported on Linux
            }

        return paths.get(self.browser_type)

    def _get_browser_args(self) -> List[str]:
        """Returns browser-specific command line arguments."""
        base_args = [self._get_browser_path()]

        if self.browser_type == "chromium":
            args = [
                f"--remote-debugging-port={self.debugging_port}",
                f"--user-data-dir={self.user_data_dir}",
            ]
            if self.headless:
                args.append("--headless=new")
        elif self.browser_type == "firefox":
            args = [
                "--remote-debugging-port", str(self.debugging_port),
                "--profile", self.user_data_dir,
            ]
            if self.headless:
                args.append("--headless")
        else:
            raise NotImplementedError(f"Browser type {self.browser_type} not supported")

        return base_args + args

    async def cleanup(self):
        """Clean up the browser process and temporary directory."""
        # Set shutting_down flag BEFORE any termination actions
        self.shutting_down = True

        if self.browser_process:
            try:
                self.browser_process.terminate()
                # Wait for the process to end gracefully
                for _ in range(10):  # 10 attempts, 100ms each
                    if self.browser_process.poll() is not None:
                        break
                    await asyncio.sleep(0.1)

                # Force kill if still running
                if self.browser_process.poll() is None:
                    self.browser_process.kill()
                    await asyncio.sleep(0.1)  # Brief wait for kill to take effect

            except Exception as e:
                self.logger.error(
                    message="Error terminating browser: {error}",
                    tag="ERROR",
                    params={"error": str(e)}
                )

        if self.temp_dir and os.path.exists(self.temp_dir):
            try:
                shutil.rmtree(self.temp_dir)
            except Exception as e:
                self.logger.error(
                    message="Error removing temporary directory: {error}",
                    tag="ERROR",
                    params={"error": str(e)}
                )
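
# Illustrative usage of ManagedBrowser (a sketch; the real call site is
# AsyncPlaywrightCrawlerStrategy.start() below, which does exactly this):
#
#   mb = ManagedBrowser(browser_type="chromium", headless=True, logger=logger)
#   cdp_url = await mb.start()      # e.g. "http://localhost:9222"
#   browser = await playwright.chromium.connect_over_cdp(cdp_url)
#   ...
#   await mb.cleanup()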


class AsyncCrawlerStrategy(ABC):
    @abstractmethod
    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        pass

    @abstractmethod
    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
        pass

    @abstractmethod
    async def take_screenshot(self, **kwargs) -> str:
        pass

    @abstractmethod
    def update_user_agent(self, user_agent: str):
        pass

    @abstractmethod
    def set_hook(self, hook_type: str, hook: Callable):
        pass

class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
    def __init__(self, use_cached_html=False, js_code=None, logger=None, **kwargs):
        self.text_only = kwargs.get("text_only", False)
        self.light_mode = kwargs.get("light_mode", False)
        self.logger = logger
        self.use_cached_html = use_cached_html
        self.viewport_width = kwargs.get("viewport_width", 800 if self.text_only else 1920)
        self.viewport_height = kwargs.get("viewport_height", 600 if self.text_only else 1080)

        # Collect extra browser args once, so the text-only flags are not
        # clobbered by a later reassignment.
        self.extra_args = kwargs.get("extra_args", [])
        if self.text_only:
            self.extra_args = self.extra_args + [
                '--disable-images',
                '--disable-javascript',
                '--disable-gpu',
                '--disable-software-rasterizer',
                '--disable-dev-shm-usage'
            ]

        self.user_agent = kwargs.get(
            "user_agent",
            "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36"
        )
        user_agent_generator = UserAgentGenerator()
        if kwargs.get("user_agent_mode") == "random":
            self.user_agent = user_agent_generator.generate(
                **kwargs.get("user_agent_generator_config", {})
            )
        self.proxy = kwargs.get("proxy")
        self.proxy_config = kwargs.get("proxy_config")
        self.headless = kwargs.get("headless", True)
        self.browser_type = kwargs.get("browser_type", "chromium")
        self.headers = kwargs.get("headers", {})
        self.browser_hint = user_agent_generator.generate_client_hints(self.user_agent)
        self.headers.setdefault("sec-ch-ua", self.browser_hint)
        self.cookies = kwargs.get("cookies", [])
        self.sessions = {}
        self.session_ttl = 1800  # seconds
        self.js_code = js_code
        self.verbose = kwargs.get("verbose", False)
        self.playwright = None
        self.browser = None
        self.sleep_on_close = kwargs.get("sleep_on_close", False)
        self.use_managed_browser = kwargs.get("use_managed_browser", False)
        self.user_data_dir = kwargs.get("user_data_dir", None)
        self.use_persistent_context = kwargs.get("use_persistent_context", False)
        self.chrome_channel = kwargs.get("chrome_channel", "chrome")
        self.managed_browser = None
        self.default_context = None
        self.hooks = {
            'on_browser_created': None,
            'on_user_agent_updated': None,
            'on_execution_started': None,
            'before_goto': None,
            'after_goto': None,
            'before_return_html': None,
            'before_retrieve_html': None
        }
        self.accept_downloads = kwargs.get("accept_downloads", False)
        self.downloads_path = kwargs.get("downloads_path")
        self._downloaded_files = []  # Track downloaded files for the current crawl
        if self.accept_downloads and not self.downloads_path:
            self.downloads_path = os.path.join(os.getcwd(), "downloads")
            os.makedirs(self.downloads_path, exist_ok=True)

    async def __aenter__(self):
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def start(self):
        if self.playwright is None:
            self.playwright = await async_playwright().start()
        if self.browser is None:
            if self.use_managed_browser:
                # Use the managed browser approach
                self.managed_browser = ManagedBrowser(
                    browser_type=self.browser_type,
                    user_data_dir=self.user_data_dir,
                    headless=self.headless,
                    logger=self.logger
                )
                cdp_url = await self.managed_browser.start()
                self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)

                # Get the default context that maintains the user profile
                contexts = self.browser.contexts
                if contexts:
                    self.default_context = contexts[0]
                else:
                    # If no default context exists, create one
                    self.default_context = await self.browser.new_context(
                        viewport={"width": self.viewport_width, "height": self.viewport_height}
                    )

                # Set up the default context
                if self.default_context:
                    await self.default_context.set_extra_http_headers(self.headers)
                    if self.cookies:
                        await self.default_context.add_cookies(self.cookies)
                    if self.accept_downloads:
                        await self.default_context.set_default_timeout(60000)
                        await self.default_context.set_default_navigation_timeout(60000)
                        # Reaches into Playwright internals to enable downloads on an existing context
                        self.default_context._impl_obj._options["accept_downloads"] = True
                        self.default_context._impl_obj._options["downloads_path"] = self.downloads_path

                    if self.user_agent:
                        await self.default_context.set_extra_http_headers({
                            "User-Agent": self.user_agent,
                            "sec-ch-ua": self.browser_hint,
                        })
            else:
                # Base browser arguments
                browser_args = {
                    "headless": self.headless,
                    "args": [
                        "--no-sandbox",
                        "--disable-dev-shm-usage",
                        "--no-first-run",
                        "--no-default-browser-check",
                        "--disable-infobars",
                        "--window-position=0,0",
                        "--ignore-certificate-errors",
                        "--ignore-certificate-errors-spki-list",
                        "--disable-blink-features=AutomationControlled",
                        "--window-position=400,0",
                        f"--window-size={self.viewport_width},{self.viewport_height}",
                    ]
                }

                if self.light_mode:
                    browser_args["args"].extend([
                        "--disable-background-timer-throttling",
                        "--disable-backgrounding-occluded-windows",
                        "--disable-breakpad",
                        "--disable-client-side-phishing-detection",
                        "--disable-component-extensions-with-background-pages",
                        "--disable-default-apps",
                        "--disable-extensions",
                        "--disable-features=TranslateUI",
                        "--disable-hang-monitor",
                        "--disable-ipc-flooding-protection",
                        "--disable-popup-blocking",
                        "--disable-prompt-on-repost",
                        "--disable-sync",
                        "--force-color-profile=srgb",
                        "--metrics-recording-only",
                        "--no-first-run",
                        "--password-store=basic",
                        "--use-mock-keychain"
                    ])

                if self.text_only:
                    browser_args["args"].extend([
                        '--blink-settings=imagesEnabled=false',
                        '--disable-remote-fonts'
                    ])

                # Add channel if specified (try Chrome first)
                if self.chrome_channel:
                    browser_args["channel"] = self.chrome_channel

                # Add extra args if provided
                if self.extra_args:
                    browser_args["args"].extend(self.extra_args)

                # Add downloads path if downloads are enabled
                if self.accept_downloads:
                    browser_args["downloads_path"] = self.downloads_path

                # Add proxy settings if a proxy is specified
                if self.proxy:
                    proxy_settings = ProxySettings(server=self.proxy)
                    browser_args["proxy"] = proxy_settings
                elif self.proxy_config:
                    proxy_settings = ProxySettings(
                        server=self.proxy_config.get("server"),
                        username=self.proxy_config.get("username"),
                        password=self.proxy_config.get("password")
                    )
                    browser_args["proxy"] = proxy_settings

                try:
                    # Select the appropriate browser based on the browser_type
                    if self.browser_type == "firefox":
                        self.browser = await self.playwright.firefox.launch(**browser_args)
                    elif self.browser_type == "webkit":
                        if "viewport" not in browser_args:
                            browser_args["viewport"] = {"width": self.viewport_width, "height": self.viewport_height}
                        self.browser = await self.playwright.webkit.launch(**browser_args)
                    else:
                        if self.use_persistent_context and self.user_data_dir:
                            self.browser = await self.playwright.chromium.launch_persistent_context(
                                user_data_dir=self.user_data_dir,
                                accept_downloads=self.accept_downloads,
                                downloads_path=self.downloads_path if self.accept_downloads else None,
                                **browser_args
                            )
                            self.default_context = self.browser
                        else:
                            self.browser = await self.playwright.chromium.launch(**browser_args)

                except Exception as e:
                    # Fall back to bundled Chromium if the Chrome channel fails
                    if "chrome" in str(e) and browser_args.get("channel") == "chrome":
                        browser_args["channel"] = "chromium"
                        if self.use_persistent_context and self.user_data_dir:
                            self.browser = await self.playwright.chromium.launch_persistent_context(
                                user_data_dir=self.user_data_dir,
                                **browser_args
                            )
                            self.default_context = self.browser
                        else:
                            self.browser = await self.playwright.chromium.launch(**browser_args)
                    else:
                        raise

        await self.execute_hook('on_browser_created', self.browser)

    async def close(self):
        if self.sleep_on_close:
            await asyncio.sleep(0.5)

        # Close all active sessions
        session_ids = list(self.sessions.keys())
        for session_id in session_ids:
            await self.kill_session(session_id)

        if self.browser:
            await self.browser.close()
            self.browser = None

        if self.managed_browser:
            await asyncio.sleep(0.5)
            await self.managed_browser.cleanup()
            self.managed_browser = None

        if self.playwright:
            await self.playwright.stop()
            self.playwright = None

    # Issue #256: __del__ was removed to avoid running async cleanup from a
    # synchronous destructor.

    def set_hook(self, hook_type: str, hook: Callable):
        if hook_type in self.hooks:
            self.hooks[hook_type] = hook
        else:
            raise ValueError(f"Invalid hook type: {hook_type}")

    async def execute_hook(self, hook_type: str, *args, **kwargs):
        hook = self.hooks.get(hook_type)
        if hook:
            if asyncio.iscoroutinefunction(hook):
                return await hook(*args, **kwargs)
            else:
                return hook(*args, **kwargs)
        return args[0] if args else None
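
    # Hook usage (illustrative sketch; only set_hook/execute_hook above are
    # real API). Hooks may be sync or async and receive the relevant objects:
    #
    #   async def before_goto(page, context=None, **kwargs):
    #       await page.set_extra_http_headers({"X-Debug": "1"})
    #
    #   strategy.set_hook('before_goto', before_goto)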

    def update_user_agent(self, user_agent: str):
        self.user_agent = user_agent

    def set_custom_headers(self, headers: Dict[str, str]):
        self.headers = headers

    async def kill_session(self, session_id: str):
        if session_id in self.sessions:
            context, page, _ = self.sessions[session_id]
            await page.close()
            if not self.use_managed_browser:
                await context.close()
            del self.sessions[session_id]

    def _cleanup_expired_sessions(self):
        current_time = time.time()
        expired_sessions = [
            sid for sid, (_, _, last_used) in self.sessions.items()
            if current_time - last_used > self.session_ttl
        ]
        for sid in expired_sessions:
            asyncio.create_task(self.kill_session(sid))

    async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
        wait_for = wait_for.strip()

        if wait_for.startswith('js:'):
            # Explicitly specified JavaScript
            js_code = wait_for[3:].strip()
            return await self.csp_compliant_wait(page, js_code, timeout)
        elif wait_for.startswith('css:'):
            # Explicitly specified CSS selector
            css_selector = wait_for[4:].strip()
            try:
                await page.wait_for_selector(css_selector, timeout=timeout)
            except Error as e:
                if 'Timeout' in str(e):
                    raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'")
                else:
                    raise ValueError(f"Invalid CSS selector: '{css_selector}'")
        else:
            # Auto-detect based on content
            if wait_for.startswith('()') or wait_for.startswith('function'):
                # It's likely a JavaScript function
                return await self.csp_compliant_wait(page, wait_for, timeout)
            else:
                # Assume it's a CSS selector first
                try:
                    await page.wait_for_selector(wait_for, timeout=timeout)
                except Error as e:
                    if 'Timeout' in str(e):
                        raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'")
                    else:
                        # Not a timeout, so probably an invalid selector;
                        # fall back to evaluating it as a JavaScript function
                        try:
                            return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
                        except Error:
                            raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
                                             "It should be either a valid CSS selector, a JavaScript function, "
                                             "or explicitly prefixed with 'js:' or 'css:'.")

    async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
        wrapper_js = f"""
        async () => {{
            const userFunction = {user_wait_function};
            const startTime = Date.now();
            while (true) {{
                if (await userFunction()) {{
                    return true;
                }}
                if (Date.now() - startTime > {timeout}) {{
                    throw new Error('Timeout waiting for condition');
                }}
                await new Promise(resolve => setTimeout(resolve, 100));
            }}
        }}
        """

        try:
            await page.evaluate(wrapper_js)
        except TimeoutError:
            raise TimeoutError(f"Timeout after {timeout}ms waiting for condition")
        except Exception as e:
            raise RuntimeError(f"Error in wait condition: {str(e)}")

    async def process_iframes(self, page):
        # Find all iframes
        iframes = await page.query_selector_all('iframe')

        for i, iframe in enumerate(iframes):
            try:
                # Add a unique identifier to the iframe
                await iframe.evaluate(f'(element) => element.id = "iframe-{i}"')

                # Get the frame associated with this iframe
                frame = await iframe.content_frame()

                if frame:
                    # Wait for the frame to load
                    await frame.wait_for_load_state('load', timeout=30000)  # 30 second timeout

                    # Extract the content of the iframe's body
                    iframe_content = await frame.evaluate('() => document.body.innerHTML')

                    # Generate a unique class name for this iframe
                    class_name = f'extracted-iframe-content-{i}'

                    # Replace the iframe with a div containing the extracted content
                    _iframe = iframe_content.replace('`', '\\`')
                    await page.evaluate(f"""
                        () => {{
                            const iframe = document.getElementById('iframe-{i}');
                            const div = document.createElement('div');
                            div.innerHTML = `{_iframe}`;
                            div.className = '{class_name}';
                            iframe.replaceWith(div);
                        }}
                    """)
                else:
                    self.logger.warning(
                        message="Could not access content frame for iframe {index}",
                        tag="SCRAPE",
                        params={"index": i}
                    )
            except Exception as e:
                self.logger.error(
                    message="Error processing iframe {index}: {error}",
                    tag="ERROR",
                    params={"index": i, "error": str(e)}
                )

        # Return the page object
        return page

    async def create_session(self, **kwargs) -> str:
        """Creates a new browser session and returns its ID."""
        if not self.browser:
            await self.start()

        session_id = kwargs.get('session_id') or str(uuid.uuid4())

        if self.use_managed_browser:
            page = await self.default_context.new_page()
            self.sessions[session_id] = (self.default_context, page, time.time())
        else:
            if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]:
                context = self.browser
                page = await context.new_page()
            else:
                context = await self.browser.new_context(
                    user_agent=kwargs.get("user_agent", self.user_agent),
                    viewport={"width": self.viewport_width, "height": self.viewport_height},
                    proxy={"server": self.proxy} if self.proxy else None,
                    accept_downloads=self.accept_downloads,
                    ignore_https_errors=True
                )

                if self.cookies:
                    await context.add_cookies(self.cookies)
                await context.set_extra_http_headers(self.headers)
                page = await context.new_page()

            self.sessions[session_id] = (context, page, time.time())

        return session_id
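
    # Session reuse (sketch): a session keeps its page alive between crawl()
    # calls and expires after session_ttl (30 minutes) of inactivity.
    #
    #   sid = await strategy.create_session()
    #   await strategy.crawl("https://example.com/login", session_id=sid)
    #   await strategy.crawl("https://example.com/account", session_id=sid, js_only=True)
    #   await strategy.kill_session(sid)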

    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        """
        Crawls a given URL or processes raw HTML/local file content based on the URL prefix.

        Args:
            url (str): The URL to crawl. Supported prefixes:
                - 'http://' or 'https://': Web URL to crawl.
                - 'file://': Local file path to process.
                - 'raw:': Raw HTML content to process.
            **kwargs: Additional parameters:
                - 'screenshot' (bool): Whether to take a screenshot.
                - ... [other existing parameters]

        Returns:
            AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
        """
        response_headers = {}
        status_code = 200  # Default to 200 for local/raw HTML
        screenshot_requested = kwargs.get('screenshot', False)
        screenshot_data = None

        if url.startswith(('http://', 'https://')):
            # Proceed with standard web crawling
            return await self._crawl_web(url, **kwargs)

        elif url.startswith('file://'):
            # Process a local file
            local_file_path = url[7:]  # Remove 'file://' prefix
            if not os.path.exists(local_file_path):
                raise FileNotFoundError(f"Local file not found: {local_file_path}")
            with open(local_file_path, 'r', encoding='utf-8') as f:
                html = f.read()
            if screenshot_requested:
                screenshot_data = await self._generate_screenshot_from_html(html)
            return AsyncCrawlResponse(
                html=html,
                response_headers=response_headers,
                status_code=status_code,
                screenshot=screenshot_data,
                get_delayed_content=None
            )

        elif url.startswith('raw:'):
            # Process raw HTML content
            raw_html = url[4:]  # Remove 'raw:' prefix
            html = raw_html
            if screenshot_requested:
                screenshot_data = await self._generate_screenshot_from_html(html)
            return AsyncCrawlResponse(
                html=html,
                response_headers=response_headers,
                status_code=status_code,
                screenshot=screenshot_data,
                get_delayed_content=None
            )
        else:
            raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'")
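
    # The three input forms all return an AsyncCrawlResponse (illustrative):
    #
    #   await strategy.crawl("https://example.com")
    #   await strategy.crawl("file:///tmp/saved_page.html")
    #   await strategy.crawl("raw:<html><body>Hello</body></html>")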

    async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse:
        """
        Standard web-crawling logic for http(s) URLs.

        Args:
            url (str): The web URL to crawl.
            **kwargs: Additional parameters.

        Returns:
            AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
        """
        response_headers = {}
        status_code = None

        # Reset downloaded files list for the new crawl
        self._downloaded_files = []

        self._cleanup_expired_sessions()
        session_id = kwargs.get("session_id")

        # A user_agent passed in kwargs overrides the default user_agent
        user_agent = kwargs.get("user_agent", self.user_agent)

        # Generate a random user agent if magic mode is enabled and user_agent_mode is not random
        if kwargs.get("user_agent_mode") != "random" and kwargs.get("magic", False):
            user_agent = UserAgentGenerator().generate(
                **kwargs.get("user_agent_generator_config", {})
            )

        # Handle page creation differently for managed browser
        context = None
        if self.use_managed_browser:
            if session_id:
                # Reuse existing session if available
                context, page, _ = self.sessions.get(session_id, (None, None, None))
                if not page:
                    # Create a new page in the default context if the session doesn't exist
                    page = await self.default_context.new_page()
                    self.sessions[session_id] = (self.default_context, page, time.time())
            else:
                # Create a new page in the default context for non-session requests
                page = await self.default_context.new_page()
        else:
            if session_id:
                context, page, _ = self.sessions.get(session_id, (None, None, None))
                if not context:
                    if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]:
                        # In a persistent context, the browser is the context
                        context = self.browser
                    else:
                        # Normal context creation for non-persistent or non-Chrome browsers
                        context = await self.browser.new_context(
                            user_agent=user_agent,
                            viewport={"width": self.viewport_width, "height": self.viewport_height},
                            proxy={"server": self.proxy} if self.proxy else None,
                            java_script_enabled=True,
                            accept_downloads=self.accept_downloads,
                        )
                        await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}])
                        if self.cookies:
                            await context.add_cookies(self.cookies)
                        await context.set_extra_http_headers(self.headers)

                    page = await context.new_page()
                    self.sessions[session_id] = (context, page, time.time())
            else:
                if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]:
                    # In a persistent context, the browser is the context
                    context = self.browser
                else:
                    # Normal context creation
                    context = await self.browser.new_context(
                        user_agent=user_agent,
                        viewport={"width": self.viewport_width, "height": self.viewport_height},
                        proxy={"server": self.proxy} if self.proxy else None,
                        accept_downloads=self.accept_downloads,
                        ignore_https_errors=True
                    )
                if self.cookies:
                    await context.add_cookies(self.cookies)
                await context.set_extra_http_headers(self.headers)

                if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False):
                    # Inject scripts to override navigator properties
                    await context.add_init_script("""
                        // Pass the Permissions Test.
                        const originalQuery = window.navigator.permissions.query;
                        window.navigator.permissions.query = (parameters) => (
                            parameters.name === 'notifications' ?
                                Promise.resolve({ state: Notification.permission }) :
                                originalQuery(parameters)
                        );
                        Object.defineProperty(navigator, 'webdriver', {
                            get: () => undefined
                        });
                        window.navigator.chrome = {
                            runtime: {},
                            // Add other properties if necessary
                        };
                        Object.defineProperty(navigator, 'plugins', {
                            get: () => [1, 2, 3, 4, 5],
                        });
                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['en-US', 'en'],
                        });
                        Object.defineProperty(document, 'hidden', {
                            get: () => false
                        });
                        Object.defineProperty(document, 'visibilityState', {
                            get: () => 'visible'
                        });
                    """)

                page = await context.new_page()
                if kwargs.get("magic", False):
                    await stealth_async(page, stealth_config)

        # Add console message and error logging
        if kwargs.get("log_console", False):
            page.on("console", lambda msg: print(f"Console: {msg.text}"))
            page.on("pageerror", lambda exc: print(f"Page Error: {exc}"))

        try:
            # Set up download handling if enabled
            if self.accept_downloads:
                page.on("download", lambda download: asyncio.create_task(self._handle_download(download)))

            if self.use_cached_html:
                cache_file_path = os.path.join(
                    os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
                )
                if os.path.exists(cache_file_path):
                    html = ""
                    with open(cache_file_path, "r") as f:
                        html = f.read()
                    # Retrieve response headers and status code from the cache
                    with open(cache_file_path + ".meta", "r") as f:
                        meta = json.load(f)
                        response_headers = meta.get("response_headers", {})
                        status_code = meta.get("status_code")
                    response = AsyncCrawlResponse(
                        html=html, response_headers=response_headers, status_code=status_code
                    )
                    return response

            if not kwargs.get("js_only", False):
                await self.execute_hook('before_goto', page, context=context)

                try:
                    response = await page.goto(
                        url,
                        wait_until=kwargs.get("wait_until", "domcontentloaded"),
                        timeout=kwargs.get("page_timeout", 60000),
                    )
                except Error as e:
                    raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")

                await self.execute_hook('after_goto', page, context=context)

                # Get status code and headers
                status_code = response.status
                response_headers = response.headers
            else:
                status_code = 200
                response_headers = {}

            # Robust body check: first wait for the body to exist, then for it to become visible
            try:
                # First wait for body to exist, regardless of visibility
                await page.wait_for_selector('body', state='attached', timeout=30000)

                # Then wait for it to become visible by checking CSS
                await page.wait_for_function("""
                    () => {
                        const body = document.body;
                        const style = window.getComputedStyle(body);
                        return style.display !== 'none' &&
                               style.visibility !== 'hidden' &&
                               style.opacity !== '0';
                    }
                """, timeout=30000)

            except Error as e:
                # If waiting fails, try to diagnose the issue
                visibility_info = await page.evaluate("""
                    () => {
                        const body = document.body;
                        const style = window.getComputedStyle(body);
                        return {
                            display: style.display,
                            visibility: style.visibility,
                            opacity: style.opacity,
                            hasContent: body.innerHTML.length,
                            classList: Array.from(body.classList)
                        }
                    }
                """)

                if self.verbose:
                    print(f"Body visibility debug info: {visibility_info}")

                # Even if the body is hidden, we might still want to proceed
                if kwargs.get('ignore_body_visibility', True):
                    if self.verbose:
                        print("Proceeding despite hidden body...")
                else:
                    raise Error(f"Body element is hidden: {visibility_info}")

            # CONTENT LOADING ASSURANCE
            if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)):
                # Wait for network idle after the initial load, then for images to finish loading
                await page.wait_for_load_state("networkidle")
                await asyncio.sleep(0.1)
                await page.wait_for_function("Array.from(document.images).every(img => img.complete)")

            # After the initial load, adjust the viewport to the content size
            if not self.text_only and kwargs.get("adjust_viewport_to_content", False):
                try:
                    # Get actual page dimensions
                    page_width = await page.evaluate("document.documentElement.scrollWidth")
                    page_height = await page.evaluate("document.documentElement.scrollHeight")

                    target_width = self.viewport_width
                    # Preserve the page's aspect ratio: height scales with page_height/page_width
                    target_height = int(target_width * page_height / page_width * 0.95)
                    await page.set_viewport_size({"width": target_width, "height": target_height})

                    # Compute the scale factor.
                    # We want the entire page visible: the scale should make both width and height fit.
                    scale = min(target_width / page_width, target_height / page_height)

                    # Now call CDP to set metrics.
                    # We tell Chrome that the "device" is page_width x page_height in size,
                    # but we scale it down so everything fits within the real viewport.
                    cdp = await page.context.new_cdp_session(page)
                    await cdp.send('Emulation.setDeviceMetricsOverride', {
                        'width': page_width,        # full page width
                        'height': page_height,      # full page height
                        'deviceScaleFactor': 1,     # keep normal DPR
                        'mobile': False,
                        'scale': scale              # scale the entire rendered content
                    })

                except Exception as e:
                    self.logger.warning(
                        message="Failed to adjust viewport to content: {error}",
                        tag="VIEWPORT",
                        params={"error": str(e)}
                    )

            # After viewport adjustment, handle page scanning if requested
            if kwargs.get("scan_full_page", False):
                try:
                    viewport_height = page.viewport_size.get("height", self.viewport_height)
                    current_position = viewport_height  # Start with one viewport height
                    scroll_delay = kwargs.get("scroll_delay", 0.2)

                    # Initial scroll
                    await page.evaluate(f"window.scrollTo(0, {current_position})")
                    await asyncio.sleep(scroll_delay)

                    # Get the height after the first scroll to account for any dynamic content
                    total_height = await page.evaluate("document.documentElement.scrollHeight")

                    while current_position < total_height:
                        current_position = min(current_position + viewport_height, total_height)
                        await page.evaluate(f"window.scrollTo(0, {current_position})")
                        await asyncio.sleep(scroll_delay)

                        # Check for dynamic content
                        new_height = await page.evaluate("document.documentElement.scrollHeight")
                        if new_height > total_height:
                            total_height = new_height

                    # Scroll back to the top
                    await page.evaluate("window.scrollTo(0, 0)")

                except Exception as e:
                    self.logger.warning(
                        message="Failed to perform full page scan: {error}",
                        tag="PAGE_SCAN",
                        params={"error": str(e)}
                    )
            else:
                # Scroll to the bottom of the page
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

            js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
            if js_code:
                if isinstance(js_code, str):
                    await page.evaluate(js_code)
                elif isinstance(js_code, list):
                    for js in js_code:
                        await page.evaluate(js)

            # Check for the on-execution event
            await self.execute_hook('on_execution_started', page, context=context)

            if kwargs.get("simulate_user", False) or kwargs.get("magic", False):
                # Simulate user interactions
                await page.mouse.move(100, 100)
                await page.mouse.down()
                await page.mouse.up()
                await page.keyboard.press('ArrowDown')

            # Handle the wait_for parameter
            wait_for = kwargs.get("wait_for")
            if wait_for:
                try:
                    await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000))
                except Exception as e:
                    raise RuntimeError(f"Wait condition failed: {str(e)}")

            # Update image dimensions
            if not self.text_only:
                update_image_dimensions_js = """
                () => {
                    return new Promise((resolve) => {
                        const filterImage = (img) => {
                            // Filter out images that are too small
                            if (img.width < 100 && img.height < 100) return false;

                            // Filter out images that are not visible
                            const rect = img.getBoundingClientRect();
                            if (rect.width === 0 || rect.height === 0) return false;

                            // Filter out images with certain class names (e.g., icons, thumbnails)
                            if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false;

                            // Filter out images with certain patterns in their src (e.g., placeholder images)
                            if (img.src.includes('placeholder') || img.src.includes('icon')) return false;

                            return true;
                        };

                        const images = Array.from(document.querySelectorAll('img')).filter(filterImage);
                        let imagesLeft = images.length;

                        if (imagesLeft === 0) {
                            resolve();
                            return;
                        }

                        const checkImage = (img) => {
                            if (img.complete && img.naturalWidth !== 0) {
                                img.setAttribute('width', img.naturalWidth);
                                img.setAttribute('height', img.naturalHeight);
                                imagesLeft--;
                                if (imagesLeft === 0) resolve();
                            }
                        };

                        images.forEach(img => {
                            checkImage(img);
                            if (!img.complete) {
                                img.onload = () => {
                                    checkImage(img);
                                };
                                img.onerror = () => {
                                    imagesLeft--;
                                    if (imagesLeft === 0) resolve();
                                };
                            }
                        });

                        // Fallback timeout of 5 seconds
                        // setTimeout(() => resolve(), 5000);
                        resolve();
                    });
                }
                """

                try:
                    try:
                        await page.wait_for_load_state(
                            state="domcontentloaded",
                            timeout=5  # milliseconds; a timeout here is swallowed below
                        )
                    except PlaywrightTimeoutError:
                        pass
                    await page.evaluate(update_image_dimensions_js)
                except Exception as e:
                    self.logger.error(
                        message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}",
                        tag="ERROR",
                        params={"error": str(e)}
                    )

            # Process iframes
            if kwargs.get("process_iframes", False):
                page = await self.process_iframes(page)

            await self.execute_hook('before_retrieve_html', page, context=context)
            # If delay_before_return_html is set, wait for that time
            delay_before_return_html = kwargs.get("delay_before_return_html", 0.1)
            if delay_before_return_html:
                await asyncio.sleep(delay_before_return_html)

            # Check for the remove_overlay_elements parameter
            if kwargs.get("remove_overlay_elements", False):
                await self.remove_overlay_elements(page)

            html = await page.content()
            await self.execute_hook('before_return_html', page, html, context=context)

            # If kwargs has screenshot=True, take a screenshot
            screenshot_data = None
            if kwargs.get("screenshot"):
                # If a screenshot_wait_for parameter is given, simply wait for that time
                screenshot_wait_for = kwargs.get("screenshot_wait_for")
                if screenshot_wait_for:
                    await asyncio.sleep(screenshot_wait_for)
                screenshot_data = await self.take_screenshot(page)

            if self.use_cached_html:
                cache_file_path = os.path.join(
                    os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
                )
                with open(cache_file_path, "w", encoding="utf-8") as f:
                    f.write(html)
                # Store response headers and status code in the cache
                with open(cache_file_path + ".meta", "w", encoding="utf-8") as f:
                    json.dump({
                        "response_headers": response_headers,
                        "status_code": status_code
                    }, f)

            async def get_delayed_content(delay: float = 5.0) -> str:
                if self.verbose:
                    print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
                await asyncio.sleep(delay)
                return await page.content()

            response = AsyncCrawlResponse(
                html=html,
                response_headers=response_headers,
                status_code=status_code,
                screenshot=screenshot_data,
                get_delayed_content=get_delayed_content,
                downloaded_files=self._downloaded_files if self._downloaded_files else None
            )
            return response
        except Error as e:
            raise Error(f"async_crawler_strategy.py:_crawl_web(): {str(e)}")

    async def _handle_download(self, download):
        """Handle file downloads."""
        try:
            suggested_filename = download.suggested_filename
            download_path = os.path.join(self.downloads_path, suggested_filename)

            self.logger.info(
                message="Downloading {filename} to {path}",
                tag="FETCH",
                params={"filename": suggested_filename, "path": download_path}
            )

            start_time = time.perf_counter()
            await download.save_as(download_path)
            end_time = time.perf_counter()
            self._downloaded_files.append(download_path)

            self.logger.success(
                message="Downloaded {filename} successfully",
                tag="COMPLETE",
                params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"}
            )
        except Exception as e:
            self.logger.error(
                message="Failed to handle download: {error}",
                tag="ERROR",
                params={"error": str(e)}
            )

    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
        semaphore_count = kwargs.get('semaphore_count', 5)  # Adjust as needed
        semaphore = asyncio.Semaphore(semaphore_count)

        async def crawl_with_semaphore(url):
            async with semaphore:
                return await self.crawl(url, **kwargs)

        tasks = [crawl_with_semaphore(url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return [result if not isinstance(result, Exception) else str(result) for result in results]
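
    # Concurrency sketch: at most semaphore_count crawls run at once, and a
    # failed URL yields its error string instead of raising, e.g.:
    #
    #   results = await strategy.crawl_many(urls, semaphore_count=10)
    #   ok = [r for r in results if not isinstance(r, str)]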

    async def remove_overlay_elements(self, page: Page) -> None:
        """
        Removes popup overlays, modals, cookie notices, and other intrusive elements from the page.

        Args:
            page (Page): The Playwright page instance
        """
        remove_overlays_js = """
        async () => {
            // Function to check if an element is visible
            const isVisible = (elem) => {
                const style = window.getComputedStyle(elem);
                return style.display !== 'none' &&
                       style.visibility !== 'hidden' &&
                       style.opacity !== '0';
            };

            // Common selectors for popups and overlays
            const commonSelectors = [
                // Close buttons first
                'button[class*="close" i]', 'button[class*="dismiss" i]',
                'button[aria-label*="close" i]', 'button[title*="close" i]',
                'a[class*="close" i]', 'span[class*="close" i]',

                // Cookie notices
                '[class*="cookie-banner" i]', '[id*="cookie-banner" i]',
                '[class*="cookie-consent" i]', '[id*="cookie-consent" i]',

                // Newsletter/subscription dialogs
                '[class*="newsletter" i]', '[class*="subscribe" i]',

                // Generic popups/modals
                '[class*="popup" i]', '[class*="modal" i]',
                '[class*="overlay" i]', '[class*="dialog" i]',
                '[role="dialog"]', '[role="alertdialog"]'
            ];

            // Try to click close buttons first
            for (const selector of commonSelectors.slice(0, 6)) {
                const closeButtons = document.querySelectorAll(selector);
                for (const button of closeButtons) {
                    if (isVisible(button)) {
                        try {
                            button.click();
                            await new Promise(resolve => setTimeout(resolve, 100));
                        } catch (e) {
                            console.log('Error clicking button:', e);
                        }
                    }
                }
            }

            // Remove remaining overlay elements
            const removeOverlays = () => {
                // Find elements with a high z-index
                const allElements = document.querySelectorAll('*');
                for (const elem of allElements) {
                    const style = window.getComputedStyle(elem);
                    const zIndex = parseInt(style.zIndex);
                    const position = style.position;

                    if (
                        isVisible(elem) &&
                        (zIndex > 999 || position === 'fixed' || position === 'absolute') &&
                        (
                            elem.offsetWidth > window.innerWidth * 0.5 ||
                            elem.offsetHeight > window.innerHeight * 0.5 ||
                            style.backgroundColor.includes('rgba') ||
                            parseFloat(style.opacity) < 1
                        )
                    ) {
                        elem.remove();
                    }
                }

                // Remove elements matching common selectors
                for (const selector of commonSelectors) {
                    const elements = document.querySelectorAll(selector);
                    elements.forEach(elem => {
                        if (isVisible(elem)) {
                            elem.remove();
                        }
                    });
                }
            };

            removeOverlays();

            // Remove any fixed/sticky position elements at the top/bottom
            const removeFixedElements = () => {
                const elements = document.querySelectorAll('*');
                elements.forEach(elem => {
                    const style = window.getComputedStyle(elem);
                    if (
                        (style.position === 'fixed' || style.position === 'sticky') &&
                        isVisible(elem)
                    ) {
                        elem.remove();
                    }
                });
            };

            removeFixedElements();

            // Remove empty block elements such as div, p, span, etc.
            const removeEmptyBlockElements = () => {
                const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6');
                blockElements.forEach(elem => {
                    if (elem.innerText.trim() === '') {
                        elem.remove();
                    }
                });
            };

            // Remove margin-right and padding-right from body (often added by modal scripts)
            document.body.style.marginRight = '0px';
            document.body.style.paddingRight = '0px';
            document.body.style.overflow = 'auto';

            // Wait a bit for any animations to complete
            await new Promise(resolve => setTimeout(resolve, 100));
        }
        """

        try:
            await page.evaluate(remove_overlays_js)
            await page.wait_for_timeout(500)  # Wait for any animations to complete
        except Exception as e:
            self.logger.warning(
                message="Failed to remove overlay elements: {error}",
                tag="SCRAPE",
                params={"error": str(e)}
            )

    async def take_screenshot(self, page: Page) -> str:
        """
        Takes a full-page screenshot of the current page.

        Args:
            page (Page): The Playwright page instance

        Returns:
            str: Base64-encoded screenshot image
        """
        try:
            # The page is already loaded; just take the screenshot
            screenshot = await page.screenshot(full_page=True)
            return base64.b64encode(screenshot).decode('utf-8')
        except Exception as e:
            error_message = f"Failed to take screenshot: {str(e)}"
            self.logger.error(
                message="Screenshot failed: {error}",
                tag="ERROR",
                params={"error": error_message}
            )

            # Generate an error image
            img = Image.new('RGB', (800, 600), color='black')
            draw = ImageDraw.Draw(img)
            font = ImageFont.load_default()
            draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)

            buffered = BytesIO()
            img.save(buffered, format="JPEG")
            return base64.b64encode(buffered.getvalue()).decode('utf-8')
        finally:
            await page.close()

    async def _generate_screenshot_from_html(self, html: str) -> Optional[str]:
        """
        Generates a screenshot from raw HTML content.

        Args:
            html (str): The HTML content to render and capture.

        Returns:
            Optional[str]: Base64-encoded screenshot image, or an error image on failure.
        """
        try:
            if not self.browser:
                await self.start()
            page = await self.browser.new_page()
            await page.set_content(html, wait_until='networkidle')
            screenshot = await page.screenshot(full_page=True)
            await page.close()
            return base64.b64encode(screenshot).decode('utf-8')
        except Exception as e:
            error_message = f"Failed to take screenshot: {str(e)}"
            self.logger.error(
                message="Screenshot failed: {error}",
                tag="ERROR",
                params={"error": error_message}
            )

            # Generate an error image
            img = Image.new('RGB', (800, 600), color='black')
            draw = ImageDraw.Draw(img)
            font = ImageFont.load_default()
            draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)

            buffered = BytesIO()
            img.save(buffered, format="JPEG")
            return base64.b64encode(buffered.getvalue()).decode('utf-8')