feat(crawler): Enhance stealth and flexibility, improve error handling
- Implement playwright_stealth for better bot detection avoidance
- Add user simulation and navigator override options
- Improve iframe processing and browser selection
- Enhance error reporting and debugging capabilities
- Optimize image processing and parallel crawling
- Add a new example for the user simulation feature
- Add support for including links in Markdown content, by defining a new flag `include_links_on_markdown` in the `crawl` method
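A minimal usage sketch of the new options (an assumption, not code from this commit: it presumes crawl4ai's `AsyncWebCrawler` forwards these kwargs to the strategy's `crawl`; only the kwarg names come from the diff below):

# Hedged sketch: exercising the kwargs introduced in this commit.
# Assumes AsyncWebCrawler forwards its kwargs to AsyncPlaywrightCrawlerStrategy.
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(browser_type="firefox", headless=True) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            simulate_user=True,               # new: mouse/keyboard interaction
            override_navigator=True,          # new: navigator property overrides
            include_links_on_markdown=True,   # new: keep links in Markdown output
        )
        print(result.markdown[:300])

asyncio.run(main())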
@@ -1,17 +1,35 @@
 import asyncio
-import base64, time
+import base64
+import time
 from abc import ABC, abstractmethod
 from typing import Callable, Dict, Any, List, Optional, Awaitable
 import os
 from playwright.async_api import async_playwright, Page, Browser, Error
 from io import BytesIO
 from PIL import Image, ImageDraw, ImageFont
 from .utils import sanitize_input_encode, calculate_semaphore_count
-import json, uuid
-import hashlib
 from pathlib import Path
 from playwright.async_api import ProxySettings
 from pydantic import BaseModel
+import hashlib
+import json
+import uuid
+from playwright_stealth import StealthConfig, stealth_async
+
+stealth_config = StealthConfig(
+    webdriver=True,
+    chrome_app=True,
+    chrome_csi=True,
+    chrome_load_times=True,
+    chrome_runtime=True,
+    navigator_languages=True,
+    navigator_plugins=True,
+    navigator_permissions=True,
+    webgl_vendor=True,
+    outerdimensions=True,
+    navigator_hardware_concurrency=True,
+    media_codecs=True,
+)
 
 
 class AsyncCrawlResponse(BaseModel):
     html: str
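`StealthConfig` above toggles playwright_stealth's individual evasions; `stealth_async(page, config)` applies them to a page. A self-contained sketch of that intended use (the diff itself leaves the per-page call commented out further down):

# Sketch: applying the playwright_stealth evasions to a page.
# stealth_async(page, config) is the library's entry point; this standalone
# example is illustration only, not part of the commit.
import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import StealthConfig, stealth_async

async def main():
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        await stealth_async(page, StealthConfig(webdriver=True, navigator_plugins=True))
        await page.goto("https://example.com")
        print(await page.title())
        await browser.close()

asyncio.run(main())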
@@ -47,10 +65,14 @@ class AsyncCrawlerStrategy(ABC):
 class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
     def __init__(self, use_cached_html=False, js_code=None, **kwargs):
         self.use_cached_html = use_cached_html
-        self.user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+        self.user_agent = kwargs.get(
+            "user_agent",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        )
         self.proxy = kwargs.get("proxy")
         self.headless = kwargs.get("headless", True)
-        self.browser_type = kwargs.get("browser_type", "chromium")  # New parameter
+        self.browser_type = kwargs.get("browser_type", "chromium")
         self.headers = kwargs.get("headers", {})
         self.sessions = {}
         self.session_ttl = 1800
@@ -83,9 +105,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             "headless": self.headless,
             "args": [
                 "--disable-gpu",
                 "--disable-dev-shm-usage",
                 "--disable-setuid-sandbox",
                 "--no-sandbox",
-                "--disable-dev-shm-usage",
+                "--disable-blink-features=AutomationControlled",
+                "--disable-infobars",
+                "--window-position=0,0",
+                "--ignore-certificate-errors",
+                "--ignore-certificate-errors-spki-list",
+                # "--headless=new",  # Use the new headless mode
             ]
         }
@@ -94,7 +121,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             proxy_settings = ProxySettings(server=self.proxy)
             browser_args["proxy"] = proxy_settings
-
         # Select the appropriate browser based on the browser_type
         if self.browser_type == "firefox":
             self.browser = await self.playwright.firefox.launch(**browser_args)
@@ -147,8 +173,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
 
     def _cleanup_expired_sessions(self):
         current_time = time.time()
-        expired_sessions = [sid for sid, (_, _, last_used) in self.sessions.items()
-                            if current_time - last_used > self.session_ttl]
+        expired_sessions = [
+            sid for sid, (_, _, last_used) in self.sessions.items()
+            if current_time - last_used > self.session_ttl
+        ]
         for sid in expired_sessions:
             asyncio.create_task(self.kill_session(sid))
 
@@ -188,8 +216,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
             except Error:
                 raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
-                                 "It should be either a valid CSS selector, a JavaScript function, "
-                                 "or explicitly prefixed with 'js:' or 'css:'.")
+                    "It should be either a valid CSS selector, a JavaScript function, "
+                    "or explicitly prefixed with 'js:' or 'css:'.")
 
     async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
         wrapper_js = f"""
@@ -254,8 +282,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 print(f"Error processing iframe {i}: {str(e)}")
 
-        # Return the page object
-        return page
+        return page
 
     async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
         response_headers = {}
@@ -268,6 +295,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             if not context:
                 context = await self.browser.new_context(
                     user_agent=self.user_agent,
+                    viewport={"width": 1920, "height": 1080},
                     proxy={"server": self.proxy} if self.proxy else None
                 )
                 await context.set_extra_http_headers(self.headers)
@@ -275,18 +303,58 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             self.sessions[session_id] = (context, page, time.time())
         else:
             context = await self.browser.new_context(
                 user_agent=self.user_agent,
+                viewport={"width": 1920, "height": 1080},
                 proxy={"server": self.proxy} if self.proxy else None
             )
             await context.set_extra_http_headers(self.headers)
+
+            if kwargs.get("override_navigator", False):
+                # Inject scripts to override navigator properties
+                await context.add_init_script("""
+                    // Pass the Permissions Test.
+                    const originalQuery = window.navigator.permissions.query;
+                    window.navigator.permissions.query = (parameters) => (
+                        parameters.name === 'notifications' ?
+                            Promise.resolve({ state: Notification.permission }) :
+                            originalQuery(parameters)
+                    );
+                    Object.defineProperty(navigator, 'webdriver', {
+                        get: () => undefined
+                    });
+                    window.navigator.chrome = {
+                        runtime: {},
+                        // Add other properties if necessary
+                    };
+                    Object.defineProperty(navigator, 'plugins', {
+                        get: () => [1, 2, 3, 4, 5],
+                    });
+                    Object.defineProperty(navigator, 'languages', {
+                        get: () => ['en-US', 'en'],
+                    });
+                    Object.defineProperty(document, 'hidden', {
+                        get: () => false
+                    });
+                    Object.defineProperty(document, 'visibilityState', {
+                        get: () => 'visible'
+                    });
+                """)
 
             page = await context.new_page()
+            # await stealth_async(page)  #, stealth_config)
+
+            # Add console message and error logging
+            page.on("console", lambda msg: print(f"Console: {msg.text}"))
+            page.on("pageerror", lambda exc: print(f"Page Error: {exc}"))
 
         try:
             if self.verbose:
                 print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
 
             if self.use_cached_html:
-                cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
+                cache_file_path = os.path.join(
+                    Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                )
                 if os.path.exists(cache_file_path):
                     html = ""
                     with open(cache_file_path, "r") as f:
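The `override_navigator` branch above relies on `BrowserContext.add_init_script`, which runs the injected JavaScript before any page script executes. A reduced, self-contained sketch of the same mechanism, trimmed to a single property (not code from this commit):

# Sketch: verifying an init-script navigator override on a fresh page.
import asyncio
from playwright.async_api import async_playwright

OVERRIDE_JS = "Object.defineProperty(navigator, 'webdriver', { get: () => undefined });"

async def main():
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        context = await browser.new_context()
        # Same mechanism as the diff's override_navigator branch, one property only.
        await context.add_init_script(OVERRIDE_JS)
        page = await context.new_page()
        await page.goto("about:blank")
        print(await page.evaluate("navigator.webdriver"))  # -> None once overridden
        await browser.close()

asyncio.run(main())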
@@ -296,12 +364,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                         meta = json.load(f)
                         response_headers = meta.get("response_headers", {})
                         status_code = meta.get("status_code")
-                        response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
+                        response = AsyncCrawlResponse(
+                            html=html, response_headers=response_headers, status_code=status_code
+                        )
                         return response
 
             if not kwargs.get("js_only", False):
                 await self.execute_hook('before_goto', page)
-                response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000))
+                response = await page.goto(
+                    url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)
+                )
+
+                # response = await page.goto("about:blank")
+                # await page.evaluate(f"window.location.href = '{url}'")
+
                 await self.execute_hook('after_goto', page)
 
             # Get status code and headers
@@ -311,37 +388,29 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 status_code = 200
                 response_headers = {}
 
             await page.wait_for_selector('body')
             await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
 
             js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
             if js_code:
                 if isinstance(js_code, str):
-                    r = await page.evaluate(js_code)
+                    await page.evaluate(js_code)
                 elif isinstance(js_code, list):
                     for js in js_code:
                         await page.evaluate(js)
 
                 # await page.wait_for_timeout(100)
                 await page.wait_for_load_state('networkidle')
-                # Check for on execution even
+                # Check for on execution event
                 await self.execute_hook('on_execution_started', page)
 
-            # New code to handle the wait_for parameter
-            # Example usage:
-            # await crawler.crawl(
-            #     url,
-            #     js_code="// some JavaScript code",
-            #     wait_for="""() => {
-            #         return document.querySelector('#my-element') !== null;
-            #     }"""
-            # )
-            # Example of using a CSS selector:
-            # await crawler.crawl(
-            #     url,
-            #     wait_for="#my-element"
-            # )
+            if kwargs.get("simulate_user", False):
+                # Simulate user interactions
+                await page.mouse.move(100, 100)
+                await page.mouse.down()
+                await page.mouse.up()
+                await page.keyboard.press('ArrowDown')
+
+            # Handle the wait_for parameter
             wait_for = kwargs.get("wait_for")
             if wait_for:
                 try:
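The commit message mentions a new example for the user-simulation feature; a sketch of how `simulate_user` composes with the `wait_for` handling in this hunk (assuming the strategy exposes an async context manager for browser startup/teardown; the URL is a placeholder):

# Hedged sketch: simulate_user plus wait_for, driving the strategy directly.
import asyncio
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

async def main():
    async with AsyncPlaywrightCrawlerStrategy(headless=True) as strategy:
        response = await strategy.crawl(
            "https://example.com",
            simulate_user=True,       # mouse move/click and ArrowDown, per this hunk
            wait_for="css:#content",  # or a "js:() => ..." predicate, per the wait_for handling
        )
        print(response.status_code, len(response.html))

asyncio.run(main())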
@@ -349,13 +418,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             except Exception as e:
                 raise RuntimeError(f"Wait condition failed: {str(e)}")
 
-            # Check if kwargs has screenshot=True then take screenshot
-            screenshot_data = None
-            if kwargs.get("screenshot"):
-                screenshot_data = await self.take_screenshot(url)
-
-            # New code to update image dimensions
+            # Update image dimensions
             update_image_dimensions_js = """
             () => {
                 return new Promise((resolve) => {
@@ -428,12 +493,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             html = await page.content()
             await self.execute_hook('before_return_html', page, html)
 
+            # Check if kwargs has screenshot=True then take screenshot
+            screenshot_data = None
+            if kwargs.get("screenshot"):
+                screenshot_data = await self.take_screenshot(url)
+
             if self.verbose:
                 print(f"[LOG] ✅ Crawled {url} successfully!")
 
             if self.use_cached_html:
-                cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
+                cache_file_path = os.path.join(
+                    Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                )
                 with open(cache_file_path, "w", encoding="utf-8") as f:
                     f.write(html)
                 # store response headers and status code in cache
@@ -443,7 +515,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                         "status_code": status_code
                     }, f)
 
-
            async def get_delayed_content(delay: float = 5.0) -> str:
                 if self.verbose:
                     print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
@@ -463,59 +534,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         finally:
             if not session_id:
                 await page.close()
                 await context.close()
 
-        # try:
-        #     html = await _crawl()
-        #     return sanitize_input_encode(html)
-        # except Error as e:
-        #     raise Error(f"Failed to crawl {url}: {str(e)}")
-        # except Exception as e:
-        #     raise Exception(f"Failed to crawl {url}: {str(e)}")
-
-    async def execute_js(self, session_id: str, js_code: str, wait_for_js: str = None, wait_for_css: str = None) -> AsyncCrawlResponse:
-        """
-        Execute JavaScript code in a specific session and optionally wait for a condition.
-
-        :param session_id: The ID of the session to execute the JS code in.
-        :param js_code: The JavaScript code to execute.
-        :param wait_for_js: JavaScript condition to wait for after execution.
-        :param wait_for_css: CSS selector to wait for after execution.
-        :return: AsyncCrawlResponse containing the page's HTML and other information.
-        :raises ValueError: If the session does not exist.
-        """
-        if not session_id:
-            raise ValueError("Session ID must be provided")
-
-        if session_id not in self.sessions:
-            raise ValueError(f"No active session found for session ID: {session_id}")
-
-        context, page, last_used = self.sessions[session_id]
-
-        try:
-            await page.evaluate(js_code)
-
-            if wait_for_js:
-                await page.wait_for_function(wait_for_js)
-
-            if wait_for_css:
-                await page.wait_for_selector(wait_for_css)
-
-            # Get the updated HTML content
-            html = await page.content()
-
-            # Get response headers and status code (assuming these are available)
-            response_headers = await page.evaluate("() => JSON.stringify(performance.getEntriesByType('resource')[0].responseHeaders)")
-            status_code = await page.evaluate("() => performance.getEntriesByType('resource')[0].responseStatus")
-
-            # Update the last used time for this session
-            self.sessions[session_id] = (context, page, time.time())
-
-            return AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
-        except Error as e:
-            raise Error(f"Failed to execute JavaScript or wait for condition in session {session_id}: {str(e)}")
-
     async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
-        semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
+        semaphore_count = kwargs.get('semaphore_count', 5)  # Adjust as needed
         semaphore = asyncio.Semaphore(semaphore_count)
 
         async def crawl_with_semaphore(url):
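For reference, `crawl_many` bounds concurrency with an `asyncio.Semaphore`; a standalone sketch of the same fan-out pattern (illustration only — `fetch` is a stand-in, not the library's code):

# Sketch of the semaphore-bounded fan-out used by crawl_many.
import asyncio

async def fetch(url: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for a real crawl
    return f"<html>{url}</html>"

async def crawl_many(urls, semaphore_count: int = 5):
    semaphore = asyncio.Semaphore(semaphore_count)

    async def crawl_with_semaphore(url):
        async with semaphore:  # at most semaphore_count concurrent crawls
            return await fetch(url)

    tasks = [crawl_with_semaphore(u) for u in urls]
    # return_exceptions=True keeps one failure from cancelling the whole batch;
    # failures come back as exception objects, stringified by the caller above.
    return await asyncio.gather(*tasks, return_exceptions=True)

print(asyncio.run(crawl_many([f"https://example.com/{i}" for i in range(10)])))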
@@ -526,7 +548,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         results = await asyncio.gather(*tasks, return_exceptions=True)
         return [result if not isinstance(result, Exception) else str(result) for result in results]
 
-    async def take_screenshot(self, url: str, wait_time = 1000) -> str:
+    async def take_screenshot(self, url: str, wait_time=1000) -> str:
         async with await self.browser.new_context(user_agent=self.user_agent) as context:
             page = await context.new_page()
             try:
@@ -549,4 +571,5 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 img.save(buffered, format="JPEG")
                 return base64.b64encode(buffered.getvalue()).decode('utf-8')
             finally:
-            await page.close()
+                await page.close()
+
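`take_screenshot` returns the screenshot as a base64-encoded JPEG string, as shown in the hunk above. A small sketch of decoding that string back into an image on the caller's side (the helper name is ours, not part of the library):

# Sketch: decoding the base64 JPEG returned by take_screenshot.
import base64
from io import BytesIO
from PIL import Image

def save_screenshot(b64_data: str, path: str = "screenshot.jpg") -> None:
    img = Image.open(BytesIO(base64.b64decode(b64_data)))
    img.save(path, format="JPEG")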