From 768aa06ceb45682eb62e148526b3c5c755be0146 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 17 Oct 2024 21:37:48 +0800 Subject: [PATCH] feat(crawler): Enhance stealth and flexibility, improve error handling - Implement playwright_stealth for better bot detection avoidance - Add user simulation and navigator override options - Improve iframe processing and browser selection - Enhance error reporting and debugging capabilities - Optimize image processing and parallel crawling - Add new example for user simulation feature - Added support for including links in Markdown content, by defining a new flag `include_links_on_markdown` in `crawl` method. --- .gitignore | 2 + CHANGELOG.md | 47 ++ crawl4ai/async_crawler_strategy copy.py | 558 ++++++++++++++++++++++++ crawl4ai/async_crawler_strategy.py | 221 +++++----- crawl4ai/async_webcrawler.py | 1 + crawl4ai/content_scrapping_strategy.py | 36 +- crawl4ai/utils.py | 2 +- docs/examples/quickstart_async.py | 12 + 8 files changed, 777 insertions(+), 102 deletions(-) create mode 100644 crawl4ai/async_crawler_strategy copy.py diff --git a/.gitignore b/.gitignore index 8b8f014c..1793e24c 100644 --- a/.gitignore +++ b/.gitignore @@ -202,5 +202,7 @@ todo.md git_changes.py git_changes.md pypi_build.sh +git_issues.py +git_issues.md .tests/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index a377d794..8b0513d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,52 @@ # Changelog +## [v0.3.7] - 2024-10-17 + +### New Features +1. **Enhanced Browser Stealth**: + - Implemented `playwright_stealth` for improved bot detection avoidance. + - Added `StealthConfig` for fine-tuned control over stealth parameters. + +2. **User Simulation**: + - New `simulate_user` option to mimic human-like interactions (mouse movements, clicks, keyboard presses). + +3. **Navigator Override**: + - Added `override_navigator` option to modify navigator properties, further improving bot detection evasion. + +4. 
**Improved iframe Handling**: + - New `process_iframes` parameter to extract and integrate iframe content into the main page. + +5. **Flexible Browser Selection**: + - Support for choosing between Chromium, Firefox, and WebKit browsers. + +6. **Include Links in Markdown**: + - Added support for including links in Markdown content, by defining a new flag `include_links_on_markdown` in `crawl` method. + +### Improvements +1. **Better Error Handling**: + - Enhanced error reporting in WebScrappingStrategy with detailed error messages and suggestions. + - Added console message and error logging for better debugging. + +2. **Image Processing Enhancements**: + - Improved image dimension updating and filtering logic. + +3. **Crawling Flexibility**: + - Added support for custom viewport sizes. + - Implemented delayed content retrieval with `delay_before_return_html` parameter. + +4. **Performance Optimization**: + - Adjusted default semaphore count for parallel crawling. + +### Bug Fixes +- Fixed an issue where the HTML content could be empty after processing. + +### Examples +- Added new example `crawl_with_user_simulation()` demonstrating the use of user simulation and navigator override features. + +### Developer Notes +- Refactored code for better maintainability and readability. +- Updated browser launch arguments for improved compatibility and performance. + ## [v0.3.6] - 2024-10-12 ### 1. 
Improved Crawling Control diff --git a/crawl4ai/async_crawler_strategy copy.py b/crawl4ai/async_crawler_strategy copy.py new file mode 100644 index 00000000..507c9247 --- /dev/null +++ b/crawl4ai/async_crawler_strategy copy.py @@ -0,0 +1,558 @@ +import asyncio +import base64 +import time +from abc import ABC, abstractmethod +from typing import Callable, Dict, Any, List, Optional, Awaitable +import os +from playwright.async_api import async_playwright, Page, Browser, Error +from io import BytesIO +from PIL import Image, ImageDraw, ImageFont +from pathlib import Path +from playwright.async_api import ProxySettings +from pydantic import BaseModel +import hashlib +import json +import uuid +from playwright_stealth import stealth_async + +class AsyncCrawlResponse(BaseModel): + html: str + response_headers: Dict[str, str] + status_code: int + screenshot: Optional[str] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + + class Config: + arbitrary_types_allowed = True + +class AsyncCrawlerStrategy(ABC): + @abstractmethod + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + pass + + @abstractmethod + async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: + pass + + @abstractmethod + async def take_screenshot(self, url: str) -> str: + pass + + @abstractmethod + def update_user_agent(self, user_agent: str): + pass + + @abstractmethod + def set_hook(self, hook_type: str, hook: Callable): + pass + +class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + def __init__(self, use_cached_html=False, js_code=None, **kwargs): + self.use_cached_html = use_cached_html + self.user_agent = kwargs.get( + "user_agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) + self.proxy = kwargs.get("proxy") + self.headless = kwargs.get("headless", True) + self.browser_type = kwargs.get("browser_type", "chromium") + self.headers 
= kwargs.get("headers", {}) + self.sessions = {} + self.session_ttl = 1800 + self.js_code = js_code + self.verbose = kwargs.get("verbose", False) + self.playwright = None + self.browser = None + self.hooks = { + 'on_browser_created': None, + 'on_user_agent_updated': None, + 'on_execution_started': None, + 'before_goto': None, + 'after_goto': None, + 'before_return_html': None, + 'before_retrieve_html': None + } + + async def __aenter__(self): + await self.start() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + + async def start(self): + if self.playwright is None: + self.playwright = await async_playwright().start() + if self.browser is None: + browser_args = { + "headless": self.headless, + "args": [ + "--disable-gpu", + "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-blink-features=AutomationControlled", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + # "--headless=new", # Use the new headless mode + ] + } + + # Add proxy settings if a proxy is specified + if self.proxy: + proxy_settings = ProxySettings(server=self.proxy) + browser_args["proxy"] = proxy_settings + + # Select the appropriate browser based on the browser_type + if self.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + await self.execute_hook('on_browser_created', self.browser) + + async def close(self): + if self.browser: + await self.browser.close() + self.browser = None + if self.playwright: + await self.playwright.stop() + self.playwright = None + + def __del__(self): + if self.browser or self.playwright: + asyncio.get_event_loop().run_until_complete(self.close()) + + def set_hook(self, hook_type: str, hook: Callable): + if 
hook_type in self.hooks: + self.hooks[hook_type] = hook + else: + raise ValueError(f"Invalid hook type: {hook_type}") + + async def execute_hook(self, hook_type: str, *args): + hook = self.hooks.get(hook_type) + if hook: + if asyncio.iscoroutinefunction(hook): + return await hook(*args) + else: + return hook(*args) + return args[0] if args else None + + def update_user_agent(self, user_agent: str): + self.user_agent = user_agent + + def set_custom_headers(self, headers: Dict[str, str]): + self.headers = headers + + async def kill_session(self, session_id: str): + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + await context.close() + del self.sessions[session_id] + + def _cleanup_expired_sessions(self): + current_time = time.time() + expired_sessions = [ + sid for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self.kill_session(sid)) + + async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): + wait_for = wait_for.strip() + + if wait_for.startswith('js:'): + # Explicitly specified JavaScript + js_code = wait_for[3:].strip() + return await self.csp_compliant_wait(page, js_code, timeout) + elif wait_for.startswith('css:'): + # Explicitly specified CSS selector + css_selector = wait_for[4:].strip() + try: + await page.wait_for_selector(css_selector, timeout=timeout) + except Error as e: + if 'Timeout' in str(e): + raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'") + else: + raise ValueError(f"Invalid CSS selector: '{css_selector}'") + else: + # Auto-detect based on content + if wait_for.startswith('()') or wait_for.startswith('function'): + # It's likely a JavaScript function + return await self.csp_compliant_wait(page, wait_for, timeout) + else: + # Assume it's a CSS selector first + try: + await page.wait_for_selector(wait_for, timeout=timeout) + 
except Error as e: + if 'Timeout' in str(e): + raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'") + else: + # If it's not a timeout error, it might be an invalid selector + # Let's try to evaluate it as a JavaScript function as a fallback + try: + return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) + except Error: + raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. " + "It should be either a valid CSS selector, a JavaScript function, " + "or explicitly prefixed with 'js:' or 'css:'.") + + async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): + wrapper_js = f""" + async () => {{ + const userFunction = {user_wait_function}; + const startTime = Date.now(); + while (true) {{ + if (await userFunction()) {{ + return true; + }} + if (Date.now() - startTime > {timeout}) {{ + throw new Error('Timeout waiting for condition'); + }} + await new Promise(resolve => setTimeout(resolve, 100)); + }} + }} + """ + + try: + await page.evaluate(wrapper_js) + except TimeoutError: + raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") + except Exception as e: + raise RuntimeError(f"Error in wait condition: {str(e)}") + + async def process_iframes(self, page): + # Find all iframes + iframes = await page.query_selector_all('iframe') + + for i, iframe in enumerate(iframes): + try: + # Add a unique identifier to the iframe + await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') + + # Get the frame associated with this iframe + frame = await iframe.content_frame() + + if frame: + # Wait for the frame to load + await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout + + # Extract the content of the iframe's body + iframe_content = await frame.evaluate('() => document.body.innerHTML') + + # Generate a unique class name for this iframe + class_name = f'extracted-iframe-content-{i}' + + # Replace the iframe with a div containing the 
extracted content + _iframe = iframe_content.replace('`', '\\`') + await page.evaluate(f""" + () => {{ + const iframe = document.getElementById('iframe-{i}'); + const div = document.createElement('div'); + div.innerHTML = `{_iframe}`; + div.className = '{class_name}'; + iframe.replaceWith(div); + }} + """) + else: + print(f"Warning: Could not access content frame for iframe {i}") + except Exception as e: + print(f"Error processing iframe {i}: {str(e)}") + + # Return the page object + return page + + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + response_headers = {} + status_code = None + + self._cleanup_expired_sessions() + session_id = kwargs.get("session_id") + if session_id: + context, page, _ = self.sessions.get(session_id, (None, None, None)) + if not context: + context = await self.browser.new_context( + user_agent=self.user_agent, + viewport={"width": 1920, "height": 1080}, + proxy={"server": self.proxy} if self.proxy else None + ) + await context.set_extra_http_headers(self.headers) + page = await context.new_page() + self.sessions[session_id] = (context, page, time.time()) + else: + context = await self.browser.new_context( + user_agent=self.user_agent, + viewport={"width": 1920, "height": 1080}, + proxy={"server": self.proxy} if self.proxy else None + ) + await context.set_extra_http_headers(self.headers) + + if kwargs.get("override_navigator", False): + # Inject scripts to override navigator properties + await context.add_init_script(""" + // Pass the Permissions Test. + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? 
+ Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + window.navigator.chrome = { + runtime: {}, + // Add other properties if necessary + }; + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'], + }); + Object.defineProperty(document, 'hidden', { + get: () => false + }); + Object.defineProperty(document, 'visibilityState', { + get: () => 'visible' + }); + """) + + page = await context.new_page() + + try: + if self.verbose: + print(f"[LOG] πŸ•ΈοΈ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") + + if self.use_cached_html: + cache_file_path = os.path.join( + Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + ) + if os.path.exists(cache_file_path): + html = "" + with open(cache_file_path, "r") as f: + html = f.read() + # retrieve response headers and status code from cache + with open(cache_file_path + ".meta", "r") as f: + meta = json.load(f) + response_headers = meta.get("response_headers", {}) + status_code = meta.get("status_code") + response = AsyncCrawlResponse( + html=html, response_headers=response_headers, status_code=status_code + ) + return response + + if not kwargs.get("js_only", False): + await self.execute_hook('before_goto', page) + + response = await page.goto("about:blank") + await stealth_async(page) + response = await page.goto( + url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000) + ) + + # await stealth_async(page) + # response = await page.goto("about:blank") + # await stealth_async(page) + # await page.evaluate(f"window.location.href = '{url}'") + + await self.execute_hook('after_goto', page) + + # Get status code and headers + status_code = response.status + response_headers = response.headers + else: + status_code = 200 + response_headers = {} + + await 
page.wait_for_selector('body') + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + + js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) + if js_code: + if isinstance(js_code, str): + await page.evaluate(js_code) + elif isinstance(js_code, list): + for js in js_code: + await page.evaluate(js) + + await page.wait_for_load_state('networkidle') + # Check for on execution event + await self.execute_hook('on_execution_started', page) + + if kwargs.get("simulate_user", False): + # Simulate user interactions + await page.mouse.move(100, 100) + await page.mouse.down() + await page.mouse.up() + await page.keyboard.press('ArrowDown') + + # Handle the wait_for parameter + wait_for = kwargs.get("wait_for") + if wait_for: + try: + await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) + except Exception as e: + raise RuntimeError(f"Wait condition failed: {str(e)}") + + + + # Update image dimensions + update_image_dimensions_js = """ + () => { + return new Promise((resolve) => { + const filterImage = (img) => { + // Filter out images that are too small + if (img.width < 100 && img.height < 100) return false; + + // Filter out images that are not visible + const rect = img.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) return false; + + // Filter out images with certain class names (e.g., icons, thumbnails) + if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; + + // Filter out images with certain patterns in their src (e.g., placeholder images) + if (img.src.includes('placeholder') || img.src.includes('icon')) return false; + + return true; + }; + + const images = Array.from(document.querySelectorAll('img')).filter(filterImage); + let imagesLeft = images.length; + + if (imagesLeft === 0) { + resolve(); + return; + } + + const checkImage = (img) => { + if (img.complete && img.naturalWidth !== 0) { + img.setAttribute('width', img.naturalWidth); + 
img.setAttribute('height', img.naturalHeight); + imagesLeft--; + if (imagesLeft === 0) resolve(); + } + }; + + images.forEach(img => { + checkImage(img); + if (!img.complete) { + img.onload = () => { + checkImage(img); + }; + img.onerror = () => { + imagesLeft--; + if (imagesLeft === 0) resolve(); + }; + } + }); + + // Fallback timeout of 5 seconds + setTimeout(() => resolve(), 5000); + }); + } + """ + await page.evaluate(update_image_dimensions_js) + + # Wait a bit for any onload events to complete + await page.wait_for_timeout(100) + + # Process iframes + if kwargs.get("process_iframes", False): + page = await self.process_iframes(page) + + await self.execute_hook('before_retrieve_html', page) + # Check if delay_before_return_html is set then wait for that time + delay_before_return_html = kwargs.get("delay_before_return_html") + if delay_before_return_html: + await asyncio.sleep(delay_before_return_html) + + html = await page.content() + await self.execute_hook('before_return_html', page, html) + + # Check if kwargs has screenshot=True then take screenshot + screenshot_data = None + if kwargs.get("screenshot"): + screenshot_data = await self.take_screenshot(url) + + if self.verbose: + print(f"[LOG] βœ… Crawled {url} successfully!") + + if self.use_cached_html: + cache_file_path = os.path.join( + Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + ) + with open(cache_file_path, "w", encoding="utf-8") as f: + f.write(html) + # store response headers and status code in cache + with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: + json.dump({ + "response_headers": response_headers, + "status_code": status_code + }, f) + + async def get_delayed_content(delay: float = 5.0) -> str: + if self.verbose: + print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") + await asyncio.sleep(delay) + return await page.content() + + response = AsyncCrawlResponse( + html=html, + response_headers=response_headers, + 
status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=get_delayed_content + ) + return response + except Error as e: + raise Error(f"Failed to crawl {url}: {str(e)}") + finally: + if not session_id: + await page.close() + await context.close() + + async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: + semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed + semaphore = asyncio.Semaphore(semaphore_count) + + async def crawl_with_semaphore(url): + async with semaphore: + return await self.crawl(url, **kwargs) + + tasks = [crawl_with_semaphore(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + return [result if not isinstance(result, Exception) else str(result) for result in results] + + async def take_screenshot(self, url: str, wait_time=1000) -> str: + async with await self.browser.new_context(user_agent=self.user_agent) as context: + page = await context.new_page() + try: + await page.goto(url, wait_until="domcontentloaded", timeout=30000) + # Wait for a specified time (default is 1 second) + await page.wait_for_timeout(wait_time) + screenshot = await page.screenshot(full_page=True) + return base64.b64encode(screenshot).decode('utf-8') + except Exception as e: + error_message = f"Failed to take screenshot: {str(e)}" + print(error_message) + + # Generate an error image + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + finally: + await page.close() + diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index e9699953..d4c94fee 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1,17 +1,35 @@ import asyncio -import base64, time +import 
base64 +import time from abc import ABC, abstractmethod from typing import Callable, Dict, Any, List, Optional, Awaitable import os from playwright.async_api import async_playwright, Page, Browser, Error from io import BytesIO from PIL import Image, ImageDraw, ImageFont -from .utils import sanitize_input_encode, calculate_semaphore_count -import json, uuid -import hashlib from pathlib import Path from playwright.async_api import ProxySettings from pydantic import BaseModel +import hashlib +import json +import uuid +from playwright_stealth import StealthConfig, stealth_async + +stealth_config = StealthConfig( + webdriver=True, + chrome_app=True, + chrome_csi=True, + chrome_load_times=True, + chrome_runtime=True, + navigator_languages=True, + navigator_plugins=True, + navigator_permissions=True, + webgl_vendor=True, + outerdimensions=True, + navigator_hardware_concurrency=True, + media_codecs=True, +) + class AsyncCrawlResponse(BaseModel): html: str @@ -47,10 +65,14 @@ class AsyncCrawlerStrategy(ABC): class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): def __init__(self, use_cached_html=False, js_code=None, **kwargs): self.use_cached_html = use_cached_html - self.user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") + self.user_agent = kwargs.get( + "user_agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) self.proxy = kwargs.get("proxy") self.headless = kwargs.get("headless", True) - self.browser_type = kwargs.get("browser_type", "chromium") # New parameter + self.browser_type = kwargs.get("browser_type", "chromium") self.headers = kwargs.get("headers", {}) self.sessions = {} self.session_ttl = 1800 @@ -83,9 +105,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "headless": self.headless, "args": [ "--disable-gpu", - "--disable-dev-shm-usage", - 
"--disable-setuid-sandbox", "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-blink-features=AutomationControlled", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + # "--headless=new", # Use the new headless mode ] } @@ -94,7 +121,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): proxy_settings = ProxySettings(server=self.proxy) browser_args["proxy"] = proxy_settings - # Select the appropriate browser based on the browser_type if self.browser_type == "firefox": self.browser = await self.playwright.firefox.launch(**browser_args) @@ -147,8 +173,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): def _cleanup_expired_sessions(self): current_time = time.time() - expired_sessions = [sid for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl] + expired_sessions = [ + sid for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] for sid in expired_sessions: asyncio.create_task(self.kill_session(sid)) @@ -188,8 +216,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) except Error: raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. 
" - "It should be either a valid CSS selector, a JavaScript function, " - "or explicitly prefixed with 'js:' or 'css:'.") + "It should be either a valid CSS selector, a JavaScript function, " + "or explicitly prefixed with 'js:' or 'css:'.") async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): wrapper_js = f""" @@ -254,8 +282,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): print(f"Error processing iframe {i}: {str(e)}") # Return the page object - return page - + return page async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: response_headers = {} @@ -268,6 +295,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if not context: context = await self.browser.new_context( user_agent=self.user_agent, + viewport={"width": 1920, "height": 1080}, proxy={"server": self.proxy} if self.proxy else None ) await context.set_extra_http_headers(self.headers) @@ -275,18 +303,58 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.sessions[session_id] = (context, page, time.time()) else: context = await self.browser.new_context( - user_agent=self.user_agent, - proxy={"server": self.proxy} if self.proxy else None + user_agent=self.user_agent, + viewport={"width": 1920, "height": 1080}, + proxy={"server": self.proxy} if self.proxy else None ) await context.set_extra_http_headers(self.headers) + + if kwargs.get("override_navigator", False): + # Inject scripts to override navigator properties + await context.add_init_script(""" + // Pass the Permissions Test. + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? 
+ Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + window.navigator.chrome = { + runtime: {}, + // Add other properties if necessary + }; + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'], + }); + Object.defineProperty(document, 'hidden', { + get: () => false + }); + Object.defineProperty(document, 'visibilityState', { + get: () => 'visible' + }); + """) + page = await context.new_page() + # await stealth_async(page) #, stealth_config) + # Add console message and error logging + page.on("console", lambda msg: print(f"Console: {msg.text}")) + page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) + try: if self.verbose: print(f"[LOG] πŸ•ΈοΈ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") if self.use_cached_html: - cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()) + cache_file_path = os.path.join( + Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + ) if os.path.exists(cache_file_path): html = "" with open(cache_file_path, "r") as f: @@ -296,12 +364,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): meta = json.load(f) response_headers = meta.get("response_headers", {}) status_code = meta.get("status_code") - response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code) + response = AsyncCrawlResponse( + html=html, response_headers=response_headers, status_code=status_code + ) return response if not kwargs.get("js_only", False): await self.execute_hook('before_goto', page) - response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)) + + response = await page.goto( + url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000) + ) 
+ + # response = await page.goto("about:blank") + # await page.evaluate(f"window.location.href = '{url}'") + await self.execute_hook('after_goto', page) # Get status code and headers @@ -311,37 +388,29 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): status_code = 200 response_headers = {} - await page.wait_for_selector('body') await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) if js_code: if isinstance(js_code, str): - r = await page.evaluate(js_code) + await page.evaluate(js_code) elif isinstance(js_code, list): for js in js_code: await page.evaluate(js) - # await page.wait_for_timeout(100) await page.wait_for_load_state('networkidle') - # Check for on execution even + # Check for on execution event await self.execute_hook('on_execution_started', page) - # New code to handle the wait_for parameter - # Example usage: - # await crawler.crawl( - # url, - # js_code="// some JavaScript code", - # wait_for="""() => { - # return document.querySelector('#my-element') !== null; - # }""" - # ) - # Example of using a CSS selector: - # await crawler.crawl( - # url, - # wait_for="#my-element" - # ) + if kwargs.get("simulate_user", False): + # Simulate user interactions + await page.mouse.move(100, 100) + await page.mouse.down() + await page.mouse.up() + await page.keyboard.press('ArrowDown') + + # Handle the wait_for parameter wait_for = kwargs.get("wait_for") if wait_for: try: @@ -349,13 +418,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") - # Check if kwargs has screenshot=True then take screenshot - screenshot_data = None - if kwargs.get("screenshot"): - screenshot_data = await self.take_screenshot(url) + - - # New code to update image dimensions + # Update image dimensions update_image_dimensions_js = """ () => { return new Promise((resolve) => { @@ -428,12 +493,19 @@ class 
AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): html = await page.content() await self.execute_hook('before_return_html', page, html) + + # Check if kwargs has screenshot=True then take screenshot + screenshot_data = None + if kwargs.get("screenshot"): + screenshot_data = await self.take_screenshot(url) if self.verbose: print(f"[LOG] βœ… Crawled {url} successfully!") if self.use_cached_html: - cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()) + cache_file_path = os.path.join( + Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + ) with open(cache_file_path, "w", encoding="utf-8") as f: f.write(html) # store response headers and status code in cache @@ -443,7 +515,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "status_code": status_code }, f) - async def get_delayed_content(delay: float = 5.0) -> str: if self.verbose: print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") @@ -463,59 +534,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): finally: if not session_id: await page.close() + await context.close() - # try: - # html = await _crawl() - # return sanitize_input_encode(html) - # except Error as e: - # raise Error(f"Failed to crawl {url}: {str(e)}") - # except Exception as e: - # raise Exception(f"Failed to crawl {url}: {str(e)}") - - async def execute_js(self, session_id: str, js_code: str, wait_for_js: str = None, wait_for_css: str = None) -> AsyncCrawlResponse: - """ - Execute JavaScript code in a specific session and optionally wait for a condition. - - :param session_id: The ID of the session to execute the JS code in. - :param js_code: The JavaScript code to execute. - :param wait_for_js: JavaScript condition to wait for after execution. - :param wait_for_css: CSS selector to wait for after execution. - :return: AsyncCrawlResponse containing the page's HTML and other information. 
- :raises ValueError: If the session does not exist. - """ - if not session_id: - raise ValueError("Session ID must be provided") - - if session_id not in self.sessions: - raise ValueError(f"No active session found for session ID: {session_id}") - - context, page, last_used = self.sessions[session_id] - - try: - await page.evaluate(js_code) - - if wait_for_js: - await page.wait_for_function(wait_for_js) - - if wait_for_css: - await page.wait_for_selector(wait_for_css) - - # Get the updated HTML content - html = await page.content() - - # Get response headers and status code (assuming these are available) - response_headers = await page.evaluate("() => JSON.stringify(performance.getEntriesByType('resource')[0].responseHeaders)") - status_code = await page.evaluate("() => performance.getEntriesByType('resource')[0].responseStatus") - - # Update the last used time for this session - self.sessions[session_id] = (context, page, time.time()) - - return AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code) - except Error as e: - raise Error(f"Failed to execute JavaScript or wait for condition in session {session_id}: {str(e)}") - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count()) + semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed semaphore = asyncio.Semaphore(semaphore_count) async def crawl_with_semaphore(url): @@ -526,7 +548,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): results = await asyncio.gather(*tasks, return_exceptions=True) return [result if not isinstance(result, Exception) else str(result) for result in results] - async def take_screenshot(self, url: str, wait_time = 1000) -> str: + async def take_screenshot(self, url: str, wait_time=1000) -> str: async with await self.browser.new_context(user_agent=self.user_agent) as context: page = await context.new_page() try: @@ -549,4 
+571,5 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): img.save(buffered, format="JPEG") return base64.b64encode(buffered.getvalue()).decode('utf-8') finally: - await page.close() \ No newline at end of file + await page.close() + diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ba82d28f..76846fe9 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -195,6 +195,7 @@ class AsyncWebCrawler: image_description_min_word_threshold=kwargs.get( "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD ), + **kwargs, ) if verbose: print( diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index 68f03412..64707f74 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -33,6 +33,7 @@ class WebScrappingStrategy(ContentScrappingStrategy): return await asyncio.to_thread(self._get_content_of_website_optimized, url, html, **kwargs) def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]: + success = True if not html: return None @@ -273,10 +274,41 @@ class WebScrappingStrategy(ContentScrappingStrategy): if base64_pattern.match(src): # Replace base64 data with empty string img['src'] = base64_pattern.sub('', src) + + try: + str(body) + except Exception as e: + # Reset body to the original HTML + success = False + body = BeautifulSoup(html, 'html.parser') + + # Create a new div with a special ID + error_div = body.new_tag('div', id='crawl4ai_error_message') + error_div.string = ''' + Crawl4AI Error: This page is not fully supported. + + Possible reasons: + 1. The page may have restrictions that prevent crawling. + 2. The page might not be fully loaded. 
+ + Suggestions: + - Try calling the crawl function with these parameters: + simulate_user=True, override_navigator=True + - Set headless=False to visualize what's happening on the page. + + If the issue persists, please check the page's structure and any potential anti-crawling measures. + ''' + + # Append the error div to the body + body.body.append(error_div) + + print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.") + + cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') h = CustomHTML2Text() - h.ignore_links = True + h.ignore_links = not kwargs.get('include_links_on_markdown', False) h.body_width = 0 try: markdown = h.handle(cleaned_html) @@ -294,7 +326,7 @@ class WebScrappingStrategy(ContentScrappingStrategy): return { 'markdown': markdown, 'cleaned_html': cleaned_html, - 'success': True, + 'success': success, 'media': media, 'links': links, 'metadata': meta diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index efb5d79b..711fd2c4 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -692,8 +692,8 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: for img in imgs: src = img.get('src', '') if base64_pattern.match(src): - # Replace base64 data with empty string img['src'] = base64_pattern.sub('', src) + cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') cleaned_html = sanitize_html(cleaned_html) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index f6c16a4e..a3837406 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -379,6 +379,18 @@ async def crawl_custom_browser_type(): print(result.markdown[:500]) print("Time taken: ", time.time() - start) +async def crawl_with_user_simulation(): + async with AsyncWebCrawler(verbose=True, headless=True) as crawler: + url = "YOUR-URL-HERE" + result = await crawler.arun( + url=url, +
bypass_cache=True, + simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction + override_navigator = True # Overrides the navigator object to make it look like a real user + ) + + print(result.markdown) + async def speed_comparison(): # print("\n--- Speed Comparison ---") # print("Firecrawl (simulated):")