From 5431fa2d0ce78cf933786d2817690d0681583772 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 10 Dec 2024 20:10:39 +0800 Subject: [PATCH] Add PDF & screenshot functionality, new tutorial - Added support for exporting pages as PDFs - Enhanced screenshot functionality for long pages - Created a tutorial on dynamic content loading with 'Load More' buttons. - Updated web crawler to handle PDF data in responses. --- crawl4ai/async_crawler_strategy.py | 146 +++++++++++++++++++++-- crawl4ai/async_webcrawler.py | 12 ++ crawl4ai/config.py | 3 +- crawl4ai/models.py | 2 + docs/examples/tutorial_dynamic_clicks.md | 117 ++++++++++++++++++ 5 files changed, 271 insertions(+), 9 deletions(-) create mode 100644 docs/examples/tutorial_dynamic_clicks.md diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 1d88c3a8..553e9df4 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -19,8 +19,14 @@ from .js_snippet import load_js_script from .models import AsyncCrawlResponse from .utils import create_box_message from .user_agent_generator import UserAgentGenerator +from .config import SCREENSHOT_HEIGHT_TRESHOLD from playwright_stealth import StealthConfig, stealth_async + +from io import BytesIO +import base64 +from PIL import Image, ImageDraw, ImageFont + stealth_config = StealthConfig( webdriver=True, chrome_app=True, @@ -481,6 +487,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.user_agent = user_agenr_generator.generate( **kwargs.get("user_agent_generator_config", {}) ) + self.pdf = kwargs.get("pdf", False) # New flag + self.screenshot_requested = kwargs.get('screenshot', False) + self.proxy = kwargs.get("proxy") self.proxy_config = kwargs.get("proxy_config") self.headless = kwargs.get("headless", True) @@ -752,7 +761,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ response_headers = {} status_code = 200 # Default to 200 for local/raw HTML - screenshot_requested = 
kwargs.get('screenshot', False) + screenshot_requested = kwargs.get("screenshot", self.screenshot_requested) + pdf_requested = kwargs.get("pdf", self.pdf) screenshot_data = None if url.startswith(('http://', 'https://')): @@ -796,6 +806,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers = {} status_code = None + screenshot_requested = kwargs.get("screenshot", self.screenshot_requested) + pdf_requested = kwargs.get("pdf", self.pdf) + # Reset downloaded files list for new crawl self._downloaded_files = [] @@ -1069,17 +1082,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): html = await page.content() await self.execute_hook('before_return_html', page, html, context = context, **kwargs) + start_export_time = time.perf_counter() + pdf_data = None + if pdf_requested: + # Generate PDF once + pdf_data = await self.export_pdf(page) + # Check if kwargs has screenshot=True then take screenshot screenshot_data = None - if kwargs.get("screenshot"): + if screenshot_requested: #kwargs.get("screenshot"): # Check we have screenshot_wait_for parameter, if we have simply wait for that time screenshot_wait_for = kwargs.get("screenshot_wait_for") if screenshot_wait_for: await asyncio.sleep(screenshot_wait_for) - screenshot_data = await self.take_screenshot(page) - - # if self.verbose: - # print(f"[LOG] ✅ Crawled {url} successfully!") + + screenshot_data = await self.take_screenshot(page, **kwargs) + end_export_time = time.perf_counter() + if screenshot_data or pdf_data: + self.logger.info( + message="Exporting PDF and taking screenshot took {duration:.2f}s", + tag="EXPORT", + params={"duration": end_export_time - start_export_time} + ) if self.use_cached_html: cache_file_path = os.path.join( @@ -1105,6 +1129,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, + pdf_data=pdf_data, get_delayed_content=get_delayed_content, 
downloaded_files=self._downloaded_files if self._downloaded_files else None ) @@ -1181,7 +1206,112 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # if self.verbose: # print(f"Warning: Failed to remove overlay elements: {str(e)}") - async def take_screenshot(self, page: Page) -> str: + async def export_pdf(self, page: Page) -> bytes: + """ + Exports the current page as a PDF. + """ + pdf_data = await page.pdf(print_background=True) + return pdf_data + + async def take_screenshot(self, page, **kwargs) -> str: + page_height = await page.evaluate("document.documentElement.scrollHeight") + if page_height < kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD): + # Page is short enough, just take a screenshot + return await self.take_screenshot_naive(page) + else: + # Page is too long, try to take a full-page screenshot + return await self.take_screenshot_scroller(page, **kwargs) + # return await self.take_screenshot_from_pdf(await self.export_pdf(page)) + + async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str: + """ + Convert the first page of the PDF to a screenshot. + Requires pdf2image and poppler. 
+ """ + try: + from pdf2image import convert_from_bytes + images = convert_from_bytes(pdf_data) + final_img = images[0].convert('RGB') + buffered = BytesIO() + final_img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + except Exception as e: + error_message = f"Failed to take PDF-based screenshot: {str(e)}" + self.logger.error( + message="PDF Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) + # Return error image as fallback + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + + async def take_screenshot_scroller(self, page: Page, **kwargs) -> str: + """ + Attempt to set a large viewport and take a full-page screenshot. + If still too large, segment the page as before. 
+ """ + try: + # Get page height + page_height = await page.evaluate("document.documentElement.scrollHeight") + page_width = await page.evaluate("document.documentElement.scrollWidth") + + # Set a large viewport + large_viewport_height = min(page_height, kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD)) + await page.set_viewport_size({"width": page_width, "height": large_viewport_height}) + + # Page still too long, segment approach + segments = [] + viewport_size = page.viewport_size + viewport_height = viewport_size["height"] + + num_segments = (page_height // viewport_height) + 1 + for i in range(num_segments): + y_offset = i * viewport_height + await page.evaluate(f"window.scrollTo(0, {y_offset})") + await asyncio.sleep(0.01) # wait for render + seg_shot = await page.screenshot(full_page=False) + img = Image.open(BytesIO(seg_shot)).convert('RGB') + segments.append(img) + + total_height = sum(img.height for img in segments) + stitched = Image.new('RGB', (segments[0].width, total_height)) + offset = 0 + for img in segments: + # stitched.paste(img, (0, offset)) + stitched.paste(img.convert('RGB'), (0, offset)) + offset += img.height + + buffered = BytesIO() + stitched = stitched.convert('RGB') + stitched.save(buffered, format="BMP", quality=85) + encoded = base64.b64encode(buffered.getvalue()).decode('utf-8') + + return encoded + except Exception as e: + error_message = f"Failed to take large viewport screenshot: {str(e)}" + self.logger.error( + message="Large viewport screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) + # return error image + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + finally: + await page.close() + + async def 
take_screenshot_naive(self, page: Page) -> str: """ Takes a screenshot of the current page. @@ -1193,7 +1323,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ try: # The page is already loaded, just take the screenshot - screenshot = await page.screenshot(full_page=True) + screenshot = await page.screenshot(full_page=False) return base64.b64encode(screenshot).decode('utf-8') except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 1a4b1333..fc6fe82f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -147,6 +147,7 @@ class AsyncWebCrawler: # Other parameters css_selector: str = None, screenshot: bool = False, + pdf: bool = False, user_agent: str = None, verbose=True, **kwargs, @@ -230,6 +231,7 @@ class AsyncWebCrawler: async_response: AsyncCrawlResponse = None cached_result = None screenshot_data = None + pdf_data = None extracted_content = None start_time = time.perf_counter() @@ -245,6 +247,10 @@ class AsyncWebCrawler: screenshot_data = cached_result.screenshot if not screenshot_data: cached_result = None + if pdf: + pdf_data = cached_result.pdf + if not pdf_data: + cached_result = None # if verbose: # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") self.logger.url_status( @@ -264,10 +270,12 @@ class AsyncWebCrawler: async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl( url, screenshot=screenshot, + pdf=pdf, **kwargs ) html = sanitize_input_encode(async_response.html) screenshot_data = async_response.screenshot + pdf_data = async_response.pdf_data t2 = time.perf_counter() self.logger.url_status( url=cache_context.display_url, @@ -289,6 +297,7 @@ class AsyncWebCrawler: content_filter=content_filter, 
css_selector=css_selector, screenshot=screenshot_data, + pdf_data=pdf_data, verbose=verbose, is_cached=bool(cached_result), async_response=async_response, @@ -362,6 +371,7 @@ class AsyncWebCrawler: bypass_cache: bool = False, css_selector: str = None, screenshot: bool = False, + pdf: bool = False, user_agent: str = None, verbose=True, **kwargs, @@ -550,6 +560,7 @@ class AsyncWebCrawler: ) screenshot = None if not screenshot else screenshot + pdf_data = kwargs.get("pdf_data", None) if kwargs.get("prettiify", False): @@ -567,6 +578,7 @@ class AsyncWebCrawler: links=links, metadata=metadata, screenshot=screenshot, + pdf=pdf_data, extracted_content=extracted_content, success=True, error_message="", diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 786ca4e5..e17ff34f 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -56,4 +56,5 @@ MAX_METRICS_HISTORY = 1000 NEED_MIGRATION = True URL_LOG_SHORTEN_LENGTH = 30 -SHOW_DEPRECATION_WARNINGS = True \ No newline at end of file +SHOW_DEPRECATION_WARNINGS = True +SCREENSHOT_HEIGHT_TRESHOLD = 10000 \ No newline at end of file diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 3a1b8bd1..315069fb 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -23,6 +23,7 @@ class CrawlResult(BaseModel): links: Dict[str, List[Dict]] = {} downloaded_files: Optional[List[str]] = None screenshot: Optional[str] = None + pdf : Optional[bytes] = None markdown: Optional[Union[str, MarkdownGenerationResult]] = None markdown_v2: Optional[MarkdownGenerationResult] = None fit_markdown: Optional[str] = None @@ -39,6 +40,7 @@ class AsyncCrawlResponse(BaseModel): response_headers: Dict[str, str] status_code: int screenshot: Optional[str] = None + pdf_data: Optional[bytes] = None get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None downloaded_files: Optional[List[str]] = None diff --git a/docs/examples/tutorial_dynamic_clicks.md b/docs/examples/tutorial_dynamic_clicks.md new file mode 100644 index 
00000000..d9669952 --- /dev/null +++ b/docs/examples/tutorial_dynamic_clicks.md @@ -0,0 +1,117 @@ +# Tutorial: Clicking Buttons to Load More Content with Crawl4AI + +## Introduction + +When scraping dynamic websites, it’s common to encounter “Load More” or “Next” buttons that must be clicked to reveal new content. Crawl4AI provides a straightforward way to handle these situations using JavaScript execution and waiting conditions. In this tutorial, we’ll cover two approaches: + +1. **Step-by-Step (Session-based) Approach:** Multiple calls to `arun()` to progressively load more content. +2. **Single-call Approach:** Execute a more complex JavaScript snippet inside a single `arun()` call to handle all clicks at once before the extraction. + +## Prerequisites + +- A working installation of Crawl4AI +- Basic familiarity with Python’s `async`/`await` syntax + +## Step-by-Step Approach + +Use a session ID to maintain state across multiple `arun()` calls (the snippet uses `await`, so run it inside an `async` function, e.g. one started with `asyncio.run()`): + +```python +from crawl4ai import AsyncWebCrawler, CacheMode + +js_code = [ + # This JS finds the “Next” button and clicks it + "const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();" +] + +wait_for_condition = "css:.new-content-class" + +async with AsyncWebCrawler(headless=True, verbose=True) as crawler: + # 1. Load the initial page + result_initial = await crawler.arun( + url="https://example.com", + cache_mode=CacheMode.BYPASS, + session_id="my_session" + ) + + # 2. Click the 'Next' button and wait for new content + result_next = await crawler.arun( + url="https://example.com", + session_id="my_session", + js_code=js_code, + wait_for=wait_for_condition, + js_only=True, + cache_mode=CacheMode.BYPASS + ) + +# `result_next` now contains the updated HTML after clicking 'Next' +``` + +**Key Points:** +- **`session_id`**: Keeps the same browser context open. +- **`js_code`**: Executes JavaScript in the context of the already loaded page. 
+- **`wait_for`**: Ensures the crawler waits until new content is fully loaded. +- **`js_only=True`**: Runs the JS in the current session without reloading the page. + +By repeating the `arun()` call multiple times and modifying the `js_code` (e.g., clicking different modules or pages), you can iteratively load all the desired content. + +## Single-call Approach + +If the page allows it, you can run a single `arun()` call with a more elaborate JavaScript snippet that: +- Iterates over all the modules or "Next" buttons +- Clicks them one by one +- Waits for content updates after each click +- Returns control to Crawl4AI for extraction once all clicks are done + +Example snippet (run it inside an `async` function, since it uses `await`): + +```python +from crawl4ai import AsyncWebCrawler, CacheMode + +js_code = [ + # Example JS that clicks multiple modules: + """ + (async () => { + const modules = document.querySelectorAll('.module-item'); + for (let i = 0; i < modules.length; i++) { + modules[i].scrollIntoView(); + modules[i].click(); + // Wait for each module’s content to load, adjust 100ms as needed + await new Promise(r => setTimeout(r, 100)); + } + })(); + """ +] + +async with AsyncWebCrawler(headless=True, verbose=True) as crawler: + result = await crawler.arun( + url="https://example.com", + js_code=js_code, + wait_for="css:.final-loaded-content-class", + cache_mode=CacheMode.BYPASS + ) + +# `result` now contains all content after all modules have been clicked in one go. +``` + +**Key Points:** +- All interactions (clicks and waits) happen before the extraction. +- Ideal for pages where all steps can be done in a single pass. + +## Choosing the Right Approach + +- **Step-by-Step (Session-based)**: + - Good when you need fine-grained control or must dynamically check conditions before clicking the next page. + - Useful if the page requires multiple conditions checked at runtime. + +- **Single-call**: + - Perfect if the sequence of interactions is known in advance. 
+ - Cleaner code if the page’s structure is consistent and predictable. + +## Conclusion + +Crawl4AI makes it easy to handle dynamic content: +- Use session IDs and multiple `arun()` calls for stepwise crawling. +- Or pack all actions into one `arun()` call if the interactions are well-defined upfront. + +This flexibility ensures you can handle a wide range of dynamic web pages efficiently.