Add PDF & screenshot functionality, new tutorial
- Added support for exporting pages as PDFs - Enhanced screenshot functionality for long pages - Created a tutorial on dynamic content loading with 'Load More' buttons. - Updated web crawler to handle PDF data in responses.
This commit is contained in:
@@ -19,8 +19,14 @@ from .js_snippet import load_js_script
|
||||
from .models import AsyncCrawlResponse
|
||||
from .utils import create_box_message
|
||||
from .user_agent_generator import UserAgentGenerator
|
||||
from .config import SCREENSHOT_HEIGHT_TRESHOLD
|
||||
from playwright_stealth import StealthConfig, stealth_async
|
||||
|
||||
|
||||
from io import BytesIO
|
||||
import base64
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
stealth_config = StealthConfig(
|
||||
webdriver=True,
|
||||
chrome_app=True,
|
||||
@@ -481,6 +487,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
self.user_agent = user_agenr_generator.generate(
|
||||
**kwargs.get("user_agent_generator_config", {})
|
||||
)
|
||||
self.pdf = kwargs.get("pdf", False) # New flag
|
||||
self.screenshot_requested = kwargs.get('screenshot', False)
|
||||
|
||||
self.proxy = kwargs.get("proxy")
|
||||
self.proxy_config = kwargs.get("proxy_config")
|
||||
self.headless = kwargs.get("headless", True)
|
||||
@@ -752,7 +761,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"""
|
||||
response_headers = {}
|
||||
status_code = 200 # Default to 200 for local/raw HTML
|
||||
screenshot_requested = kwargs.get('screenshot', False)
|
||||
screenshot_requested = kwargs.get("screenshot", self.screenshot_requested)
|
||||
pdf_requested = kwargs.get("pdf", self.pdf)
|
||||
screenshot_data = None
|
||||
|
||||
if url.startswith(('http://', 'https://')):
|
||||
@@ -796,6 +806,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
response_headers = {}
|
||||
status_code = None
|
||||
|
||||
screenshot_requested = kwargs.get("screenshot", self.screenshot_requested)
|
||||
pdf_requested = kwargs.get("pdf", self.pdf)
|
||||
|
||||
# Reset downloaded files list for new crawl
|
||||
self._downloaded_files = []
|
||||
|
||||
@@ -1069,17 +1082,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
html = await page.content()
|
||||
await self.execute_hook('before_return_html', page, html, context = context, **kwargs)
|
||||
|
||||
start_export_time = time.perf_counter()
|
||||
pdf_data = None
|
||||
if pdf_requested:
|
||||
# Generate PDF once
|
||||
pdf_data = await self.export_pdf(page)
|
||||
|
||||
# Check if kwargs has screenshot=True then take screenshot
|
||||
screenshot_data = None
|
||||
if kwargs.get("screenshot"):
|
||||
if screenshot_requested: #kwargs.get("screenshot"):
|
||||
# Check we have screenshot_wait_for parameter, if we have simply wait for that time
|
||||
screenshot_wait_for = kwargs.get("screenshot_wait_for")
|
||||
if screenshot_wait_for:
|
||||
await asyncio.sleep(screenshot_wait_for)
|
||||
screenshot_data = await self.take_screenshot(page)
|
||||
|
||||
# if self.verbose:
|
||||
# print(f"[LOG] ✅ Crawled {url} successfully!")
|
||||
|
||||
screenshot_data = await self.take_screenshot(page, **kwargs)
|
||||
end_export_time = time.perf_counter()
|
||||
if screenshot_data or pdf_data:
|
||||
self.logger.info(
|
||||
message="Exporting PDF and taking screenshot took {duration:.2f}s",
|
||||
tag="EXPORT",
|
||||
params={"duration": end_export_time - start_export_time}
|
||||
)
|
||||
|
||||
if self.use_cached_html:
|
||||
cache_file_path = os.path.join(
|
||||
@@ -1105,6 +1129,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
response_headers=response_headers,
|
||||
status_code=status_code,
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
get_delayed_content=get_delayed_content,
|
||||
downloaded_files=self._downloaded_files if self._downloaded_files else None
|
||||
)
|
||||
@@ -1181,7 +1206,112 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# if self.verbose:
|
||||
# print(f"Warning: Failed to remove overlay elements: {str(e)}")
|
||||
|
||||
async def take_screenshot(self, page: Page) -> str:
|
||||
async def export_pdf(self, page: Page) -> bytes:
    """Render the current page to a PDF document.

    Args:
        page: The Playwright page to export.

    Returns:
        The raw PDF bytes, rendered with background graphics included.
    """
    # print_background=True keeps CSS backgrounds in the output.
    return await page.pdf(print_background=True)
|
||||
|
||||
async def take_screenshot(self, page, **kwargs) -> str:
    """Capture a screenshot, choosing a strategy based on page height.

    Pages shorter than the configured threshold get a single-shot
    capture; taller pages fall back to a scroll-and-stitch capture.

    Args:
        page: The Playwright page to capture.
        **kwargs: ``screenshot_height_threshold`` overrides the default
            height cutoff (``SCREENSHOT_HEIGHT_TRESHOLD``).

    Returns:
        Base64-encoded image data.
    """
    threshold = kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD)
    height = await page.evaluate("document.documentElement.scrollHeight")
    if height >= threshold:
        # Too tall for one viewport: capture in segments and stitch.
        return await self.take_screenshot_scroller(page, **kwargs)
    # Short enough for a single capture.
    return await self.take_screenshot_naive(page)
|
||||
|
||||
async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str:
    """Render the first page of a PDF as a base64-encoded JPEG.

    Requires the optional ``pdf2image`` package (plus poppler). On any
    failure, a black placeholder image carrying the error text is
    returned instead, so callers always receive image data.

    Args:
        pdf_data: Raw PDF bytes to convert.

    Returns:
        Base64-encoded JPEG data as a string.
    """
    try:
        # Imported lazily so the dependency is only needed on this path.
        from pdf2image import convert_from_bytes
        pages = convert_from_bytes(pdf_data)
        first_page = pages[0].convert('RGB')
        buf = BytesIO()
        first_page.save(buf, format="JPEG")
        return base64.b64encode(buf.getvalue()).decode('utf-8')
    except Exception as e:
        error_message = f"Failed to take PDF-based screenshot: {str(e)}"
        self.logger.error(
            message="PDF Screenshot failed: {error}",
            tag="ERROR",
            params={"error": error_message}
        )
        # Fall back to an image that displays the error text.
        fallback = Image.new('RGB', (800, 600), color='black')
        ImageDraw.Draw(fallback).text(
            (10, 10),
            error_message,
            fill=(255, 255, 255),
            font=ImageFont.load_default()
        )
        buf = BytesIO()
        fallback.save(buf, format="JPEG")
        return base64.b64encode(buf.getvalue()).decode('utf-8')
|
||||
|
||||
async def take_screenshot_scroller(self, page: Page, **kwargs) -> str:
    """Capture a very tall page by scrolling, shooting segments, and stitching.

    Sets the viewport as large as allowed (capped by
    ``screenshot_height_threshold``), scrolls through the page one
    viewport at a time, screenshots each segment, and pastes the
    segments into a single image.

    Args:
        page: The Playwright page to capture.
        **kwargs: ``screenshot_height_threshold`` caps the viewport height
            (defaults to ``SCREENSHOT_HEIGHT_TRESHOLD``).

    Returns:
        Base64-encoded JPEG of the stitched page; on failure, a black
        placeholder image carrying the error text.
    """
    try:
        # Measure the full document so we know how many segments we need.
        page_height = await page.evaluate("document.documentElement.scrollHeight")
        page_width = await page.evaluate("document.documentElement.scrollWidth")

        # Use the largest viewport we allow to minimize the segment count.
        large_viewport_height = min(page_height, kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD))
        await page.set_viewport_size({"width": page_width, "height": large_viewport_height})

        viewport_height = page.viewport_size["height"]
        # Ceiling division: the old `(h // vh) + 1` produced an extra,
        # fully-duplicated segment whenever the page height was an exact
        # multiple of the viewport height.
        num_segments = max(1, (page_height + viewport_height - 1) // viewport_height)

        segments = []
        for i in range(num_segments):
            y_offset = i * viewport_height
            await page.evaluate(f"window.scrollTo(0, {y_offset})")
            await asyncio.sleep(0.01)  # brief pause so the segment can render
            seg_shot = await page.screenshot(full_page=False)
            img = Image.open(BytesIO(seg_shot)).convert('RGB')
            # Browsers clamp scrolling at the bottom, so the last shot
            # re-shows earlier content; crop it so the stitched image has
            # no duplicated strip and its height matches the page.
            remaining = page_height - y_offset
            if remaining < img.height:
                img = img.crop((0, img.height - remaining, img.width, img.height))
            segments.append(img)

        total_height = sum(img.height for img in segments)
        stitched = Image.new('RGB', (segments[0].width, total_height))
        offset = 0
        for img in segments:
            stitched.paste(img, (0, offset))
            offset += img.height

        buffered = BytesIO()
        # JPEG, not BMP: BMP ignores the `quality` option and is
        # uncompressed, ballooning the base64 payload; JPEG also matches
        # the format used by the other screenshot paths.
        stitched.save(buffered, format="JPEG", quality=85)
        return base64.b64encode(buffered.getvalue()).decode('utf-8')
    except Exception as e:
        error_message = f"Failed to take large viewport screenshot: {str(e)}"
        self.logger.error(
            message="Large viewport screenshot failed: {error}",
            tag="ERROR",
            params={"error": error_message}
        )
        # Return a placeholder image that displays the error text.
        img = Image.new('RGB', (800, 600), color='black')
        draw = ImageDraw.Draw(img)
        font = ImageFont.load_default()
        draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')
    finally:
        # NOTE(review): the page is closed even on success, so callers
        # cannot reuse it after a stitched capture — confirm this is the
        # intended lifecycle (the naive screenshot path does not close it).
        await page.close()
|
||||
|
||||
async def take_screenshot_naive(self, page: Page) -> str:
|
||||
"""
|
||||
Takes a screenshot of the current page.
|
||||
|
||||
@@ -1193,7 +1323,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"""
|
||||
try:
|
||||
# The page is already loaded, just take the screenshot
|
||||
screenshot = await page.screenshot(full_page=True)
|
||||
screenshot = await page.screenshot(full_page=False)
|
||||
return base64.b64encode(screenshot).decode('utf-8')
|
||||
except Exception as e:
|
||||
error_message = f"Failed to take screenshot: {str(e)}"
|
||||
|
||||
@@ -147,6 +147,7 @@ class AsyncWebCrawler:
|
||||
# Other parameters
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
pdf: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
@@ -230,6 +231,7 @@ class AsyncWebCrawler:
|
||||
async_response: AsyncCrawlResponse = None
|
||||
cached_result = None
|
||||
screenshot_data = None
|
||||
pdf_data = None
|
||||
extracted_content = None
|
||||
|
||||
start_time = time.perf_counter()
|
||||
@@ -245,6 +247,10 @@ class AsyncWebCrawler:
|
||||
screenshot_data = cached_result.screenshot
|
||||
if not screenshot_data:
|
||||
cached_result = None
|
||||
if pdf:
|
||||
pdf_data = cached_result.pdf
|
||||
if not pdf_data:
|
||||
cached_result = None
|
||||
# if verbose:
|
||||
# print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
|
||||
self.logger.url_status(
|
||||
@@ -264,10 +270,12 @@ class AsyncWebCrawler:
|
||||
async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(
|
||||
url,
|
||||
screenshot=screenshot,
|
||||
pdf=pdf,
|
||||
**kwargs
|
||||
)
|
||||
html = sanitize_input_encode(async_response.html)
|
||||
screenshot_data = async_response.screenshot
|
||||
pdf_data = async_response.pdf_data
|
||||
t2 = time.perf_counter()
|
||||
self.logger.url_status(
|
||||
url=cache_context.display_url,
|
||||
@@ -289,6 +297,7 @@ class AsyncWebCrawler:
|
||||
content_filter=content_filter,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
verbose=verbose,
|
||||
is_cached=bool(cached_result),
|
||||
async_response=async_response,
|
||||
@@ -362,6 +371,7 @@ class AsyncWebCrawler:
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
pdf: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
@@ -550,6 +560,7 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
screenshot = None if not screenshot else screenshot
|
||||
pdf_data = kwargs.get("pdf_data", None)
|
||||
|
||||
|
||||
if kwargs.get("prettiify", False):
|
||||
@@ -567,6 +578,7 @@ class AsyncWebCrawler:
|
||||
links=links,
|
||||
metadata=metadata,
|
||||
screenshot=screenshot,
|
||||
pdf=pdf_data,
|
||||
extracted_content=extracted_content,
|
||||
success=True,
|
||||
error_message="",
|
||||
|
||||
@@ -56,4 +56,5 @@ MAX_METRICS_HISTORY = 1000
|
||||
|
||||
NEED_MIGRATION = True
|
||||
URL_LOG_SHORTEN_LENGTH = 30
|
||||
SHOW_DEPRECATION_WARNINGS = True
|
||||
SHOW_DEPRECATION_WARNINGS = True
|
||||
SCREENSHOT_HEIGHT_TRESHOLD = 10000
|
||||
@@ -23,6 +23,7 @@ class CrawlResult(BaseModel):
|
||||
links: Dict[str, List[Dict]] = {}
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
screenshot: Optional[str] = None
|
||||
pdf : Optional[bytes] = None
|
||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||
markdown_v2: Optional[MarkdownGenerationResult] = None
|
||||
fit_markdown: Optional[str] = None
|
||||
@@ -39,6 +40,7 @@ class AsyncCrawlResponse(BaseModel):
|
||||
response_headers: Dict[str, str]
|
||||
status_code: int
|
||||
screenshot: Optional[str] = None
|
||||
pdf_data: Optional[bytes] = None
|
||||
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
|
||||
|
||||
117
docs/examples/tutorial_dynamic_clicks.md
Normal file
117
docs/examples/tutorial_dynamic_clicks.md
Normal file
@@ -0,0 +1,117 @@
|
||||
# Tutorial: Clicking Buttons to Load More Content with Crawl4AI
|
||||
|
||||
## Introduction
|
||||
|
||||
When scraping dynamic websites, it’s common to encounter “Load More” or “Next” buttons that must be clicked to reveal new content. Crawl4AI provides a straightforward way to handle these situations using JavaScript execution and waiting conditions. In this tutorial, we’ll cover two approaches:
|
||||
|
||||
1. **Step-by-step (Session-based) Approach:** Multiple calls to `arun()` to progressively load more content.
|
||||
2. **Single-call Approach:** Execute a more complex JavaScript snippet inside a single `arun()` call to handle all clicks at once before the extraction.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- A working installation of Crawl4AI
|
||||
- Basic familiarity with Python’s `async`/`await` syntax
|
||||
|
||||
## Step-by-Step Approach
|
||||
|
||||
Use a session ID to maintain state across multiple `arun()` calls:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
|
||||
js_code = [
|
||||
# This JS finds the “Next” button and clicks it
|
||||
"const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();"
|
||||
]
|
||||
|
||||
wait_for_condition = "css:.new-content-class"
|
||||
|
||||
async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
|
||||
# 1. Load the initial page
|
||||
result_initial = await crawler.arun(
|
||||
url="https://example.com",
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
session_id="my_session"
|
||||
)
|
||||
|
||||
# 2. Click the 'Next' button and wait for new content
|
||||
result_next = await crawler.arun(
|
||||
url="https://example.com",
|
||||
session_id="my_session",
|
||||
js_code=js_code,
|
||||
wait_for=wait_for_condition,
|
||||
js_only=True,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
# `result_next` now contains the updated HTML after clicking 'Next'
|
||||
```
|
||||
|
||||
**Key Points:**
|
||||
- **`session_id`**: Keeps the same browser context open.
|
||||
- **`js_code`**: Executes JavaScript in the context of the already loaded page.
|
||||
- **`wait_for`**: Ensures the crawler waits until new content is fully loaded.
|
||||
- **`js_only=True`**: Runs the JS in the current session without reloading the page.
|
||||
|
||||
By repeating the `arun()` call multiple times and modifying the `js_code` (e.g., clicking different modules or pages), you can iteratively load all the desired content.
|
||||
|
||||
## Single-call Approach
|
||||
|
||||
If the page allows it, you can run a single `arun()` call with a more elaborate JavaScript snippet that:
|
||||
- Iterates over all the modules or "Next" buttons
|
||||
- Clicks them one by one
|
||||
- Waits for content updates between each click
|
||||
- Returns control to Crawl4AI for extraction once all clicks are done
|
||||
|
||||
Example snippet:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
|
||||
js_code = [
|
||||
# Example JS that clicks multiple modules:
|
||||
"""
|
||||
(async () => {
|
||||
const modules = document.querySelectorAll('.module-item');
|
||||
for (let i = 0; i < modules.length; i++) {
|
||||
modules[i].scrollIntoView();
|
||||
modules[i].click();
|
||||
// Wait for each module’s content to load, adjust 100ms as needed
|
||||
await new Promise(r => setTimeout(r, 100));
|
||||
}
|
||||
})();
|
||||
"""
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
js_code=js_code,
|
||||
wait_for="css:.final-loaded-content-class",
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
# `result` now contains all content after all modules have been clicked in one go.
|
||||
```
|
||||
|
||||
**Key Points:**
|
||||
- All interactions (clicks and waits) happen before the extraction.
|
||||
- Ideal for pages where all steps can be done in a single pass.
|
||||
|
||||
## Choosing the Right Approach
|
||||
|
||||
- **Step-by-Step (Session-based)**:
|
||||
- Good when you need fine-grained control or must dynamically check conditions before clicking the next page.
|
||||
- Useful if the page requires multiple conditions checked at runtime.
|
||||
|
||||
- **Single-call**:
|
||||
- Perfect if the sequence of interactions is known in advance.
|
||||
- Cleaner code if the page’s structure is consistent and predictable.
|
||||
|
||||
## Conclusion
|
||||
|
||||
Crawl4AI makes it easy to handle dynamic content:
|
||||
- Use session IDs and multiple `arun()` calls for stepwise crawling.
|
||||
- Or pack all actions into one `arun()` call if the interactions are well-defined upfront.
|
||||
|
||||
This flexibility ensures you can handle a wide range of dynamic web pages efficiently.
|
||||
Reference in New Issue
Block a user