Add PDF & screenshot functionality, new tutorial

- Added support for exporting pages as PDFs - Enhanced screenshot functionality for long pages - Created a tutorial on dynamic content loading with 'Load More' buttons. - Updated web crawler to handle PDF data in responses.
2024-12-10 20:10:39 +08:00
parent e130fd8db9
commit 5431fa2d0c
5 changed files with 271 additions and 9 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -19,8 +19,14 @@ from .js_snippet import load_js_script
 from .models import AsyncCrawlResponse
 from .utils import create_box_message
 from .user_agent_generator import UserAgentGenerator
 from .config import SCREENSHOT_HEIGHT_TRESHOLD
 from playwright_stealth import StealthConfig, stealth_async
 from io import BytesIO
 import base64
 from PIL import Image, ImageDraw, ImageFont
 stealth_config = StealthConfig(
    webdriver=True,
    chrome_app=True,
@@ -481,6 +487,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            self.user_agent = user_agenr_generator.generate(
                 **kwargs.get("user_agent_generator_config", {})
            )
        self.pdf = kwargs.get("pdf", False)  # New flag
        self.screenshot_requested = kwargs.get('screenshot', False)
        self.proxy = kwargs.get("proxy")
        self.proxy_config = kwargs.get("proxy_config")
        self.headless = kwargs.get("headless", True)
@@ -752,7 +761,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        """
        response_headers = {}
        status_code = 200  # Default to 200 for local/raw HTML
-        screenshot_requested = kwargs.get('screenshot', False)
+        screenshot_requested = kwargs.get("screenshot", self.screenshot_requested)
        pdf_requested = kwargs.get("pdf", self.pdf)
        screenshot_data = None
        if url.startswith(('http://', 'https://')):
@@ -796,6 +806,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        response_headers = {}
        status_code = None
        screenshot_requested = kwargs.get("screenshot", self.screenshot_requested)
        pdf_requested = kwargs.get("pdf", self.pdf)
        # Reset downloaded files list for new crawl
        self._downloaded_files = []
@@ -1069,17 +1082,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            html = await page.content()
            await self.execute_hook('before_return_html', page, html, context = context, **kwargs)
            start_export_time = time.perf_counter()
            pdf_data = None
            if pdf_requested:
                # Generate PDF once
                pdf_data = await self.export_pdf(page)            
            # Check if kwargs has screenshot=True then take screenshot
            screenshot_data = None
-            if kwargs.get("screenshot"):
+            if screenshot_requested: #kwargs.get("screenshot"):
                # Check we have screenshot_wait_for parameter, if we have simply wait for that time
                screenshot_wait_for = kwargs.get("screenshot_wait_for")
                if screenshot_wait_for:
                    await asyncio.sleep(screenshot_wait_for)
-                screenshot_data = await self.take_screenshot(page)          
+                
-
+                screenshot_data = await self.take_screenshot(page, **kwargs)    
-            # if self.verbose:
+            end_export_time = time.perf_counter()
-            #     print(f"[LOG] ✅ Crawled {url} successfully!")
+            if screenshot_data or pdf_data:
                self.logger.info(
                    message="Exporting PDF and taking screenshot took {duration:.2f}s",
                    tag="EXPORT",
                    params={"duration": end_export_time - start_export_time}
                )
            if self.use_cached_html:
                cache_file_path = os.path.join(
@@ -1105,6 +1129,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                response_headers=response_headers, 
                status_code=status_code,
                screenshot=screenshot_data,
                pdf_data=pdf_data,
                get_delayed_content=get_delayed_content,
                downloaded_files=self._downloaded_files if self._downloaded_files else None
            )
@@ -1181,7 +1206,112 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            # if self.verbose:
            #     print(f"Warning: Failed to remove overlay elements: {str(e)}")
-    async def take_screenshot(self, page: Page) -> str:
+    async def export_pdf(self, page: Page) -> bytes:
        """
        Exports the current page as a PDF.
        """
        pdf_data = await page.pdf(print_background=True)
        return pdf_data
    async def take_screenshot(self, page, **kwargs) -> str:
        """
        Capture a screenshot, choosing the strategy from the document height.

        The page's ``document.documentElement.scrollHeight`` is compared with
        ``kwargs["screenshot_height_threshold"]`` (falling back to
        ``SCREENSHOT_HEIGHT_TRESHOLD`` when the key is absent).

        Args:
            page: The page to capture.
            **kwargs: Forwarded unchanged to ``take_screenshot_scroller`` when
                the scrolling strategy is selected.

        Returns:
            The base64-encoded image string produced by the selected helper.
        """
        height_limit = kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD)
        document_height = await page.evaluate("document.documentElement.scrollHeight")

        if document_height < height_limit:
            # Below the threshold: a single plain capture is enough.
            return await self.take_screenshot_naive(page)

        # At or above the threshold: scroll through the page and stitch.
        # (A PDF-based alternative exists in take_screenshot_from_pdf but is
        # not used here.)
        return await self.take_screenshot_scroller(page, **kwargs)
    async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str:
        """
        Render the first page of a PDF as a base64-encoded JPEG.

        ``pdf2image`` is imported lazily inside the method, so it is only
        needed when this path is used. Requires pdf2image and poppler.

        Args:
            pdf_data: Raw PDF bytes to rasterise.

        Returns:
            Base64 text of a JPEG of the first PDF page. If any step fails,
            the failure is logged and the JPEG is instead an 800x600 black
            placeholder with the error message drawn on it.
        """
        try:
            from pdf2image import convert_from_bytes

            # Only the first rendered page is used.
            first_page = convert_from_bytes(pdf_data)[0].convert('RGB')
            jpeg_buffer = BytesIO()
            first_page.save(jpeg_buffer, format="JPEG")
            return base64.b64encode(jpeg_buffer.getvalue()).decode('utf-8')
        except Exception as e:
            error_message = f"Failed to take PDF-based screenshot: {str(e)}"
            self.logger.error(
                message="PDF Screenshot failed: {error}",
                tag="ERROR",
                params={"error": error_message}
            )

            # Fall back to a placeholder image carrying the error text.
            placeholder = Image.new('RGB', (800, 600), color='black')
            ImageDraw.Draw(placeholder).text(
                (10, 10),
                error_message,
                fill=(255, 255, 255),
                font=ImageFont.load_default(),
            )
            placeholder_buffer = BytesIO()
            placeholder.save(placeholder_buffer, format="JPEG")
            return base64.b64encode(placeholder_buffer.getvalue()).decode('utf-8')
    async def take_screenshot_scroller(self, page: Page, **kwargs) -> str:
        """
        Capture a tall page by scrolling through it and stitching the pieces.

        The viewport is resized to the full document width and to the smaller
        of the document height and the height threshold. The page is then
        scrolled one viewport at a time, each viewport is captured, and the
        captures are stacked vertically into a single image.

        Args:
            page: The page to capture. It is left open; if it had a viewport
                size before the call, that size is restored afterwards.
            **kwargs: ``screenshot_height_threshold`` overrides
                ``SCREENSHOT_HEIGHT_TRESHOLD`` as the maximum viewport height.

        Returns:
            Base64-encoded BMP of the stitched page. On failure the error is
            logged and a base64-encoded JPEG placeholder (800x600, black, with
            the error text drawn on it) is returned instead.
        """
        original_viewport = None
        try:
            # Remember the viewport so the resize below can be undone.
            original_viewport = page.viewport_size

            # Get page dimensions
            page_height = await page.evaluate("document.documentElement.scrollHeight")
            page_width = await page.evaluate("document.documentElement.scrollWidth")

            # Set a large viewport, capped by the height threshold
            large_viewport_height = min(
                page_height,
                kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD),
            )
            await page.set_viewport_size({"width": page_width, "height": large_viewport_height})
            viewport_height = page.viewport_size["height"]

            # Ceiling division: one capture per started viewport of content.
            # (The previous `// ... + 1` took a spurious extra capture when the
            # page height was an exact multiple of the viewport height.)
            num_segments = max(1, (page_height + viewport_height - 1) // viewport_height)

            segments = []
            for i in range(num_segments):
                y_offset = i * viewport_height
                await page.evaluate(f"window.scrollTo(0, {y_offset})")
                await asyncio.sleep(0.01)  # wait for render
                seg_shot = await page.screenshot(full_page=False)
                img = Image.open(BytesIO(seg_shot)).convert('RGB')

                remaining = page_height - y_offset
                if remaining < viewport_height:
                    # The scroll position is clamped at the end of the
                    # document, so the last capture overlaps the previous one.
                    # Keep only the bottom strip that has not been captured
                    # yet. The strip is scaled by captured/viewport height so
                    # it stays correct when image pixels differ from CSS px.
                    keep = max(1, round(img.height * remaining / viewport_height))
                    img = img.crop((0, img.height - keep, img.width, img.height))

                segments.append(img)

            total_height = sum(img.height for img in segments)
            stitched = Image.new('RGB', (segments[0].width, total_height))
            offset = 0
            for img in segments:
                stitched.paste(img, (0, offset))
                offset += img.height

            # Output format is kept as BMP so the returned payload type is
            # unchanged for existing consumers.
            buffered = BytesIO()
            stitched.save(buffered, format="BMP")
            return base64.b64encode(buffered.getvalue()).decode('utf-8')
        except Exception as e:
            error_message = f"Failed to take large viewport screenshot: {str(e)}"
            self.logger.error(
                message="Large viewport screenshot failed: {error}",
                tag="ERROR",
                params={"error": error_message}
            )
            # return error image
            img = Image.new('RGB', (800, 600), color='black')
            draw = ImageDraw.Draw(img)
            font = ImageFont.load_default()
            draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
            buffered = BytesIO()
            img.save(buffered, format="JPEG")
            return base64.b64encode(buffered.getvalue()).decode('utf-8')
        finally:
            # The page is supplied by the caller, so it is not closed here;
            # only the viewport change made above is undone.
            # NOTE(review): this method previously closed the page in this
            # block — confirm no caller relied on that.
            if original_viewport:
                try:
                    await page.set_viewport_size(original_viewport)
                except Exception as e:
                    self.logger.error(
                        message="Failed to restore viewport after screenshot: {error}",
                        tag="ERROR",
                        params={"error": str(e)}
                    )
    async def take_screenshot_naive(self, page: Page) -> str:
        """
        Takes a screenshot of the current page.
@@ -1193,7 +1323,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        """
        try:
            # The page is already loaded, just take the screenshot
-            screenshot = await page.screenshot(full_page=True)
+            screenshot = await page.screenshot(full_page=False)
            return base64.b64encode(screenshot).decode('utf-8')
        except Exception as e:
            error_message = f"Failed to take screenshot: {str(e)}"
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -147,6 +147,7 @@ class AsyncWebCrawler:
        # Other parameters
        css_selector: str = None,
        screenshot: bool = False,
        pdf: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
@@ -230,6 +231,7 @@ class AsyncWebCrawler:
                async_response: AsyncCrawlResponse = None
                cached_result = None
                screenshot_data = None
                pdf_data = None
                extracted_content = None
                start_time = time.perf_counter()
@@ -245,6 +247,10 @@ class AsyncWebCrawler:
                        screenshot_data = cached_result.screenshot
                        if not screenshot_data:
                            cached_result = None
                    if pdf:
                        pdf_data = cached_result.pdf
                        if not pdf_data:
                            cached_result = None
                    # if verbose:
                    #     print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
                    self.logger.url_status(
@@ -264,10 +270,12 @@ class AsyncWebCrawler:
                    async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(
                        url, 
                        screenshot=screenshot, 
                        pdf=pdf,
                        **kwargs
                    )
                    html = sanitize_input_encode(async_response.html)
                    screenshot_data = async_response.screenshot
                    pdf_data = async_response.pdf_data
                    t2 = time.perf_counter()
                    self.logger.url_status(
                        url=cache_context.display_url,
@@ -289,6 +297,7 @@ class AsyncWebCrawler:
                    content_filter=content_filter,
                    css_selector=css_selector,
                    screenshot=screenshot_data,
                    pdf_data=pdf_data,
                    verbose=verbose,
                    is_cached=bool(cached_result),
                    async_response=async_response,
@@ -362,6 +371,7 @@ class AsyncWebCrawler:
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        pdf: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
@@ -550,6 +560,7 @@ class AsyncWebCrawler:
            )
        screenshot = None if not screenshot else screenshot
        pdf_data = kwargs.get("pdf_data", None) 
        if kwargs.get("prettiify", False):
@@ -567,6 +578,7 @@ class AsyncWebCrawler:
            links=links,
            metadata=metadata,
            screenshot=screenshot,
            pdf=pdf_data,
            extracted_content=extracted_content,
            success=True,
            error_message="",
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -56,4 +56,5 @@ MAX_METRICS_HISTORY = 1000
 NEED_MIGRATION = True
 URL_LOG_SHORTEN_LENGTH = 30
-SHOW_DEPRECATION_WARNINGS = True
+SHOW_DEPRECATION_WARNINGS = True
 SCREENSHOT_HEIGHT_TRESHOLD = 10000
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -23,6 +23,7 @@ class CrawlResult(BaseModel):
    links: Dict[str, List[Dict]] = {}
    downloaded_files: Optional[List[str]] = None
    screenshot: Optional[str] = None
    pdf : Optional[bytes] = None
    markdown: Optional[Union[str, MarkdownGenerationResult]] = None
    markdown_v2: Optional[MarkdownGenerationResult] = None
    fit_markdown: Optional[str] = None
@@ -39,6 +40,7 @@ class AsyncCrawlResponse(BaseModel):
    response_headers: Dict[str, str]
    status_code: int
    screenshot: Optional[str] = None
    pdf_data: Optional[bytes] = None
    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
    downloaded_files: Optional[List[str]] = None
--- a/docs/examples/tutorial_dynamic_clicks.md
+++ b/docs/examples/tutorial_dynamic_clicks.md
@@ -0,0 +1,117 @@
 # Tutorial: Clicking Buttons to Load More Content with Crawl4AI
 ## Introduction
 When scraping dynamic websites, it’s common to encounter “Load More” or “Next” buttons that must be clicked to reveal new content. Crawl4AI provides a straightforward way to handle these situations using JavaScript execution and waiting conditions. In this tutorial, we’ll cover two approaches:
 1. **Step-by-step (Session-based) Approach:** Multiple calls to `arun()` to progressively load more content.
 2. **Single-call Approach:** Execute a more complex JavaScript snippet inside a single `arun()` call to handle all clicks at once before the extraction.
 ## Prerequisites
 - A working installation of Crawl4AI
 - Basic familiarity with Python’s `async`/`await` syntax
 ## Step-by-Step Approach
 Use a session ID to maintain state across multiple `arun()` calls:
 ```python
 from crawl4ai import AsyncWebCrawler, CacheMode
 js_code = [
    # This JS finds the “Next” button and clicks it
    "const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();"
 ]
 wait_for_condition = "css:.new-content-class"
 async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
    # 1. Load the initial page
    result_initial = await crawler.arun(
        url="https://example.com",
        cache_mode=CacheMode.BYPASS,
        session_id="my_session"
    )
    # 2. Click the 'Next' button and wait for new content
    result_next = await crawler.arun(
        url="https://example.com",
        session_id="my_session",
        js_code=js_code,
        wait_for=wait_for_condition,
        js_only=True,
        cache_mode=CacheMode.BYPASS
    )
 # `result_next` now contains the updated HTML after clicking 'Next'
 ```
 **Key Points:**
 - **`session_id`**: Keeps the same browser context open.
 - **`js_code`**: Executes JavaScript in the context of the already loaded page.
 - **`wait_for`**: Ensures the crawler waits until new content is fully loaded.
 - **`js_only=True`**: Runs the JS in the current session without reloading the page.
 By repeating the `arun()` call multiple times and modifying the `js_code` (e.g., clicking different modules or pages), you can iteratively load all the desired content.
 ## Single-call Approach
 If the page allows it, you can run a single `arun()` call with a more elaborate JavaScript snippet that:
 - Iterates over all the modules or "Next" buttons
 - Clicks them one by one
 - Waits for content updates between each click
 - Returns control to Crawl4AI for extraction once all clicks are done.
 Example snippet:
 ```python
 from crawl4ai import AsyncWebCrawler, CacheMode
 js_code = [
    # Example JS that clicks multiple modules:
    """
    (async () => {
      const modules = document.querySelectorAll('.module-item');
      for (let i = 0; i < modules.length; i++) {
        modules[i].scrollIntoView();
        modules[i].click();
        // Wait for each module’s content to load, adjust 100ms as needed
        await new Promise(r => setTimeout(r, 100));
      }
    })();
    """
 ]
 async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
    result = await crawler.arun(
        url="https://example.com",
        js_code=js_code,
        wait_for="css:.final-loaded-content-class",
        cache_mode=CacheMode.BYPASS
    )
 # `result` now contains all content after all modules have been clicked in one go.
 ```
 **Key Points:**
 - All interactions (clicks and waits) happen before the extraction.
 - Ideal for pages where all steps can be done in a single pass.
 ## Choosing the Right Approach
 - **Step-by-Step (Session-based)**: 
  - Good when you need fine-grained control or must dynamically check conditions before clicking the next page.
  - Useful if the page requires multiple conditions to be checked at runtime.
 - **Single-call**:
  - Perfect if the sequence of interactions is known in advance.
  - Cleaner code if the page’s structure is consistent and predictable.
 ## Conclusion
 Crawl4AI makes it easy to handle dynamic content:
 - Use session IDs and multiple `arun()` calls for stepwise crawling.
 - Or pack all actions into one `arun()` call if the interactions are well-defined upfront.
 This flexibility ensures you can handle a wide range of dynamic web pages efficiently.