feat: Add screenshot functionality to crawl_urls

2024-06-07 15:33:15 +08:00
parent 8e73a482a2
commit 226a62a3c0
3 changed files with 37 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -13,8 +13,8 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
 ## Recent Changes 

 ### v0.2.3
- 🎨 Extract and return all media tags (Images, Audio, and Video).
- 🖼️ Take screenshots of the page.
+- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
+- 🖼️ Take [screenshots](#taking-screenshots) of the page.

 ### v0.2.2
 - Support multiple JS scripts
@@ -266,6 +266,14 @@ Crawl result without raw HTML content:
 result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
 ```

+### Taking Screenshots
+
+```python
+result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
+with open("screenshot.png", "wb") as f:
+    f.write(base64.b64decode(result.screenshot))
+```
+
 ### Adding a chunking strategy: RegexChunking

 Using RegexChunking:
@@ -372,6 +380,7 @@ result = crawler.run(url="https://www.nbcnews.com/business")
 | `urls`                | A list of URLs to crawl and extract data from.                                                        | Yes      | -                   |
 | `include_raw_html`    | Whether to include the raw HTML content in the response.                                              | No       | `false`             |
 | `bypass_cache`        | Whether to force a fresh crawl even if the URL has been previously crawled.                           | No       | `false`             |
+| `screenshot`          | Whether to take screenshots of the page.                                                              | No       | `false`             |
 | `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5).    | No       | `5`                 |
 | `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy").                    | No       | `NoExtractionStrategy`    |
 | `chunking_strategy`   | The strategy to use for chunking the text before processing (e.g., "RegexChunking").                  | No       | `RegexChunking`     |
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -35,10 +35,20 @@ def cprint(message, press_any_key=False):

 def basic_usage(crawler):
    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
-    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
+    result = crawler.run(url="https://www.nbcnews.com/business")
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

+def screenshot_usage(crawler):
+    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
+    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
+    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
+    # Save the screenshot to a file
+    with open("screenshot.png", "wb") as f:
+        f.write(base64.b64decode(result.screenshot))
+    cprint("Screenshot saved to 'screenshot.png'!")
+    print_result(result)
+
 def understanding_parameters(crawler):
    cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
    cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
@@ -187,11 +197,11 @@ def main():

    crawler = create_crawler()

-    crawler.always_by_pass_cache = True
    basic_usage(crawler)
    understanding_parameters(crawler)
    
    crawler.always_by_pass_cache = True
+    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
--- a/pages/partial/how_to_guide.html
+++ b/pages/partial/how_to_guide.html
@@ -50,6 +50,20 @@ crawler.warmup()</code></pre>
        <div>
            <pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
        </div>
+        <!-- Step 3.5 Screenshot -->
+        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
+            📸
+            <strong>Let's take a screenshot of the page!</strong>
+        </div>
+        <div>
+            <pre><code class="language-python">result = crawler.run(
+    url="https://www.nbcnews.com/business",
+    screenshot=True
+)
+with open("screenshot.png", "wb") as f:
+    f.write(base64.b64decode(result.screenshot))</code></pre>
+        </div>
+

        <!-- Step 4 -->
        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">