feat: Add screenshot functionality to crawl_urls

This commit is contained in:
unclecode
2024-06-07 15:33:15 +08:00
parent 8e73a482a2
commit 226a62a3c0
3 changed files with 37 additions and 4 deletions

View File

@@ -13,8 +13,8 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
## Recent Changes ## Recent Changes
### v0.2.3 ### v0.2.3
- 🎨 Extract and return all media tags (Images, Audio, and Video). - 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
- 🖼️ Take screenshots of the page. - 🖼️ Take [screenshots](#taking-screenshots) of the page.
### v0.2.2 ### v0.2.2
- Support multiple JS scripts - Support multiple JS scripts
@@ -266,6 +266,14 @@ Crawl result without raw HTML content:
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False) result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
``` ```
### Taking Screenshots
```python
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
with open("screenshot.png", "wb") as f:
f.write(base64.b64decode(result.screenshot))
```
### Adding a chunking strategy: RegexChunking ### Adding a chunking strategy: RegexChunking
Using RegexChunking: Using RegexChunking:
@@ -372,6 +380,7 @@ result = crawler.run(url="https://www.nbcnews.com/business")
| `urls` | A list of URLs to crawl and extract data from. | Yes | - | | `urls` | A list of URLs to crawl and extract data from. | Yes | - |
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` | | `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
| `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` | | `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
| `screenshot` | Whether to take a screenshot of the page. | No | `false` |
| `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` | | `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` | | `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` | | `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |

View File

@@ -35,10 +35,20 @@ def cprint(message, press_any_key=False):
def basic_usage(crawler): def basic_usage(crawler):
cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]") cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True) result = crawler.run(url="https://www.nbcnews.com/business")
cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]") cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
print_result(result) print_result(result)
def screenshot_usage(crawler):
cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
# Save the screenshot to a file
with open("screenshot.png", "wb") as f:
f.write(base64.b64decode(result.screenshot))
cprint("Screenshot saved to 'screenshot.png'!")
print_result(result)
def understanding_parameters(crawler): def understanding_parameters(crawler):
cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]") cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.") cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
@@ -187,11 +197,11 @@ def main():
crawler = create_crawler() crawler = create_crawler()
crawler.always_by_pass_cache = True
basic_usage(crawler) basic_usage(crawler)
understanding_parameters(crawler) understanding_parameters(crawler)
crawler.always_by_pass_cache = True crawler.always_by_pass_cache = True
screenshot_usage(crawler)
add_chunking_strategy(crawler) add_chunking_strategy(crawler)
add_extraction_strategy(crawler) add_extraction_strategy(crawler)
add_llm_extraction_strategy(crawler) add_llm_extraction_strategy(crawler)

View File

@@ -50,6 +50,20 @@ crawler.warmup()</code></pre>
<div> <div>
<pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre> <pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
</div> </div>
<!-- Step 3.5 Screenshot -->
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
📸
<strong>Let's take a screenshot of the page!</strong>
</div>
<div>
<pre><code class="language-python">result = crawler.run(
url="https://www.nbcnews.com/business",
screenshot=True
)
with open("screenshot.png", "wb") as f:
f.write(base64.b64decode(result.screenshot))</code></pre>
</div>
<!-- Step 4 --> <!-- Step 4 -->
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50"> <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">