feat: Add screenshot functionality to crawl_urls
This commit is contained in:
13
README.md
13
README.md
@@ -13,8 +13,8 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
|
|||||||
## Recent Changes
|
## Recent Changes
|
||||||
|
|
||||||
### v0.2.3
|
### v0.2.3
|
||||||
- 🎨 Extract and return all media tags (Images, Audio, and Video).
|
- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
|
||||||
- 🖼️ Take screenshots of the page.
|
- 🖼️ Take [screenshots](#taking-screenshots) of the page.
|
||||||
|
|
||||||
### v0.2.2
|
### v0.2.2
|
||||||
- Support multiple JS scripts
|
- Support multiple JS scripts
|
||||||
@@ -266,6 +266,14 @@ Crawl result without raw HTML content:
|
|||||||
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
|
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Taking Screenshots
|
||||||
|
|
||||||
|
```python
|
||||||
|
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
|
||||||
|
with open("screenshot.png", "wb") as f:
|
||||||
|
    f.write(base64.b64decode(result.screenshot))
|
||||||
|
```
|
||||||
|
|
||||||
### Adding a chunking strategy: RegexChunking
|
### Adding a chunking strategy: RegexChunking
|
||||||
|
|
||||||
Using RegexChunking:
|
Using RegexChunking:
|
||||||
@@ -372,6 +380,7 @@ result = crawler.run(url="https://www.nbcnews.com/business")
|
|||||||
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
|
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
|
||||||
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
|
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
|
||||||
| `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
|
| `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
|
||||||
|
| `screenshot` | Whether to take a screenshot of the page (returned base64-encoded in the result's `screenshot` field). | No | `false` |
|
||||||
| `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
|
| `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
|
||||||
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
|
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
|
||||||
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
|
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
|
||||||
|
|||||||
@@ -35,10 +35,20 @@ def cprint(message, press_any_key=False):
|
|||||||
|
|
||||||
def basic_usage(crawler):
|
def basic_usage(crawler):
|
||||||
cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
|
cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
|
||||||
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
|
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||||
cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
|
def screenshot_usage(crawler):
|
||||||
|
cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
|
||||||
|
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
|
||||||
|
cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
|
||||||
|
# Save the screenshot to a file
|
||||||
|
with open("screenshot.png", "wb") as f:
|
||||||
|
f.write(base64.b64decode(result.screenshot))
|
||||||
|
cprint("Screenshot saved to 'screenshot.png'!")
|
||||||
|
print_result(result)
|
||||||
|
|
||||||
def understanding_parameters(crawler):
|
def understanding_parameters(crawler):
|
||||||
cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
|
cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
|
||||||
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
|
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
|
||||||
@@ -187,11 +197,11 @@ def main():
|
|||||||
|
|
||||||
crawler = create_crawler()
|
crawler = create_crawler()
|
||||||
|
|
||||||
crawler.always_by_pass_cache = True
|
|
||||||
basic_usage(crawler)
|
basic_usage(crawler)
|
||||||
understanding_parameters(crawler)
|
understanding_parameters(crawler)
|
||||||
|
|
||||||
crawler.always_by_pass_cache = True
|
crawler.always_by_pass_cache = True
|
||||||
|
screenshot_usage(crawler)
|
||||||
add_chunking_strategy(crawler)
|
add_chunking_strategy(crawler)
|
||||||
add_extraction_strategy(crawler)
|
add_extraction_strategy(crawler)
|
||||||
add_llm_extraction_strategy(crawler)
|
add_llm_extraction_strategy(crawler)
|
||||||
|
|||||||
@@ -50,6 +50,20 @@ crawler.warmup()</code></pre>
|
|||||||
<div>
|
<div>
|
||||||
<pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
|
<pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
|
||||||
</div>
|
</div>
|
||||||
|
<!-- Step 3.5 Screenshot -->
|
||||||
|
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||||
|
📸
|
||||||
|
<strong>Let's take a screenshot of the page!</strong>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<pre><code class="language-python">result = crawler.run(
|
||||||
|
url="https://www.nbcnews.com/business",
|
||||||
|
screenshot=True
|
||||||
|
)
|
||||||
|
with open("screenshot.png", "wb") as f:
|
||||||
|
    f.write(base64.b64decode(result.screenshot))</code></pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
<!-- Step 4 -->
|
<!-- Step 4 -->
|
||||||
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
<div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
|
||||||
|
|||||||
Reference in New Issue
Block a user