feat: Add screenshot functionality to crawl_urls

The code changes in this commit add the `screenshot` parameter to the `crawl_urls` function in `main.py`. This allows users to specify whether they want to take a screenshot of the page during the crawling process. The default value is `False`.

This commit message follows the established convention of starting with a type (feat for feature) and providing a concise and descriptive summary of the changes made.
This commit is contained in:
unclecode
2024-06-07 15:23:32 +08:00
parent 0533aeb814
commit 8e73a482a2
11 changed files with 147 additions and 27 deletions

View File

@@ -104,6 +104,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
chunking_strategy: document.getElementById("chunking-strategy-select").value,
chunking_strategy_args: {},
css_selector: document.getElementById("css-selector").value,
screenshot: document.getElementById("screenshot-checkbox").checked,
// instruction: document.getElementById("instruction").value,
// semantic_filter: document.getElementById("semantic_filter").value,
verbose: true,
@@ -138,7 +139,14 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
document.getElementById("markdown-result").textContent = result.markdown;
document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
if (result.screenshot){
const imgElement = document.createElement("img");
// Set the src attribute with the base64 data
imgElement.src = `data:image/png;base64,${result.screenshot}`;
document.getElementById("screenshot-result").innerHTML = "";
document.getElementById("screenshot-result").appendChild(imgElement);
}
// Update code examples dynamically
const extractionStrategy = data.extraction_strategy;
const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";

View File

@@ -124,6 +124,10 @@
<input type="checkbox" id="bypass-cache-checkbox" checked />
<label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
</div>
<div class="flex items-center gap-2">
<input type="checkbox" id="screenshot-checkbox" checked />
<label for="screenshot-checkbox" class="text-lime-500 font-bold">Screenshot</label>
</div>
<div class="flex items-center gap-2 hidden">
<input type="checkbox" id="extract-blocks-checkbox" />
<label for="extract-blocks-checkbox" class="text-lime-500 font-bold">Extract Blocks</label>
@@ -152,12 +156,16 @@
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
Medias
</button>
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="screenshot">
Screenshot
</button>
</div>
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
<pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
<pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
<pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
<pre class="hidden h-full flex"><code id="screenshot-result"></code></pre>
</div>
</div>