The code changes in this commit add the `screenshot` parameter to the `crawl_urls` function in `main.py`. This allows users to specify whether they want to take a screenshot of the page during the crawling process. The default value is `False`. This commit message follows the established convention of starting with a type (feat for feature) and providing a concise and descriptive summary of the changes made.
357 lines
14 KiB
JavaScript
357 lines
14 KiB
JavaScript
// JavaScript to manage dynamic form changes and logic
|
|
document.getElementById("extraction-strategy-select").addEventListener("change", function () {
|
|
const strategy = this.value;
|
|
const providerModelSelect = document.getElementById("provider-model-select");
|
|
const tokenInput = document.getElementById("token-input");
|
|
const instruction = document.getElementById("instruction");
|
|
const semantic_filter = document.getElementById("semantic_filter");
|
|
const instruction_div = document.getElementById("instruction_div");
|
|
const semantic_filter_div = document.getElementById("semantic_filter_div");
|
|
const llm_settings = document.getElementById("llm_settings");
|
|
|
|
if (strategy === "LLMExtractionStrategy") {
|
|
// providerModelSelect.disabled = false;
|
|
// tokenInput.disabled = false;
|
|
// semantic_filter.disabled = true;
|
|
// instruction.disabled = false;
|
|
llm_settings.classList.remove("hidden");
|
|
instruction_div.classList.remove("hidden");
|
|
semantic_filter_div.classList.add("hidden");
|
|
} else if (strategy === "NoExtractionStrategy") {
|
|
semantic_filter_div.classList.add("hidden");
|
|
instruction_div.classList.add("hidden");
|
|
llm_settings.classList.add("hidden");
|
|
} else {
|
|
// providerModelSelect.disabled = true;
|
|
// tokenInput.disabled = true;
|
|
// semantic_filter.disabled = false;
|
|
// instruction.disabled = true;
|
|
llm_settings.classList.add("hidden");
|
|
instruction_div.classList.add("hidden");
|
|
semantic_filter_div.classList.remove("hidden");
|
|
}
|
|
|
|
|
|
});
|
|
|
|
// Get the selected provider model and token from local storage
|
|
const storedProviderModel = localStorage.getItem("provider_model");
|
|
const storedToken = localStorage.getItem(storedProviderModel);
|
|
|
|
if (storedProviderModel) {
|
|
document.getElementById("provider-model-select").value = storedProviderModel;
|
|
}
|
|
|
|
if (storedToken) {
|
|
document.getElementById("token-input").value = storedToken;
|
|
}
|
|
|
|
// Handle provider model dropdown change
|
|
document.getElementById("provider-model-select").addEventListener("change", () => {
|
|
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
|
const storedToken = localStorage.getItem(selectedProviderModel);
|
|
|
|
if (storedToken) {
|
|
document.getElementById("token-input").value = storedToken;
|
|
} else {
|
|
document.getElementById("token-input").value = "";
|
|
}
|
|
});
|
|
|
|
// Fetch total count from the database
|
|
axios
|
|
.get("/total-count")
|
|
.then((response) => {
|
|
document.getElementById("total-count").textContent = response.data.count;
|
|
})
|
|
.catch((error) => console.error(error));
|
|
|
|
// Handle crawl button click
|
|
document.getElementById("crawl-btn").addEventListener("click", () => {
|
|
// validate input to have both URL and API token
|
|
// if selected extraction strategy is LLMExtractionStrategy, then API token is required
|
|
if (document.getElementById("extraction-strategy-select").value === "LLMExtractionStrategy") {
|
|
if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
|
|
alert("Please enter both URL(s) and API token.");
|
|
return;
|
|
}
|
|
}
|
|
|
|
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
|
const apiToken = document.getElementById("token-input").value;
|
|
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
|
|
const bypassCache = document.getElementById("bypass-cache-checkbox").checked;
|
|
|
|
// Save the selected provider model and token to local storage
|
|
localStorage.setItem("provider_model", selectedProviderModel);
|
|
localStorage.setItem(selectedProviderModel, apiToken);
|
|
|
|
const urlsInput = document.getElementById("url-input").value;
|
|
const urls = urlsInput.split(",").map((url) => url.trim());
|
|
const data = {
|
|
urls: urls,
|
|
include_raw_html: true,
|
|
bypass_cache: bypassCache,
|
|
extract_blocks: extractBlocks,
|
|
word_count_threshold: parseInt(document.getElementById("threshold").value),
|
|
extraction_strategy: document.getElementById("extraction-strategy-select").value,
|
|
extraction_strategy_args: {
|
|
provider: selectedProviderModel,
|
|
api_token: apiToken,
|
|
instruction: document.getElementById("instruction").value,
|
|
semantic_filter: document.getElementById("semantic_filter").value,
|
|
},
|
|
chunking_strategy: document.getElementById("chunking-strategy-select").value,
|
|
chunking_strategy_args: {},
|
|
css_selector: document.getElementById("css-selector").value,
|
|
screenshot: document.getElementById("screenshot-checkbox").checked,
|
|
// instruction: document.getElementById("instruction").value,
|
|
// semantic_filter: document.getElementById("semantic_filter").value,
|
|
verbose: true,
|
|
};
|
|
|
|
// import requests
|
|
|
|
// data = {
|
|
// "urls": [
|
|
// "https://www.nbcnews.com/business"
|
|
// ],
|
|
// "word_count_threshold": 10,
|
|
// "extraction_strategy": "NoExtractionStrategy",
|
|
// }
|
|
|
|
// response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally
|
|
// print(response.json())
|
|
|
|
// save api token to local storage
|
|
localStorage.setItem("api_token", document.getElementById("token-input").value);
|
|
|
|
document.getElementById("loading").classList.remove("hidden");
|
|
document.getElementById("result").style.visibility = "hidden";
|
|
document.getElementById("code_help").style.visibility = "hidden";
|
|
|
|
axios
|
|
.post("/crawl", data)
|
|
.then((response) => {
|
|
const result = response.data.results[0];
|
|
const parsedJson = JSON.parse(result.extracted_content);
|
|
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
|
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
|
document.getElementById("markdown-result").textContent = result.markdown;
|
|
document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
|
|
if (result.screenshot){
|
|
const imgElement = document.createElement("img");
|
|
// Set the src attribute with the base64 data
|
|
imgElement.src = `data:image/png;base64,${result.screenshot}`;
|
|
document.getElementById("screenshot-result").innerHTML = "";
|
|
document.getElementById("screenshot-result").appendChild(imgElement);
|
|
}
|
|
|
|
// Update code examples dynamically
|
|
const extractionStrategy = data.extraction_strategy;
|
|
const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
|
|
|
|
// REMOVE API TOKEN FROM CODE EXAMPLES
|
|
data.extraction_strategy_args.api_token = "your_api_token";
|
|
|
|
if (data.extraction_strategy === "NoExtractionStrategy") {
|
|
delete data.extraction_strategy_args;
|
|
delete data.extrac_blocks;
|
|
}
|
|
|
|
if (data.chunking_strategy === "RegexChunking") {
|
|
delete data.chunking_strategy_args;
|
|
}
|
|
|
|
delete data.verbose;
|
|
|
|
if (data.css_selector === "") {
|
|
delete data.css_selector;
|
|
}
|
|
|
|
if (!data.bypass_cache) {
|
|
delete data.bypass_cache;
|
|
}
|
|
|
|
if (!data.extract_blocks) {
|
|
delete data.extract_blocks;
|
|
}
|
|
|
|
if (!data.include_raw_html) {
|
|
delete data.include_raw_html;
|
|
}
|
|
|
|
document.getElementById(
|
|
"curl-code"
|
|
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
|
...data,
|
|
api_token: isLLMExtraction ? "your_api_token" : undefined,
|
|
}, null, 2)}' https://crawl4ai.com/crawl`;
|
|
|
|
document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
|
|
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
|
null,
|
|
2
|
|
)}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
|
|
|
|
document.getElementById(
|
|
"nodejs-code"
|
|
).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
|
|
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
|
null,
|
|
2
|
|
)};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
|
|
|
document.getElementById(
|
|
"library-code"
|
|
).textContent = `from crawl4ai.web_crawler import WebCrawler\nfrom crawl4ai.extraction_strategy import *\nfrom crawl4ai.chunking_strategy import *\n\ncrawler = WebCrawler()\ncrawler.warmup()\n\nresult = crawler.run(\n url='${
|
|
urls[0]
|
|
}',\n word_count_threshold=${data.word_count_threshold},\n extraction_strategy=${
|
|
isLLMExtraction
|
|
? `${extractionStrategy}(provider="${data.provider_model}", api_token="${data.api_token}")`
|
|
: extractionStrategy + "()"
|
|
},\n chunking_strategy=${data.chunking_strategy}(),\n bypass_cache=${
|
|
data.bypass_cache
|
|
},\n css_selector="${data.css_selector}"\n)\nprint(result)`;
|
|
|
|
// Highlight code syntax
|
|
hljs.highlightAll();
|
|
|
|
// Select JSON tab by default
|
|
document.querySelector('.tab-btn[data-tab="json"]').click();
|
|
|
|
document.getElementById("loading").classList.add("hidden");
|
|
|
|
document.getElementById("result").style.visibility = "visible";
|
|
document.getElementById("code_help").style.visibility = "visible";
|
|
|
|
// increment the total count
|
|
document.getElementById("total-count").textContent =
|
|
parseInt(document.getElementById("total-count").textContent) + 1;
|
|
})
|
|
.catch((error) => {
|
|
console.error(error);
|
|
document.getElementById("loading").classList.add("hidden");
|
|
});
|
|
});
|
|
|
|
// Handle tab clicks
|
|
document.querySelectorAll(".tab-btn").forEach((btn) => {
|
|
btn.addEventListener("click", () => {
|
|
const tab = btn.dataset.tab;
|
|
document.querySelectorAll(".tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
|
|
btn.classList.add("bg-lime-700", "text-white");
|
|
document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
|
|
document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
|
|
});
|
|
});
|
|
|
|
// Handle code tab clicks
|
|
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
|
|
btn.addEventListener("click", () => {
|
|
const tab = btn.dataset.tab;
|
|
document.querySelectorAll(".code-tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
|
|
btn.classList.add("bg-lime-700", "text-white");
|
|
document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
|
|
document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
|
|
});
|
|
});
|
|
|
|
// Handle copy to clipboard button clicks
|
|
|
|
async function copyToClipboard(text) {
|
|
if (navigator.clipboard && navigator.clipboard.writeText) {
|
|
return navigator.clipboard.writeText(text);
|
|
} else {
|
|
return fallbackCopyTextToClipboard(text);
|
|
}
|
|
}
|
|
|
|
function fallbackCopyTextToClipboard(text) {
|
|
return new Promise((resolve, reject) => {
|
|
const textArea = document.createElement("textarea");
|
|
textArea.value = text;
|
|
|
|
// Avoid scrolling to bottom
|
|
textArea.style.top = "0";
|
|
textArea.style.left = "0";
|
|
textArea.style.position = "fixed";
|
|
|
|
document.body.appendChild(textArea);
|
|
textArea.focus();
|
|
textArea.select();
|
|
|
|
try {
|
|
const successful = document.execCommand("copy");
|
|
if (successful) {
|
|
resolve();
|
|
} else {
|
|
reject();
|
|
}
|
|
} catch (err) {
|
|
reject(err);
|
|
}
|
|
|
|
document.body.removeChild(textArea);
|
|
});
|
|
}
|
|
|
|
document.querySelectorAll(".copy-btn").forEach((btn) => {
|
|
btn.addEventListener("click", () => {
|
|
const target = btn.dataset.target;
|
|
const code = document.getElementById(target).textContent;
|
|
//navigator.clipboard.writeText(code).then(() => {
|
|
copyToClipboard(code).then(() => {
|
|
btn.textContent = "Copied!";
|
|
setTimeout(() => {
|
|
btn.textContent = "Copy";
|
|
}, 2000);
|
|
});
|
|
});
|
|
});
|
|
|
|
document.addEventListener("DOMContentLoaded", async () => {
|
|
try {
|
|
const extractionResponse = await fetch("/strategies/extraction");
|
|
const extractionStrategies = await extractionResponse.json();
|
|
|
|
const chunkingResponse = await fetch("/strategies/chunking");
|
|
const chunkingStrategies = await chunkingResponse.json();
|
|
|
|
renderStrategies("extraction-strategies", extractionStrategies);
|
|
renderStrategies("chunking-strategies", chunkingStrategies);
|
|
} catch (error) {
|
|
console.error("Error fetching strategies:", error);
|
|
}
|
|
});
|
|
|
|
function renderStrategies(containerId, strategies) {
|
|
const container = document.getElementById(containerId);
|
|
container.innerHTML = ""; // Clear any existing content
|
|
strategies = JSON.parse(strategies);
|
|
Object.entries(strategies).forEach(([strategy, description]) => {
|
|
const strategyElement = document.createElement("div");
|
|
strategyElement.classList.add("bg-zinc-800", "p-4", "rounded", "shadow-md", "docs-item");
|
|
|
|
const strategyDescription = document.createElement("div");
|
|
strategyDescription.classList.add("text-gray-300", "prose", "prose-sm");
|
|
strategyDescription.innerHTML = marked.parse(description);
|
|
|
|
strategyElement.appendChild(strategyDescription);
|
|
|
|
container.appendChild(strategyElement);
|
|
});
|
|
}
|
|
document.querySelectorAll(".sidebar a").forEach((link) => {
|
|
link.addEventListener("click", function (event) {
|
|
event.preventDefault();
|
|
document.querySelectorAll(".content-section").forEach((section) => {
|
|
section.classList.remove("active");
|
|
});
|
|
const target = event.target.getAttribute("data-target");
|
|
document.getElementById(target).classList.add("active");
|
|
});
|
|
});
|
|
// Highlight code syntax
|
|
hljs.highlightAll();
|