Files
crawl4ai/pages/index_pooling.html
2024-05-09 19:10:25 +08:00

426 lines
22 KiB
HTML

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Crawl4AI</title>
<link rel="preconnect" href="https://fonts.googleapis.com" />
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet" />
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
<link
rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/vs2015.min.css"
/>
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
<style>
:root {
--ifm-font-size-base: 100%;
--ifm-line-height-base: 1.65;
--ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans,
sans-serif, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
"Segoe UI Emoji", "Segoe UI Symbol";
}
html {
-webkit-font-smoothing: antialiased;
-webkit-text-size-adjust: 100%;
text-size-adjust: 100%;
font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
}
body {
background-color: #1a202c;
color: #fff;
}
.tab-content {
max-height: 400px;
overflow: auto;
}
pre {
white-space: pre-wrap;
font-size: 14px;
}
pre code {
width: 100%;
}
</style>
</head>
<body>
<header class="bg-gray-900 text-white py-4">
<div class="container mx-auto px-4">
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Scrapper</h1>
</div>
</header>
<section class="try-it py-8 pb-20">
<div class="container mx-auto px-4">
<h2 class="text-2xl font-bold mb-4">Try It Now</h2>
<div class="mb-4 flex w-full gap-2">
<input
type="text"
id="url-input"
value="https://kidocode.com"
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white"
placeholder="Enter URL(s) separated by commas"
/>
<select
id="provider-model-select"
class="border border-gray-600 rounded px-4 py-2 bg-gray-800 text-white"
>
<!-- Add your option values here -->
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
</select>
<input
type="password"
id="token-input"
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white"
placeholder="Enter Groq API token"
/>
<div class="flex items-center justify-center">
<input type="checkbox" id="extract-blocks-checkbox" class="mr-2" checked>
<label for="extract-blocks-checkbox" class="text-white">Extract Blocks</label>
</div>
<button id="crawl-btn" class="bg-blue-600 text-white px-4 py-2 rounded">Crawl</button>
</div>
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
<div id="loading" class="hidden mt-4">
<p>Loading...</p>
</div>
<div id="result" class="tab-container flex-1 h-full flex-col">
<div class="tab-buttons flex gap-2">
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="json">JSON</button>
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="cleaned-html">
Cleaned HTML
</button>
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="markdown">
Markdown
</button>
</div>
<div class="tab-content code bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
<pre class="h-full flex"><code id="json-result" class="language-json "></code></pre>
<pre
class="hidden h-full flex"
><code id="cleaned-html-result" class="language-html "></code></pre>
<pre
class="hidden h-full flex"
><code id="markdown-result" class="language-markdown "></code></pre>
</div>
</div>
<div id="code_help" class="tab-container flex-1 h-full">
<div class="tab-buttons flex gap-2">
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="curl">cURL</button>
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="python">
Python
</button>
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="nodejs">
Node.js
</button>
</div>
<div class="tab-content result bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
<pre class="h-full flex relative">
<code id="curl-code" class="language-bash"></code>
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
</pre>
<pre class="hidden h-full flex relative">
<code id="python-code" class="language-python"></code>
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
</pre>
<pre class="hidden h-full flex relative">
<code id="nodejs-code" class="language-javascript"></code>
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
</pre>
</div>
</div>
</div>
</div>
</section>
<section class="hero bg-gray-900 py-8">
<div class="container mx-auto px-4">
<h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
<p class="text-lg mb-4">
In recent times, we've seen numerous startups emerging, riding the AI hype wave and charging for
services that should rightfully be accessible to everyone. 🌍💸 One for example is to scrap and crawl
a web page, and transform it o a form suitable for LLM. We don't think one should build a business
out of this, but definilty should be opened source. So if you possess the skills to build such things
and you have such philosphy you should join our "Robinhood" band and help set
these products free. 🆓🤝
</p>
</div>
</section>
<section class="installation py-8">
<div class="container mx-auto px-4">
<h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
<p class="mb-4">
To install and run Crawl4AI locally or on your own service, the best way is to use Docker. 🐳 Follow
these steps:
</p>
<ol class="list-decimal list-inside mb-4">
<li>
Clone the GitHub repository: 📥
<code>git clone https://github.com/unclecode/crawl4ai.git</code>
</li>
<li>Navigate to the project directory: 📂 <code>cd crawl4ai</code></li>
<li>
Build the Docker image: 🛠️ <code>docker build -t crawl4ai .</code> On Mac, follow: 🍎
<code>docker build --platform linux/amd64 -t crawl4ai .</code>
</li>
<li>Run the Docker container: ▶️ <code>docker run -p 8000:80 crawl4ai</code></li>
</ol>
<p>
For more detailed instructions and advanced configuration options, please refer to the 📚
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
</p>
</div>
</section>
<footer class="bg-gray-900 text-white py-4">
<div class="container mx-auto px-4">
<div class="flex justify-between items-center">
<p>© 2024 Crawl4AI. All rights reserved.</p>
<div class="social-links">
<a
href="https://github.com/unclecode/crawl4ai"
class="text-white hover:text-gray-300 mx-2"
target="_blank"
>😺 GitHub</a
>
<a
href="https://twitter.com/unclecode"
class="text-white hover:text-gray-300 mx-2"
target="_blank"
>🐦 Twitter</a
>
<a
href="https://discord.gg/your-invite-link"
class="text-white hover:text-gray-300 mx-2"
target="_blank"
>💬 Discord</a
>
</div>
</div>
</div>
</footer>
<script>
// Get the selected provider model and token from local storage
const storedProviderModel = localStorage.getItem("provider_model");
const storedToken = localStorage.getItem(storedProviderModel);
if (storedProviderModel) {
document.getElementById("provider-model-select").value = storedProviderModel;
}
if (storedToken) {
document.getElementById("token-input").value = storedToken;
}
// Handle provider model dropdown change
document.getElementById("provider-model-select").addEventListener("change", () => {
const selectedProviderModel = document.getElementById("provider-model-select").value;
const storedToken = localStorage.getItem(selectedProviderModel);
if (storedToken) {
document.getElementById("token-input").value = storedToken;
} else {
document.getElementById("token-input").value = "";
}
});
// Fetch total count from the database
axios
.get("/total-count")
.then((response) => {
document.getElementById("total-count").textContent = response.data.count;
})
.catch((error) => console.error(error));
// Handle crawl button click
document.getElementById("crawl-btn").addEventListener("click", () => {
// validate input to have both URL and API token
if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
alert("Please enter both URL(s) and API token.");
return;
}
const selectedProviderModel = document.getElementById("provider-model-select").value;
const apiToken = document.getElementById("token-input").value;
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
// Save the selected provider model and token to local storage
localStorage.setItem("provider_model", selectedProviderModel);
localStorage.setItem(selectedProviderModel, apiToken);
const urlsInput = document.getElementById("url-input").value;
const urls = urlsInput.split(",").map((url) => url.trim());
const data = {
urls: urls,
provider_model: selectedProviderModel,
api_token: apiToken,
include_raw_html: true,
forced: false,
extract_blocks: extractBlocks,
};
// save api token to local storage
localStorage.setItem("api_token", document.getElementById("token-input").value);
document.getElementById("loading").classList.remove("hidden");
document.getElementById("result").classList.add("hidden");
document.getElementById("code_help").classList.add("hidden");
axios
.post("/crawl", data)
.then((response) => {
const result = response.data.results[0];
const parsedJson = JSON.parse(result.parsed_json);
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
document.getElementById("markdown-result").textContent = result.markdown;
// Update code examples dynamically
// Update code examples dynamically
document.getElementById(
"curl-code"
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
...data,
api_token: "your_api_token",
})}' http://localhost:8000/crawl`;
document.getElementById(
"python-code"
).textContent = `import requests\n\ndata = ${JSON.stringify(
{ ...data, api_token: "your_api_token" },
null,
2
)}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data)\nprint(response.json())`;
document.getElementById(
"nodejs-code"
).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
{ ...data, api_token: "your_api_token" },
null,
2
)};\n\naxios.post("http://localhost:8000/crawl", data)\n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
// Highlight code syntax
hljs.highlightAll();
// Select JSON tab by default
document.querySelector('.tab-btn[data-tab="json"]').click();
document.getElementById("loading").classList.add("hidden");
document.getElementById("result").classList.remove("hidden");
document.getElementById("code_help").classList.remove("hidden");
})
.catch((error) => {
console.error(error);
document.getElementById("loading").classList.add("hidden");
});
});
// Handle tab clicks
document.querySelectorAll(".tab-btn").forEach((btn) => {
btn.addEventListener("click", () => {
const tab = btn.dataset.tab;
document
.querySelectorAll(".tab-btn")
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
btn.classList.add("bg-blue-600", "text-white");
document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
});
});
// Handle code tab clicks
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
btn.addEventListener("click", () => {
const tab = btn.dataset.tab;
document
.querySelectorAll(".code-tab-btn")
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
btn.classList.add("bg-blue-600", "text-white");
document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
});
});
// Handle copy to clipboard button clicks
document.querySelectorAll(".copy-btn").forEach((btn) => {
btn.addEventListener("click", () => {
const target = btn.dataset.target;
const code = document.getElementById(target).textContent;
navigator.clipboard.writeText(code).then(() => {
btn.textContent = "Copied!";
setTimeout(() => {
btn.textContent = "Copy";
}, 2000);
});
});
});
document.getElementById("crawl-btn").addEventListener("click", () => {
const urlsInput = document.getElementById("url-input").value;
const urls = urlsInput.split(",").map(url => url.trim());
const apiToken = document.getElementById("token-input").value;
const selectedProviderModel = document.getElementById("provider-model-select").value;
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
const data = {
urls: urls,
provider_model: selectedProviderModel,
api_token: apiToken,
include_raw_html: true,
forced: false,
extract_blocks: extractBlocks
};
localStorage.setItem("api_token", apiToken);
document.getElementById("loading").classList.remove("hidden");
document.getElementById("result").classList.add("hidden");
document.getElementById("code_help").classList.add("hidden");
axios.post("/crawl", data)
.then(response => {
const taskId = response.data.task_id;
pollTaskStatus(taskId);
})
.catch(error => {
console.error('Error during fetch:', error);
document.getElementById("loading").classList.add("hidden");
});
});
function pollTaskStatus(taskId) {
axios.get(`/task/${taskId}`)
.then(response => {
const task = response.data;
if (task.status === 'done') {
displayResults(task.results[0]);
} else if (task.status === 'pending') {
setTimeout(() => pollTaskStatus(taskId), 2000); // Poll every 2 seconds
} else {
console.error('Task failed:', task.error);
document.getElementById("loading").classList.add("hidden");
}
})
.catch(error => {
console.error('Error polling task status:', error);
document.getElementById("loading").classList.add("hidden");
});
}
</script>
</body>
</html>