<!DOCTYPE html>
|
|
<html lang="en">
|
|
<head>
|
|
<meta charset="UTF-8" />
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
<title>Crawl4AI</title>
|
|
|
|
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
|
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
|
|
|
|
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet" />
|
|
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
|
|
<link
|
|
rel="stylesheet"
|
|
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/vs2015.min.css"
|
|
/>
|
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
|
|
<style>
|
|
:root {
|
|
--ifm-font-size-base: 100%;
|
|
--ifm-line-height-base: 1.65;
|
|
--ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans,
|
|
sans-serif, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
|
|
"Segoe UI Emoji", "Segoe UI Symbol";
|
|
}
|
|
html {
|
|
-webkit-font-smoothing: antialiased;
|
|
-webkit-text-size-adjust: 100%;
|
|
text-size-adjust: 100%;
|
|
font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
|
|
}
|
|
body {
|
|
background-color: #1a202c;
|
|
color: #fff;
|
|
}
|
|
.tab-content {
|
|
max-height: 400px;
|
|
overflow: auto;
|
|
}
|
|
pre {
|
|
white-space: pre-wrap;
|
|
font-size: 14px;
|
|
}
|
|
pre code {
|
|
width: 100%;
|
|
}
|
|
</style>
|
|
</head>
|
|
<body>
|
|
<header class="bg-gray-900 text-white py-4">
|
|
<div class="container mx-auto px-4">
|
|
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Scraper</h1>
|
|
</div>
|
|
</header>
|
|
|
|
<section class="try-it py-8 pb-20">
|
|
<div class="container mx-auto px-4">
|
|
<h2 class="text-2xl font-bold mb-4">Try It Now</h2>
|
|
<!--
  Crawl request controls. The ids below are read by the inline script, so they
  must stay unchanged. Labels are visually hidden (Tailwind "sr-only") so the
  single-row layout is preserved while every control still has an accessible name.
-->
<div class="mb-4 flex w-full gap-2">
  <label for="url-input" class="sr-only">URL(s) to crawl, separated by commas</label>
  <input
    type="text"
    id="url-input"
    value="https://kidocode.com"
    class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white"
    placeholder="Enter URL(s) separated by commas"
  />
  <label for="provider-model-select" class="sr-only">Provider and model</label>
  <select
    id="provider-model-select"
    class="border border-gray-600 rounded px-4 py-2 bg-gray-800 text-white"
  >
    <!-- Option values use the "provider/model" form and double as the local-storage key for that provider's token -->
    <option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
    <option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
    <option value="openai/gpt-4-turbo">gpt-4-turbo</option>
    <option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
    <option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
    <option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
    <option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
  </select>
  <label for="token-input" class="sr-only">API token for the selected provider</label>
  <!-- The placeholder is provider-neutral: the select above also offers OpenAI and Anthropic models -->
  <input
    type="password"
    id="token-input"
    class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white"
    placeholder="Enter API token for the selected provider"
    autocomplete="off"
  />
  <div class="flex items-center justify-center">
    <input type="checkbox" id="extract-blocks-checkbox" class="mr-2" checked>
    <label for="extract-blocks-checkbox" class="text-white">Extract Blocks</label>
  </div>
  <!-- type="button": this is an in-page action handled by script, not a form submit -->
  <button type="button" id="crawl-btn" class="bg-blue-600 text-white px-4 py-2 rounded">Crawl</button>
</div>
|
|
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
|
|
<div id="loading" class="hidden mt-4">
|
|
<p>Loading...</p>
|
|
</div>
|
|
<div id="result" class="tab-container flex-1 h-full flex-col">
|
|
<div class="tab-buttons flex gap-2">
|
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="json">JSON</button>
|
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="cleaned-html">
|
|
Cleaned HTML
|
|
</button>
|
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="markdown">
|
|
Markdown
|
|
</button>
|
|
</div>
|
|
<div class="tab-content code bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
|
<pre class="h-full flex"><code id="json-result" class="language-json "></code></pre>
|
|
<pre
|
|
class="hidden h-full flex"
|
|
><code id="cleaned-html-result" class="language-html "></code></pre>
|
|
<pre
|
|
class="hidden h-full flex"
|
|
><code id="markdown-result" class="language-markdown "></code></pre>
|
|
</div>
|
|
</div>
|
|
<div id="code_help" class="tab-container flex-1 h-full">
|
|
<div class="tab-buttons flex gap-2">
|
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="curl">cURL</button>
|
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="python">
|
|
Python
|
|
</button>
|
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="nodejs">
|
|
Node.js
|
|
</button>
|
|
</div>
|
|
<div class="tab-content result bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
|
<pre class="h-full flex relative">
|
|
<code id="curl-code" class="language-bash"></code>
|
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
|
</pre>
|
|
<pre class="hidden h-full flex relative">
|
|
<code id="python-code" class="language-python"></code>
|
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
|
</pre>
|
|
<pre class="hidden h-full flex relative">
|
|
<code id="nodejs-code" class="language-javascript"></code>
|
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
|
</pre>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</section>
|
|
|
|
<section class="hero bg-gray-900 py-8">
|
|
<div class="container mx-auto px-4">
|
|
<h2 class="text-3xl font-bold mb-4">🤔 Why are we building this?</h2>
|
|
<p class="text-lg mb-4">
|
|
In recent times, we've seen numerous startups emerging, riding the AI hype wave and charging for
services that should rightfully be accessible to everyone. 🌍💸 One example is scraping and crawling
a web page and transforming it into a form suitable for LLMs. We don't think one should build a business
out of this; it should definitely be open source. So if you possess the skills to build such things
and you share this philosophy, you should join our "Robinhood" band and help set
these products free. 🆓🤝
|
|
</p>
|
|
</div>
|
|
</section>
|
|
|
|
<section class="installation py-8">
|
|
<div class="container mx-auto px-4">
|
|
<h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
|
|
<p class="mb-4">
|
|
To install and run Crawl4AI locally or on your own service, the best way is to use Docker. 🐳 Follow
|
|
these steps:
|
|
</p>
|
|
<ol class="list-decimal list-inside mb-4">
|
|
<li>
|
|
Clone the GitHub repository: 📥
|
|
<code>git clone https://github.com/unclecode/crawl4ai.git</code>
|
|
</li>
|
|
<li>Navigate to the project directory: 📂 <code>cd crawl4ai</code></li>
|
|
<li>
|
|
Build the Docker image: 🛠️ <code>docker build -t crawl4ai .</code> On Mac, follow: 🍎
|
|
<code>docker build --platform linux/amd64 -t crawl4ai .</code>
|
|
</li>
|
|
<li>Run the Docker container: ▶️ <code>docker run -p 8000:80 crawl4ai</code></li>
|
|
</ol>
|
|
<p>
|
|
For more detailed instructions and advanced configuration options, please refer to the 📚
|
|
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
|
</p>
|
|
</div>
|
|
</section>
|
|
|
|
<footer class="bg-gray-900 text-white py-4">
|
|
<div class="container mx-auto px-4">
|
|
<div class="flex justify-between items-center">
|
|
<p>© 2024 Crawl4AI. All rights reserved.</p>
|
|
<!--
  External project links. Each opens in a new tab, so rel="noopener noreferrer"
  is set explicitly to keep the opened page from reaching back via window.opener.
-->
<div class="social-links">
  <a
    href="https://github.com/unclecode/crawl4ai"
    class="text-white hover:text-gray-300 mx-2"
    target="_blank"
    rel="noopener noreferrer"
    >😺 GitHub</a
  >
  <a
    href="https://twitter.com/unclecode"
    class="text-white hover:text-gray-300 mx-2"
    target="_blank"
    rel="noopener noreferrer"
    >🐦 Twitter</a
  >
  <!-- TODO(review): "your-invite-link" is a template placeholder; replace it with the project's real Discord invite -->
  <a
    href="https://discord.gg/your-invite-link"
    class="text-white hover:text-gray-300 mx-2"
    target="_blank"
    rel="noopener noreferrer"
    >💬 Discord</a
  >
</div>
|
|
</div>
|
|
</div>
|
|
</footer>
|
|
|
|
<script>
|
|
// Restore the last-used provider/model and the API token saved for it.
// The crawl handler stores the chosen option value under "provider_model" and
// the token under a key equal to that option value.
const storedProviderModel = localStorage.getItem("provider_model");

if (storedProviderModel) {
  const providerSelectEl = document.getElementById("provider-model-select");
  providerSelectEl.value = storedProviderModel;
  // A saved value that no longer matches any <option> leaves the select blank;
  // fall back to the first option in that case.
  if (providerSelectEl.selectedIndex === -1) {
    providerSelectEl.selectedIndex = 0;
  }
}

// Look the token up by the option that is actually selected. This avoids
// calling getItem(null) (which would query the literal key "null") when nothing
// has been saved yet, and keeps the token in sync with the fallback above.
const storedToken = localStorage.getItem(document.getElementById("provider-model-select").value);

if (storedToken) {
  document.getElementById("token-input").value = storedToken;
}
|
|
|
|
// When a different provider/model is picked, show the token previously saved
// under that option value, or clear the field when none is saved.
document.getElementById("provider-model-select").addEventListener("change", (event) => {
  const savedToken = localStorage.getItem(event.currentTarget.value);
  document.getElementById("token-input").value = savedToken ? savedToken : "";
});
|
|
|
|
// Display the value returned by GET /total-count (its "count" field) in the
// #total-count element. This page currently has no such element, so the lookup
// is guarded: without the guard the success callback threw a TypeError on every
// load, which then surfaced in the console via the catch handler.
{
  const totalCountEl = document.getElementById("total-count");
  if (totalCountEl) {
    axios
      .get("/total-count")
      .then((response) => {
        totalCountEl.textContent = response.data.count;
      })
      .catch((error) => console.error(error));
  }
}
|
|
|
|
// ---------------------------------------------------------------------------
// Crawl workflow
//
// One click handler validates the form, saves the settings, renders the code
// examples and submits POST /crawl. The response is handled in either shape:
//   - { task_id }  -> the task is polled via GET /task/<id> until it finishes
//   - { results }  -> the first result is rendered immediately
// Previously two separate click listeners were registered on the same button
// (one per response shape), so every click sent the request twice, and the
// polling path called a displayResults() function that did not exist.
// ---------------------------------------------------------------------------

/**
 * Switch the UI between "request in flight" and "idle".
 * While loading, the spinner is shown, both result panels are hidden and the
 * Crawl button is disabled so a second click cannot start a parallel request.
 * Leaving the loading state only hides the spinner; the panels are revealed
 * by displayResults() on success and stay hidden on failure.
 */
function setLoading(isLoading) {
  const loadingEl = document.getElementById("loading");
  if (isLoading) {
    loadingEl.classList.remove("hidden");
    document.getElementById("result").classList.add("hidden");
    document.getElementById("code_help").classList.add("hidden");
  } else {
    loadingEl.classList.add("hidden");
  }
  document.getElementById("crawl-btn").disabled = isLoading;
}

/**
 * Read the form and build the JSON body for POST /crawl.
 * URLs are split on commas and trimmed; empty entries (e.g. from a trailing
 * comma) are dropped so they are never sent to the server.
 */
function buildCrawlRequest() {
  const urls = document
    .getElementById("url-input")
    .value.split(",")
    .map((url) => url.trim())
    .filter((url) => url.length > 0);

  return {
    urls: urls,
    provider_model: document.getElementById("provider-model-select").value,
    api_token: document.getElementById("token-input").value,
    include_raw_html: true,
    forced: false,
    extract_blocks: document.getElementById("extract-blocks-checkbox").checked,
  };
}

/**
 * Fill the cURL / Python / Node.js example panes for the given request body.
 * The real API token is always replaced by a placeholder in the examples.
 */
function renderCodeExamples(data) {
  const example = { ...data, api_token: "your_api_token" };
  const prettyJson = JSON.stringify(example, null, 2);

  // JSON literals true/false/null are not valid Python, so translate them.
  // In pretty-printed JSON a scalar property value ends its line, while string
  // values always end with a quote, so this pattern cannot match inside a string.
  const pythonNames = { true: "True", false: "False", null: "None" };
  const pythonLiteral = prettyJson.replace(
    /: (true|false|null)(,?)$/gm,
    (match, word, comma) => `: ${pythonNames[word]}${comma}`
  );

  document.getElementById(
    "curl-code"
  ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify(
    example
  )}' http://localhost:8000/crawl`;

  document.getElementById(
    "python-code"
  ).textContent = `import requests\n\ndata = ${pythonLiteral}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data)\nprint(response.json())`;

  document.getElementById(
    "nodejs-code"
  ).textContent = `const axios = require('axios');\n\nconst data = ${prettyJson};\n\naxios.post("http://localhost:8000/crawl", data)\n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
}

/**
 * Pretty-print the parsed_json field of a crawl result.
 * Accepts a JSON string or an already-parsed value. Missing/empty input yields
 * an empty string, and text that is not valid JSON is shown as-is instead of
 * throwing and aborting the whole render.
 */
function formatParsedJson(raw) {
  if (raw === null || raw === undefined || raw === "") {
    return "";
  }
  try {
    const value = typeof raw === "string" ? JSON.parse(raw) : raw;
    return JSON.stringify(value, null, 2);
  } catch (error) {
    console.error("parsed_json is not valid JSON:", error);
    return String(raw);
  }
}

/**
 * Render one crawl result into the JSON / Cleaned HTML / Markdown panes,
 * re-run syntax highlighting, select the JSON tab and reveal both panels.
 * Called for a direct /crawl result and for a finished polled task.
 */
function displayResults(result) {
  if (!result) {
    console.error("Crawl finished without any result.");
    setLoading(false);
    return;
  }

  document.getElementById("json-result").textContent = formatParsedJson(result.parsed_json);
  document.getElementById("cleaned-html-result").textContent = result.cleaned_html || "";
  document.getElementById("markdown-result").textContent = result.markdown || "";

  // Highlight code syntax
  hljs.highlightAll();

  // Select JSON tab by default
  document.querySelector('.tab-btn[data-tab="json"]').click();

  setLoading(false);
  document.getElementById("result").classList.remove("hidden");
  document.getElementById("code_help").classList.remove("hidden");
}

/**
 * Poll GET /task/<taskId> until the task leaves the "pending" state.
 * "done" renders the first result; any other status is logged as a failure.
 */
function pollTaskStatus(taskId) {
  axios
    .get(`/task/${taskId}`)
    .then((response) => {
      const task = response.data;
      if (task.status === "done") {
        displayResults(Array.isArray(task.results) ? task.results[0] : undefined);
      } else if (task.status === "pending") {
        setTimeout(() => pollTaskStatus(taskId), 2000); // Poll every 2 seconds
      } else {
        console.error("Task failed:", task.error);
        setLoading(false);
      }
    })
    .catch((error) => {
      console.error("Error polling task status:", error);
      setLoading(false);
    });
}

// Handle crawl button click
document.getElementById("crawl-btn").addEventListener("click", () => {
  const data = buildCrawlRequest();

  // validate input to have both URL and API token
  if (data.urls.length === 0 || !data.api_token) {
    alert("Please enter both URL(s) and API token.");
    return;
  }

  // Save the selected provider model and its token to local storage.
  // NOTE(review): tokens are kept in clear text in localStorage; confirm this
  // is acceptable for the deployments this page is served from.
  localStorage.setItem("provider_model", data.provider_model);
  localStorage.setItem(data.provider_model, data.api_token);
  // "api_token" is not read anywhere in this file; it is still written because
  // both original handlers wrote it and another page may depend on it.
  localStorage.setItem("api_token", data.api_token);

  // The examples depend only on the request, so they can be filled in now;
  // their panel stays hidden until results are displayed.
  renderCodeExamples(data);
  setLoading(true);

  axios
    .post("/crawl", data)
    .then((response) => {
      const body = response.data || {};
      if (body.task_id) {
        pollTaskStatus(body.task_id);
      } else if (Array.isArray(body.results)) {
        displayResults(body.results[0]);
      } else {
        console.error("Unexpected /crawl response:", body);
        setLoading(false);
      }
    })
    .catch((error) => {
      console.error("Error during fetch:", error);
      setLoading(false);
    });
});

// Handle tab clicks (JSON / Cleaned HTML / Markdown): highlight the clicked
// button and show only the <pre> that wraps the matching "<tab>-result" element.
document.querySelectorAll(".tab-btn").forEach((btn) => {
  btn.addEventListener("click", () => {
    const tab = btn.dataset.tab;
    document
      .querySelectorAll(".tab-btn")
      .forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
    btn.classList.add("bg-blue-600", "text-white");
    document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
    document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
  });
});

// Handle code tab clicks (cURL / Python / Node.js): same scheme as above for
// the "<tab>-code" elements in the examples panel.
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
  btn.addEventListener("click", () => {
    const tab = btn.dataset.tab;
    document
      .querySelectorAll(".code-tab-btn")
      .forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
    btn.classList.add("bg-blue-600", "text-white");
    document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
    document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
  });
});

// Handle copy to clipboard button clicks: copy the text of the element named
// by data-target and briefly change the button label as feedback.
document.querySelectorAll(".copy-btn").forEach((btn) => {
  btn.addEventListener("click", () => {
    const target = btn.dataset.target;
    const code = document.getElementById(target).textContent;

    // navigator.clipboard is only available in secure contexts (HTTPS or
    // localhost); report that instead of throwing when it is missing.
    if (!navigator.clipboard) {
      console.error("Clipboard API is not available in this context.");
      return;
    }

    navigator.clipboard
      .writeText(code)
      .then(() => {
        btn.textContent = "Copied!";
        setTimeout(() => {
          btn.textContent = "Copy";
        }, 2000);
      })
      .catch((error) => console.error("Failed to copy to clipboard:", error));
  });
});
|
|
</script>
|
|
</body>
|
|
</html>
|