- Add demo page to the new mkdocs
- Set website home page to mkdocs
This commit is contained in:
unclecode
2024-06-22 20:36:01 +08:00
parent 2217904876
commit d6182bedd7
8 changed files with 332 additions and 18 deletions

View File

@@ -631,4 +631,9 @@ def wrap_text(draw, text, font, max_width):
while words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width:
line += (words.pop(0) + ' ')
lines.append(line)
return '\n'.join(lines)
return '\n'.join(lines)
def format_html(html_string):
    """Return a pretty-printed rendering of ``html_string``.

    The markup is parsed by BeautifulSoup using the ``html.parser`` backend
    and the parsed tree is serialized back with ``prettify()``.
    """
    return BeautifulSoup(html_string, 'html.parser').prettify()

View File

@@ -140,24 +140,28 @@ class WebCrawler:
# Check cache first
cached = None
screenshot_data = None
extracted_content = None
if not bypass_cache and not self.always_by_pass_cache:
cached = get_cached_url(url)
if cached:
html = cached[1]
extracted_content = cached[2]
extracted_content = cached[4]
if screenshot:
screenshot = cached[9]
screenshot_data = cached[9]
if not screenshot_data:
cached = None
else:
if not cached or not html:
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
html = self.crawler_strategy.crawl(url)
if screenshot:
screenshot = self.crawler_strategy.take_screenshot()
screenshot_data = self.crawler_strategy.take_screenshot()
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs)
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
def process_html(
self,
@@ -197,7 +201,7 @@ class WebCrawler:
sections = chunking_strategy.chunk(markdown)
extracted_content = extraction_strategy.run(url, sections)
extracted_content = json.dumps(extracted_content)
extracted_content = json.dumps(extracted_content, indent=4, default=str)
if verbose:
print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")
@@ -217,11 +221,11 @@ class WebCrawler:
json.dumps(metadata),
screenshot=screenshot,
)
return CrawlResult(
url=url,
html=html,
cleaned_html=cleaned_html,
cleaned_html=format_html(cleaned_html),
markdown=markdown,
media=media,
links=links,

View File

@@ -15,7 +15,6 @@
--mono-font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono,
Courier New, monospace, serif;
--background-color: #151515; /* Dark background */
--font-color: #eaeaea; /* Light font color for contrast */
--invert-font-color: #151515; /* Dark color for inverted elements */
@@ -73,11 +72,78 @@ pre, code {
border-bottom: 1px dashed var(--secondary-color);
} */
.terminal-mkdocs-main-content{
.terminal-mkdocs-main-content {
line-height: var(--global-line-height);
}
strong, .highlight {
strong,
.highlight {
/* background: url(//s2.svgbox.net/pen-brushes.svg?ic=brush-1&color=50ffff); */
background-color: #50ffff33;
}
/* ---- Cards and small buttons ---- */
.terminal-card > header {
    color: var(--font-color);
    text-align: center;
    background-color: var(--progress-bar-background);
    padding: 0.3em 0.5em;
}
.btn.btn-sm {
    color: var(--font-color);
    padding: 0.2em 0.5em;
    font-size: 0.8em;
}
/* ---- Demo page: both sections start hidden and are toggled from script ---- */
.loading-message {
    display: none;
    margin-top: 20px;
}
.response-section {
    display: none;
    padding-top: 20px;
}
/* ---- Demo page: tab strip ---- */
.tabs {
    display: flex;
    flex-direction: column;
}
.tab-list {
    display: flex;
    padding: 0;
    margin: 0;
    list-style-type: none;
    border-bottom: 1px solid var(--font-color);
}
.tab-item {
    cursor: pointer;
    padding: 10px;
    border: 1px solid var(--font-color);
    /* Overlap neighbouring borders so adjacent tabs share a single 1px line. */
    margin-right: -1px;
    border-bottom: none;
}
.tab-item:hover,
.tab-item:focus,
.tab-item:active {
    background-color: var(--progress-bar-background);
}
/* ---- Demo page: tab panels (only the first one is visible by default) ---- */
.tab-content {
    display: none;
    border: 1px solid var(--font-color);
    border-top: none;
}
.tab-content:first-of-type {
    display: block;
}
.tab-content header {
    padding: 0.5em;
    display: flex;
    /* flex-end rather than end: same placement in a row container, wider browser support. */
    justify-content: flex-end;
    align-items: center;
    background-color: var(--progress-bar-background);
}
.tab-content pre {
    margin: 0;
    max-height: 300px;
    overflow: auto;
    border: none;
}

198
docs/md/demo.md Normal file
View File

@@ -0,0 +1,198 @@
# Interactive Demo for Crawler
<div id="demo">
<form id="crawlForm" class="terminal-form">
<fieldset>
<legend>Enter URL and Options</legend>
<div class="form-group">
<label for="url">Enter URL:</label>
<input type="text" id="url" name="url" required>
</div>
<div class="form-group">
<label for="screenshot">Get Screenshot:</label>
<input type="checkbox" id="screenshot" name="screenshot">
</div>
<div class="form-group">
<button class="btn btn-default" type="submit">Submit</button>
</div>
</fieldset>
</form>
<div id="loading" class="loading-message">
<div class="terminal-alert terminal-alert-primary">Loading... Please wait.</div>
</div>
<section id="response" class="response-section">
<h2>Response</h2>
<div class="tabs">
<ul class="tab-list">
<li class="tab-item" onclick="showTab('markdown')">Markdown</li>
<li class="tab-item" onclick="showTab('cleanedHtml')">Cleaned HTML</li>
<li class="tab-item" onclick="showTab('media')">Media</li>
<li class="tab-item" onclick="showTab('extractedContent')">Extracted Content</li>
<li class="tab-item" onclick="showTab('screenshot')">Screenshot</li>
<li class="tab-item" onclick="showTab('pythonCode')">Python Code</li>
</ul>
<div class="tab-content" id="tab-markdown">
<header>
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('markdownContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('markdownContent', 'markdown.md')">Download</button>
</div>
</header>
<pre><code id="markdownContent" class="language-markdown hljs"></code></pre>
</div>
<div class="tab-content" id="tab-cleanedHtml" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('cleanedHtmlContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('cleanedHtmlContent', 'cleaned.html')">Download</button>
</div>
</header>
<pre><code id="cleanedHtmlContent" class="language-html hljs"></code></pre>
</div>
<div class="tab-content" id="tab-media" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('mediaContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('mediaContent', 'media.json')">Download</button>
</div>
</header>
<pre><code id="mediaContent" class="language-json hljs"></code></pre>
</div>
<div class="tab-content" id="tab-extractedContent" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('extractedContentContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('extractedContentContent', 'extracted_content.json')">Download</button>
</div>
</header>
<pre><code id="extractedContentContent" class="language-json hljs"></code></pre>
</div>
<div class="tab-content" id="tab-screenshot" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadImage('screenshotContent', 'screenshot.png')">Download</button>
</div>
</header>
<pre><img id="screenshotContent" /></pre>
</div>
<div class="tab-content" id="tab-pythonCode" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('pythonCode')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('pythonCode', 'example.py')">Download</button>
</div>
</header>
<pre><code id="pythonCode" class="language-python hljs"></code></pre>
</div>
</div>
</section>
<script>
// Show the panel whose id is `tab-<tabId>` and hide every other `.tab-content` panel.
function showTab(tabId) {
    for (const panel of document.querySelectorAll('.tab-content')) {
        panel.style.display = 'none';
    }
    const selected = document.getElementById('tab-' + tabId);
    selected.style.display = 'block';
}
// Replace the text of a <code> element and run highlight.js on it again.
//   codeBlock - the <code> element to update
//   codeText  - the new text to display
function redo(codeBlock, codeText) {
    // Clear the markers left by a previous highlight pass so the element is treated as fresh.
    codeBlock.classList.remove('hljs');
    codeBlock.removeAttribute('data-highlighted');
    // Set new code and re-highlight
    codeBlock.textContent = codeText;
    // highlightBlock() is deprecated since highlight.js 10.7; prefer highlightElement()
    // and fall back only when the bundled library predates it.
    if (typeof hljs.highlightElement === 'function') {
        hljs.highlightElement(codeBlock);
    } else {
        hljs.highlightBlock(codeBlock);
    }
}
// Copy the text content of the element with the given id to the system clipboard.
// The user is told via alert() whether the copy succeeded or failed.
function copyToClipboard(elementId) {
    const content = document.getElementById(elementId).textContent;
    // navigator.clipboard is undefined outside secure contexts (e.g. plain http).
    if (!navigator.clipboard) {
        alert('Clipboard access is not available in this browser.');
        return;
    }
    navigator.clipboard.writeText(content).then(() => {
        alert('Copied to clipboard');
    }).catch(error => {
        // The write can be rejected, e.g. when permission is denied.
        alert('Failed to copy: ' + error);
    });
}
// Offer the text content of the element with the given id as a file download.
//   elementId - id of the element whose textContent is saved
//   filename  - suggested name for the downloaded file
function downloadContent(elementId, filename) {
    const text = document.getElementById(elementId).textContent;
    const objectUrl = window.URL.createObjectURL(new Blob([text], { type: 'text/plain' }));
    // A hidden, temporary anchor is clicked programmatically to start the download.
    const link = document.createElement('a');
    link.style.display = 'none';
    Object.assign(link, { href: objectUrl, download: filename });
    document.body.appendChild(link);
    link.click();
    window.URL.revokeObjectURL(objectUrl);
    document.body.removeChild(link);
}
// Offer the image shown in the <img> with the given id as a file download.
//   elementId - id of the <img> element
//   filename  - suggested name for the downloaded file
function downloadImage(elementId, filename) {
    const content = document.getElementById(elementId).src;
    // An <img> without a src reports an empty string; there is nothing to download then.
    if (!content) {
        alert('No image available to download.');
        return;
    }
    const a = document.createElement('a');
    a.style.display = 'none';
    a.href = content;
    a.download = filename;
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
}
// Submit handler for the demo form: POSTs the URL to /crawl and fills the result tabs.
document.getElementById('crawlForm').addEventListener('submit', function(event) {
    event.preventDefault();
    const loading = document.getElementById('loading');
    const responseSection = document.getElementById('response');
    loading.style.display = 'block';
    responseSection.style.display = 'none';
    const url = document.getElementById('url').value;
    const screenshot = document.getElementById('screenshot').checked;
    const data = {
        urls: [url],
        bypass_cache: false,
        word_count_threshold: 5,
        screenshot: screenshot
    };
    fetch('/crawl', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify(data)
    })
    .then(response => {
        // fetch() only rejects on network failure; surface HTTP errors explicitly.
        if (!response.ok) {
            throw new Error('Request failed with status ' + response.status);
        }
        return response.json();
    })
    .then(payload => {
        if (!payload || !Array.isArray(payload.results) || payload.results.length === 0) {
            throw new Error('The server returned no results');
        }
        const result = payload.results[0]; // Only one URL is requested
        loading.style.display = 'none';
        responseSection.style.display = 'block';
        redo(document.getElementById('markdownContent'), result.markdown);
        redo(document.getElementById('cleanedHtmlContent'), result.cleaned_html);
        redo(document.getElementById('mediaContent'), JSON.stringify(result.media, null, 2));
        redo(document.getElementById('extractedContentContent'), result.extracted_content);
        const screenshotImage = document.getElementById('screenshotContent');
        if (screenshot && result.screenshot) {
            screenshotImage.src = `data:image/png;base64,${result.screenshot}`;
        } else {
            // Drop any image left over from a previous request.
            screenshotImage.removeAttribute('src');
        }
        // Emit valid Python: an escaped string literal for the URL and
        // True/False instead of JavaScript's true/false.
        const pythonCode = `
from crawl4ai.web_crawler import WebCrawler
crawler = WebCrawler()
crawler.warmup()
result = crawler.run(
    url=${JSON.stringify(url)},
    screenshot=${screenshot ? 'True' : 'False'}
)
print(result)
`;
        redo(document.getElementById('pythonCode'), pythonCode);
    })
    .catch(error => {
        loading.style.display = 'none';
        responseSection.style.display = 'block';
        document.getElementById('markdownContent').textContent = 'Error: ' + error;
        // The error is written into the Markdown panel, so make sure that panel is the visible one.
        showTab('markdown');
    });
});
</script>
</div>

View File

@@ -2,6 +2,11 @@
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
## Try the [Demo](demo.md)
Just try it now and crawl different pages to see how it works. You can set the links, see the structures of the output, and also view the Python sample code on how to run it. The old demo is available at [/old](/old) where you can see more details.
## Introduction
Crawl4AI has one clear task: to make crawling and data extraction from web pages easy and efficient, especially for large language models (LLMs) and AI applications. Whether you are using it as a REST API or a Python library, Crawl4AI offers a robust and flexible solution.

View File

@@ -0,0 +1,28 @@
<h1>Try Our Library</h1>
<form id="apiForm">
<label for="inputField">Enter some input:</label>
<input type="text" id="inputField" name="inputField" required>
<button type="submit">Submit</button>
</form>
<div id="result"></div>
<script>
// Submit handler: POSTs the entered text as JSON and prints the JSON reply (or the error) in #result.
document.getElementById('apiForm').addEventListener('submit', function(event) {
    event.preventDefault();
    const input = document.getElementById('inputField').value;
    const resultBox = document.getElementById('result');
    fetch('https://your-api-endpoint.com/api', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({ input: input })
    })
    .then(response => {
        // fetch() only rejects on network failure; surface HTTP errors explicitly.
        if (!response.ok) {
            throw new Error('Request failed with status ' + response.status);
        }
        return response.json();
    })
    .then(data => {
        resultBox.textContent = JSON.stringify(data);
    })
    .catch(error => {
        resultBox.textContent = 'Error: ' + error;
    });
});
</script>

View File

@@ -63,6 +63,11 @@ class CrawlRequest(BaseModel):
@app.get("/", response_class=HTMLResponse)
async def read_index(request: Request):
    """Serve the site root by rendering the ``index.html`` template."""
    context = {"request": request}
    return templates.TemplateResponse("index.html", context)
@app.get("/old", response_class=HTMLResponse)
async def read_index(request: Request):
partials_dir = os.path.join(__location__, "pages", "partial")
partials = {}

View File

@@ -2,9 +2,11 @@ site_name: Crawl4AI Documentation
docs_dir: docs/md
nav:
- Home: index.md
- Introduction: introduction.md
- Installation: installation.md
- Quick Start: quickstart.md
- Demo: demo.md
- First Steps:
- Introduction: introduction.md
- Installation: installation.md
- Quick Start: quickstart.md
- Examples:
- Intro: examples/index.md
- LLM Extraction: examples/llm_extraction.md
@@ -21,8 +23,9 @@ nav:
- API Reference:
- Core Classes and Functions: api/core_classes_and_functions.md
- Detailed API Documentation: api/detailed_api_documentation.md
- Change Log: changelog.md
- Contact: contact.md
- Miscellaneous:
- Change Log: changelog.md
- Contact: contact.md
theme:
name: terminal
@@ -36,4 +39,4 @@ extra_css:
extra_javascript:
- assets/highlight.min.js
- assets/highlight_init.js
- assets/highlight_init.js