Compare commits
44 Commits
claude/imp
...
ssh-server
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b1ac4fe023 | ||
|
|
a3c92141a1 | ||
|
|
3fd777dd6f | ||
|
|
d7200138a0 | ||
|
|
be37abe05a | ||
|
|
90ba51b52f | ||
|
|
11721eb0ce | ||
|
|
1222e456fb | ||
|
|
e8aaa57cb2 | ||
|
|
a661b3173d | ||
|
|
b781b6df96 | ||
|
|
14e537fdd3 | ||
|
|
64b33af0e0 | ||
|
|
1afcdb6996 | ||
|
|
ca625b3152 | ||
|
|
6521b4745f | ||
|
|
241862bfe6 | ||
|
|
f2491b6c1a | ||
|
|
886622cb1e | ||
|
|
13dc254438 | ||
|
|
096929153f | ||
|
|
7e95c38acb | ||
|
|
c697bf23e4 | ||
|
|
b951d34ed0 | ||
|
|
c8a10dc455 | ||
|
|
9e0ded8da0 | ||
|
|
48c27899b7 | ||
|
|
3c32b0abed | ||
|
|
a215ec08d6 | ||
|
|
5d3fef45f7 | ||
|
|
77df6db453 | ||
|
|
2124652327 | ||
|
|
255bde70c9 | ||
|
|
04808b5dc9 | ||
|
|
b3a150f3d1 | ||
|
|
de80a2da09 | ||
|
|
df4cda8322 | ||
|
|
7717a3b948 | ||
|
|
a4a6b2075f | ||
|
|
4010558885 | ||
|
|
b0cf5076da | ||
|
|
0d6e9e37ca | ||
|
|
9b0f71ba88 | ||
|
|
6ddccc144c |
231
docs/md/demo.md
Normal file
231
docs/md/demo.md
Normal file
@@ -0,0 +1,231 @@
|
||||
# Interactive Demo for Crawl4AI
|
||||
<div id="demo">
|
||||
<form id="crawlForm" class="terminal-form">
|
||||
<fieldset>
|
||||
<legend>Enter URL and Options</legend>
|
||||
<div class="form-group">
|
||||
<label for="url">Enter URL:</label>
|
||||
<input type="text" id="url" name="url" required>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="screenshot">Get Screenshot:</label>
|
||||
<input type="checkbox" id="screenshot" name="screenshot">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<button class="btn btn-default" type="submit">Submit</button>
|
||||
</div>
|
||||
|
||||
</fieldset>
|
||||
</form>
|
||||
|
||||
<div id="loading" class="loading-message">
|
||||
<div class="terminal-alert terminal-alert-primary">Loading... Please wait.</div>
|
||||
</div>
|
||||
|
||||
<section id="response" class="response-section">
|
||||
<h2>Response</h2>
|
||||
<div class="tabs">
|
||||
<ul class="tab-list">
|
||||
<li class="tab-item" onclick="showTab('markdown')">Markdown</li>
|
||||
<li class="tab-item" onclick="showTab('cleanedHtml')">Cleaned HTML</li>
|
||||
<li class="tab-item" onclick="showTab('media')">Media</li>
|
||||
<li class="tab-item" onclick="showTab('extractedContent')">Extracted Content</li>
|
||||
<li class="tab-item" onclick="showTab('screenshot')">Screenshot</li>
|
||||
<li class="tab-item" onclick="showTab('pythonCode')">Python Code</li>
|
||||
</ul>
|
||||
<div class="tab-content" id="tab-markdown">
|
||||
<header>
|
||||
<div>
|
||||
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('markdownContent')">Copy</button>
|
||||
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('markdownContent', 'markdown.md')">Download</button>
|
||||
</div>
|
||||
</header>
|
||||
<pre><code id="markdownContent" class="language-markdown hljs"></code></pre>
|
||||
</div>
|
||||
|
||||
<div class="tab-content" id="tab-cleanedHtml" style="display: none;">
|
||||
<header >
|
||||
<div>
|
||||
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('cleanedHtmlContent')">Copy</button>
|
||||
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('cleanedHtmlContent', 'cleaned.html')">Download</button>
|
||||
</div>
|
||||
</header>
|
||||
<pre><code id="cleanedHtmlContent" class="language-html hljs"></code></pre>
|
||||
</div>
|
||||
|
||||
<div class="tab-content" id="tab-media" style="display: none;">
|
||||
<header >
|
||||
<div>
|
||||
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('mediaContent')">Copy</button>
|
||||
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('mediaContent', 'media.json')">Download</button>
|
||||
</div>
|
||||
</header>
|
||||
<pre><code id="mediaContent" class="language-json hljs"></code></pre>
|
||||
</div>
|
||||
|
||||
<div class="tab-content" id="tab-extractedContent" style="display: none;">
|
||||
<header >
|
||||
<div>
|
||||
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('extractedContentContent')">Copy</button>
|
||||
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('extractedContentContent', 'extracted_content.json')">Download</button>
|
||||
</div>
|
||||
</header>
|
||||
<pre><code id="extractedContentContent" class="language-json hljs"></code></pre>
|
||||
</div>
|
||||
|
||||
<div class="tab-content" id="tab-screenshot" style="display: none;">
|
||||
<header >
|
||||
<div>
|
||||
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadImage('screenshotContent', 'screenshot.png')">Download</button>
|
||||
</div>
|
||||
</header>
|
||||
<pre><img id="screenshotContent" /></pre>
|
||||
</div>
|
||||
|
||||
<div class="tab-content" id="tab-pythonCode" style="display: none;">
|
||||
<header >
|
||||
<div>
|
||||
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('pythonCode')">Copy</button>
|
||||
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('pythonCode', 'example.py')">Download</button>
|
||||
</div>
|
||||
</header>
|
||||
<pre><code id="pythonCode" class="language-python hljs"></code></pre>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<div id="error" class="error-message" style="display: none; margin-top:1em;">
|
||||
<div class="terminal-alert terminal-alert-error"></div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
/**
 * Show a single response tab panel and hide all the others.
 *
 * @param {string} tabId - Suffix of the panel id; the panel shown is the
 *     element whose id is `tab-` followed by this value.
 */
function showTab(tabId) {
    // Hide every panel first so that only the selected one ends up visible.
    for (const panel of document.querySelectorAll('.tab-content')) {
        panel.style.display = 'none';
    }

    const selected = document.getElementById(`tab-${tabId}`);
    selected.style.display = 'block';
}
|
||||
|
||||
/**
 * Replace the text of a code element and run highlight.js over it again.
 *
 * @param {HTMLElement} codeBlock - The <code> element to update.
 * @param {string} codeText - The new text to display in the element.
 */
function redo(codeBlock, codeText){
    // Drop the class and attribute set on the element before re-highlighting
    // (presumably left behind by an earlier highlight.js pass).
    codeBlock.classList.remove('hljs');
    codeBlock.removeAttribute('data-highlighted');

    // Set new code and re-highlight
    codeBlock.textContent = codeText;

    // highlightBlock is deprecated in favour of highlightElement; only fall
    // back to it when the loaded highlight.js build does not provide the
    // newer function.
    if (typeof hljs.highlightElement === 'function') {
        hljs.highlightElement(codeBlock);
    } else {
        hljs.highlightBlock(codeBlock);
    }
}
|
||||
|
||||
/**
 * Copy the text content of an element to the system clipboard.
 *
 * Shows an alert on success, and also on failure so the user is never left
 * without feedback.
 *
 * @param {string} elementId - Id of the element whose text is copied.
 */
function copyToClipboard(elementId) {
    const content = document.getElementById(elementId).textContent;

    // The async Clipboard API is unavailable in insecure contexts and in
    // older browsers; report that instead of throwing a TypeError.
    if (!navigator.clipboard || typeof navigator.clipboard.writeText !== 'function') {
        alert('Copying to the clipboard is not supported in this browser');
        return;
    }

    navigator.clipboard.writeText(content).then(() => {
        alert('Copied to clipboard');
    }).catch(() => {
        // writeText rejects when permission is denied; without this handler
        // the failure would be an unhandled promise rejection.
        alert('Failed to copy to clipboard');
    });
}
|
||||
|
||||
/**
 * Download the text content of an element as a plain-text file.
 *
 * @param {string} elementId - Id of the element whose text is saved.
 * @param {string} filename - File name suggested to the browser.
 */
function downloadContent(elementId, filename) {
    const text = document.getElementById(elementId).textContent;
    const objectUrl = window.URL.createObjectURL(
        new Blob([text], { type: 'text/plain' })
    );

    // A hidden, temporary anchor is clicked programmatically to start the download.
    const link = document.createElement('a');
    link.style.display = 'none';
    link.href = objectUrl;
    link.download = filename;

    document.body.appendChild(link);
    link.click();

    // Clean up the object URL and the temporary anchor.
    window.URL.revokeObjectURL(objectUrl);
    document.body.removeChild(link);
}
|
||||
|
||||
/**
 * Download the image shown by an <img> element.
 *
 * @param {string} elementId - Id of the image element; its `src` is used as the download URL.
 * @param {string} filename - File name suggested to the browser.
 */
function downloadImage(elementId, filename) {
    const imageSrc = document.getElementById(elementId).src;

    // A hidden, temporary anchor is clicked programmatically to start the download.
    const link = document.createElement('a');
    link.style.display = 'none';
    link.href = imageSrc;
    link.download = filename;

    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
}
|
||||
|
||||
/*
 * Submit handler for the demo form.
 *
 * Posts the entered URL and screenshot option to the crawl endpoint, then
 * fills the response tabs from the first entry of `results` (markdown,
 * cleaned_html, media, extracted_content, screenshot) and renders an
 * equivalent Python snippet. Any failure is shown in the #error alert;
 * HTTP 429 responses get a dedicated rate-limit message.
 */
document.getElementById('crawlForm').addEventListener('submit', function(event) {
    event.preventDefault();
    document.getElementById('loading').style.display = 'block';
    document.getElementById('response').style.display = 'none';

    const url = document.getElementById('url').value;
    const screenshot = document.getElementById('screenshot').checked;
    const data = {
        urls: [url],
        bypass_cache: false,
        word_count_threshold: 5,
        screenshot: screenshot
    };

    fetch('https://crawl4ai.com/crawl', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify(data)
    })
    .then(response => {
        if (!response.ok) {
            if (response.status === 429) {
                // A 429 whose body is not valid JSON must still be reported
                // as a rate limit, so fall back to empty details.
                return response.json().catch(() => ({})).then(err => {
                    throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
                });
            }
            throw new Error('Network response was not ok');
        }
        return response.json();
    })
    .then(data => {
        // Fail with a readable message instead of a TypeError when the
        // payload carries no results.
        if (!data || !Array.isArray(data.results) || data.results.length === 0) {
            throw new Error('The server returned no crawl results');
        }
        data = data.results[0]; // Only one URL is requested

        document.getElementById('loading').style.display = 'none';
        document.getElementById('response').style.display = 'block';

        redo(document.getElementById('markdownContent'), data.markdown);
        redo(document.getElementById('cleanedHtmlContent'), data.cleaned_html);
        redo(document.getElementById('mediaContent'), JSON.stringify(data.media, null, 2));
        redo(document.getElementById('extractedContentContent'), data.extracted_content);
        if (screenshot) {
            document.getElementById('screenshotContent').src = `data:image/png;base64,${data.screenshot}`;
        }

        // Build the snippet line by line so it is valid Python:
        //  - JSON.stringify yields a quoted, escaped string literal, so a URL
        //    containing quotes or backslashes cannot break the snippet;
        //  - JS booleans print as true/false, which Python does not define.
        const pythonCode = [
            '',
            'from crawl4ai.web_crawler import WebCrawler',
            '',
            'crawler = WebCrawler()',
            'crawler.warmup()',
            '',
            'result = crawler.run(',
            `    url=${JSON.stringify(url)},`,
            `    screenshot=${screenshot ? 'True' : 'False'}`,
            ')',
            'print(result)',
            ''
        ].join('\n');
        redo(document.getElementById('pythonCode'), pythonCode);
        document.getElementById('error').style.display = 'none';
    })
    .catch(error => {
        document.getElementById('loading').style.display = 'none';
        document.getElementById('error').style.display = 'block';
        let errorMessage = 'An unexpected error occurred. Please try again later.';

        if (error.status === 429) {
            // The parsed body may be null or lack both fields.
            const details = error.details || {};
            const retryAfter = parseFloat(details.retry_after);
            const waitTime = Math.ceil((new Date(details.reset_at) - new Date()) / 1000);

            if (details.retry_after && Number.isFinite(retryAfter)) {
                errorMessage = `Rate limit exceeded. Please wait ${retryAfter.toFixed(1)} seconds before trying again.`;
            } else if (details.reset_at && Number.isFinite(waitTime)) {
                // Clamp at zero: a reset time already in the past would
                // otherwise produce a negative wait.
                errorMessage = `Rate limit exceeded. Please try again after ${Math.max(0, waitTime)} seconds.`;
            } else {
                errorMessage = `Rate limit exceeded. Please try again later.`;
            }
        } else if (error.message) {
            errorMessage = error.message;
        }

        document.querySelector('#error .terminal-alert').textContent = errorMessage;
    });
});
|
||||
</script>
|
||||
</div>
|
||||
158
main.py
158
main.py
@@ -380,97 +380,97 @@ def read_root():
|
||||
return {"message": "Crawl4AI API service is running"}
|
||||
|
||||
|
||||
@app.post("/crawl", dependencies=[Depends(verify_token)])
|
||||
async def crawl(request: CrawlRequest) -> Dict[str, str]:
|
||||
task_id = await crawler_service.submit_task(request)
|
||||
return {"task_id": task_id}
|
||||
# @app.post("/crawl", dependencies=[Depends(verify_token)])
|
||||
# async def crawl(request: CrawlRequest) -> Dict[str, str]:
|
||||
# task_id = await crawler_service.submit_task(request)
|
||||
# return {"task_id": task_id}
|
||||
|
||||
@app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
|
||||
async def get_task_status(task_id: str):
|
||||
task_info = crawler_service.task_manager.get_task(task_id)
|
||||
if not task_info:
|
||||
raise HTTPException(status_code=404, detail="Task not found")
|
||||
# @app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
|
||||
# async def get_task_status(task_id: str):
|
||||
# task_info = crawler_service.task_manager.get_task(task_id)
|
||||
# if not task_info:
|
||||
# raise HTTPException(status_code=404, detail="Task not found")
|
||||
|
||||
response = {
|
||||
"status": task_info.status,
|
||||
"created_at": task_info.created_at,
|
||||
}
|
||||
# response = {
|
||||
# "status": task_info.status,
|
||||
# "created_at": task_info.created_at,
|
||||
# }
|
||||
|
||||
if task_info.status == TaskStatus.COMPLETED:
|
||||
# Convert CrawlResult to dict for JSON response
|
||||
if isinstance(task_info.result, list):
|
||||
response["results"] = [result.dict() for result in task_info.result]
|
||||
else:
|
||||
response["result"] = task_info.result.dict()
|
||||
elif task_info.status == TaskStatus.FAILED:
|
||||
response["error"] = task_info.error
|
||||
# if task_info.status == TaskStatus.COMPLETED:
|
||||
# # Convert CrawlResult to dict for JSON response
|
||||
# if isinstance(task_info.result, list):
|
||||
# response["results"] = [result.dict() for result in task_info.result]
|
||||
# else:
|
||||
# response["result"] = task_info.result.dict()
|
||||
# elif task_info.status == TaskStatus.FAILED:
|
||||
# response["error"] = task_info.error
|
||||
|
||||
return response
|
||||
# return response
|
||||
|
||||
@app.post("/crawl_sync", dependencies=[Depends(verify_token)])
|
||||
async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
|
||||
task_id = await crawler_service.submit_task(request)
|
||||
# @app.post("/crawl_sync", dependencies=[Depends(verify_token)])
|
||||
# async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
|
||||
# task_id = await crawler_service.submit_task(request)
|
||||
|
||||
# Wait up to 60 seconds for task completion
|
||||
for _ in range(60):
|
||||
task_info = crawler_service.task_manager.get_task(task_id)
|
||||
if not task_info:
|
||||
raise HTTPException(status_code=404, detail="Task not found")
|
||||
# # Wait up to 60 seconds for task completion
|
||||
# for _ in range(60):
|
||||
# task_info = crawler_service.task_manager.get_task(task_id)
|
||||
# if not task_info:
|
||||
# raise HTTPException(status_code=404, detail="Task not found")
|
||||
|
||||
if task_info.status == TaskStatus.COMPLETED:
|
||||
# Return same format as /task/{task_id} endpoint
|
||||
if isinstance(task_info.result, list):
|
||||
return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
|
||||
return {"status": task_info.status, "result": task_info.result.dict()}
|
||||
# if task_info.status == TaskStatus.COMPLETED:
|
||||
# # Return same format as /task/{task_id} endpoint
|
||||
# if isinstance(task_info.result, list):
|
||||
# return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
|
||||
# return {"status": task_info.status, "result": task_info.result.dict()}
|
||||
|
||||
if task_info.status == TaskStatus.FAILED:
|
||||
raise HTTPException(status_code=500, detail=task_info.error)
|
||||
# if task_info.status == TaskStatus.FAILED:
|
||||
# raise HTTPException(status_code=500, detail=task_info.error)
|
||||
|
||||
await asyncio.sleep(1)
|
||||
# await asyncio.sleep(1)
|
||||
|
||||
# If we get here, task didn't complete within timeout
|
||||
raise HTTPException(status_code=408, detail="Task timed out")
|
||||
# # If we get here, task didn't complete within timeout
|
||||
# raise HTTPException(status_code=408, detail="Task timed out")
|
||||
|
||||
@app.post("/crawl_direct", dependencies=[Depends(verify_token)])
|
||||
async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
|
||||
try:
|
||||
crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
|
||||
extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
|
||||
# @app.post("/crawl_direct", dependencies=[Depends(verify_token)])
|
||||
# async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
|
||||
# try:
|
||||
# crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
|
||||
# extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
|
||||
|
||||
try:
|
||||
if isinstance(request.urls, list):
|
||||
results = await crawler.arun_many(
|
||||
urls=[str(url) for url in request.urls],
|
||||
extraction_strategy=extraction_strategy,
|
||||
js_code=request.js_code,
|
||||
wait_for=request.wait_for,
|
||||
css_selector=request.css_selector,
|
||||
screenshot=request.screenshot,
|
||||
magic=request.magic,
|
||||
cache_mode=request.cache_mode,
|
||||
session_id=request.session_id,
|
||||
**request.extra,
|
||||
)
|
||||
return {"results": [result.dict() for result in results]}
|
||||
else:
|
||||
result = await crawler.arun(
|
||||
url=str(request.urls),
|
||||
extraction_strategy=extraction_strategy,
|
||||
js_code=request.js_code,
|
||||
wait_for=request.wait_for,
|
||||
css_selector=request.css_selector,
|
||||
screenshot=request.screenshot,
|
||||
magic=request.magic,
|
||||
cache_mode=request.cache_mode,
|
||||
session_id=request.session_id,
|
||||
**request.extra,
|
||||
)
|
||||
return {"result": result.dict()}
|
||||
finally:
|
||||
await crawler_service.crawler_pool.release(crawler)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in direct crawl: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
# try:
|
||||
# if isinstance(request.urls, list):
|
||||
# results = await crawler.arun_many(
|
||||
# urls=[str(url) for url in request.urls],
|
||||
# extraction_strategy=extraction_strategy,
|
||||
# js_code=request.js_code,
|
||||
# wait_for=request.wait_for,
|
||||
# css_selector=request.css_selector,
|
||||
# screenshot=request.screenshot,
|
||||
# magic=request.magic,
|
||||
# cache_mode=request.cache_mode,
|
||||
# session_id=request.session_id,
|
||||
# **request.extra,
|
||||
# )
|
||||
# return {"results": [result.dict() for result in results]}
|
||||
# else:
|
||||
# result = await crawler.arun(
|
||||
# url=str(request.urls),
|
||||
# extraction_strategy=extraction_strategy,
|
||||
# js_code=request.js_code,
|
||||
# wait_for=request.wait_for,
|
||||
# css_selector=request.css_selector,
|
||||
# screenshot=request.screenshot,
|
||||
# magic=request.magic,
|
||||
# cache_mode=request.cache_mode,
|
||||
# session_id=request.session_id,
|
||||
# **request.extra,
|
||||
# )
|
||||
# return {"result": result.dict()}
|
||||
# finally:
|
||||
# await crawler_service.crawler_pool.release(crawler)
|
||||
# except Exception as e:
|
||||
# logger.error(f"Error in direct crawl: {str(e)}")
|
||||
# raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check():
|
||||
|
||||
@@ -8,7 +8,7 @@ docs_dir: docs/md_v2
|
||||
nav:
|
||||
- Home: 'index.md'
|
||||
- 'Installation': 'basic/installation.md'
|
||||
- 'Docker Deplotment': 'basic/docker-deploymeny.md'
|
||||
- 'Docker Deployment': 'basic/docker-deploymeny.md'
|
||||
- 'Quick Start': 'basic/quickstart.md'
|
||||
- Changelog & Blog:
|
||||
- 'Blog Home': 'blog/index.md'
|
||||
|
||||
Reference in New Issue
Block a user