Compare commits

...

44 Commits

Author SHA1 Message Date
Unclecode
b1ac4fe023 Merge branch 'main' into ssh-server 2024-12-12 12:25:26 +00:00
Unclecode
a3c92141a1 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-12-12 12:25:01 +00:00
Unclecode
3fd777dd6f remove crawl endpoints 2024-12-12 12:24:13 +00:00
Unclecode
d7200138a0 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-12-08 12:06:53 +00:00
Unclecode
be37abe05a Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-12-04 12:31:45 +00:00
Unclecode
90ba51b52f fix(mkdocs): correct typo in Docker Deployment navigation entry 2024-12-04 12:31:41 +00:00
Unclecode
11721eb0ce Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-11-05 13:02:59 +00:00
Unclecode
1222e456fb Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-11-05 12:58:30 +00:00
Unclecode
e8aaa57cb2 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-10-30 12:59:34 +00:00
Unclecode
a661b3173d Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-10-30 12:47:07 +00:00
Unclecode
b781b6df96 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-10-27 11:42:23 +00:00
Unclecode
14e537fdd3 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-08-04 06:57:16 +00:00
Unclecode
64b33af0e0 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-08-02 08:04:54 +00:00
Unclecode
1afcdb6996 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-07-08 12:24:13 +00:00
Unclecode
ca625b3152 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-07-08 12:02:19 +00:00
Unclecode
6521b4745f Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-07-08 08:35:49 +00:00
Unclecode
241862bfe6 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-07-03 07:27:37 +00:00
Unclecode
f2491b6c1a Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-29 16:34:15 +00:00
Unclecode
886622cb1e Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-29 16:23:44 +00:00
Unclecode
13dc254438 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-26 07:35:06 +00:00
Unclecode
096929153f Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-26 05:45:25 +00:00
Unclecode
7e95c38acb Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-24 14:40:48 +00:00
Unclecode
c697bf23e4 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-22 16:37:27 +00:00
Unclecode
b951d34ed0 chore: Update fetch URL to use HTTPS 2024-06-22 16:37:21 +00:00
Unclecode
c8a10dc455 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-22 12:54:41 +00:00
Unclecode
9e0ded8da0 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-22 12:41:52 +00:00
Unclecode
48c27899b7 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-22 12:38:14 +00:00
Unclecode
3c32b0abed Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-21 09:58:17 +00:00
Unclecode
a215ec08d6 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-19 10:51:31 +00:00
Unclecode
5d3fef45f7 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-18 12:02:29 +00:00
Unclecode
77df6db453 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-08 10:38:10 +00:00
Unclecode
2124652327 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-08 10:07:30 +00:00
Unclecode
255bde70c9 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-08 08:53:54 +00:00
Unclecode
04808b5dc9 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-07 12:44:41 +00:00
Unclecode
b3a150f3d1 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-07 08:26:43 +00:00
Unclecode
de80a2da09 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-07 08:25:49 +00:00
Unclecode
df4cda8322 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-07 08:24:46 +00:00
Unclecode
7717a3b948 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-07 08:19:37 +00:00
Unclecode
a4a6b2075f Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-07 08:18:19 +00:00
Unclecode
4010558885 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-02 08:12:32 +00:00
Unclecode
b0cf5076da Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-02 08:09:25 +00:00
Unclecode
0d6e9e37ca Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-02 08:06:56 +00:00
Unclecode
9b0f71ba88 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-06-02 07:56:00 +00:00
Unclecode
6ddccc144c chore: Bump version to 0.2.2 in setup.py 2024-05-19 16:19:40 +00:00
3 changed files with 311 additions and 80 deletions

231
docs/md/demo.md Normal file
View File

@@ -0,0 +1,231 @@
# Interactive Demo for Crawler
<div id="demo">
<form id="crawlForm" class="terminal-form">
<fieldset>
<legend>Enter URL and Options</legend>
<div class="form-group">
<label for="url">Enter URL:</label>
<input type="text" id="url" name="url" required>
</div>
<div class="form-group">
<label for="screenshot">Get Screenshot:</label>
<input type="checkbox" id="screenshot" name="screenshot">
</div>
<div class="form-group">
<button class="btn btn-default" type="submit">Submit</button>
</div>
</fieldset>
</form>
<div id="loading" class="loading-message">
<div class="terminal-alert terminal-alert-primary">Loading... Please wait.</div>
</div>
<section id="response" class="response-section">
<h2>Response</h2>
<div class="tabs">
<ul class="tab-list">
<li class="tab-item" onclick="showTab('markdown')">Markdown</li>
<li class="tab-item" onclick="showTab('cleanedHtml')">Cleaned HTML</li>
<li class="tab-item" onclick="showTab('media')">Media</li>
<li class="tab-item" onclick="showTab('extractedContent')">Extracted Content</li>
<li class="tab-item" onclick="showTab('screenshot')">Screenshot</li>
<li class="tab-item" onclick="showTab('pythonCode')">Python Code</li>
</ul>
<div class="tab-content" id="tab-markdown">
<header>
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('markdownContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('markdownContent', 'markdown.md')">Download</button>
</div>
</header>
<pre><code id="markdownContent" class="language-markdown hljs"></code></pre>
</div>
<div class="tab-content" id="tab-cleanedHtml" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('cleanedHtmlContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('cleanedHtmlContent', 'cleaned.html')">Download</button>
</div>
</header>
<pre><code id="cleanedHtmlContent" class="language-html hljs"></code></pre>
</div>
<div class="tab-content" id="tab-media" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('mediaContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('mediaContent', 'media.json')">Download</button>
</div>
</header>
<pre><code id="mediaContent" class="language-json hljs"></code></pre>
</div>
<div class="tab-content" id="tab-extractedContent" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('extractedContentContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('extractedContentContent', 'extracted_content.json')">Download</button>
</div>
</header>
<pre><code id="extractedContentContent" class="language-json hljs"></code></pre>
</div>
<div class="tab-content" id="tab-screenshot" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadImage('screenshotContent', 'screenshot.png')">Download</button>
</div>
</header>
<pre><img id="screenshotContent" /></pre>
</div>
<div class="tab-content" id="tab-pythonCode" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('pythonCode')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('pythonCode', 'example.py')">Download</button>
</div>
</header>
<pre><code id="pythonCode" class="language-python hljs"></code></pre>
</div>
</div>
</section>
<div id="error" class="error-message" style="display: none; margin-top:1em;">
<div class="terminal-alert terminal-alert-error"></div>
</div>
<script>
// Reveal the tab panel `tab-<tabId>` and hide every other tab panel.
function showTab(tabId) {
    for (const panel of document.querySelectorAll('.tab-content')) {
        panel.style.display = 'none';
    }
    const active = document.getElementById(`tab-${tabId}`);
    active.style.display = 'block';
}
// Swap in new text for a <code> element and re-run highlight.js on it.
function redo(codeBlock, codeText) {
    // highlight.js skips elements it has already processed, so strip its
    // markers before asking it to highlight the new content.
    codeBlock.classList.remove('hljs');
    codeBlock.removeAttribute('data-highlighted');
    codeBlock.textContent = codeText;
    hljs.highlightBlock(codeBlock);
}
// Copy the text content of the element with the given id to the clipboard.
// The Clipboard API returns a promise that rejects when access is denied
// (insecure context, missing permission); the original silently dropped
// that rejection, so the user got no feedback on failure.
function copyToClipboard(elementId) {
    const content = document.getElementById(elementId).textContent;
    navigator.clipboard.writeText(content).then(() => {
        alert('Copied to clipboard');
    }).catch(() => {
        alert('Copy failed: clipboard access was denied');
    });
}
// Trigger a client-side download of an element's text content as `filename`,
// via a temporary object URL attached to an invisible anchor.
function downloadContent(elementId, filename) {
    const text = document.getElementById(elementId).textContent;
    const objectUrl = window.URL.createObjectURL(
        new Blob([text], { type: 'text/plain' })
    );
    const link = document.createElement('a');
    link.style.display = 'none';
    link.href = objectUrl;
    link.download = filename;
    document.body.appendChild(link);
    link.click();
    // Release the blob URL and remove the helper anchor once clicked.
    window.URL.revokeObjectURL(objectUrl);
    document.body.removeChild(link);
}
// Download the image shown in `elementId` (its src — presumably a data: URL
// set by the screenshot handler) under the name `filename`.
function downloadImage(elementId, filename) {
    const link = document.createElement('a');
    link.style.display = 'none';
    link.href = document.getElementById(elementId).src;
    link.download = filename;
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
}
// Form submit handler: POSTs the entered URL to the crawl API, then renders
// the returned markdown / cleaned HTML / media / extracted content /
// screenshot into the result tabs, plus an equivalent Python snippet.
document.getElementById('crawlForm').addEventListener('submit', function(event) {
    event.preventDefault();
    // Show the spinner and hide stale results while the request is in flight.
    document.getElementById('loading').style.display = 'block';
    document.getElementById('response').style.display = 'none';
    const url = document.getElementById('url').value;
    const screenshot = document.getElementById('screenshot').checked;
    // Request payload expected by the /crawl endpoint.
    const data = {
        urls: [url],
        bypass_cache: false,
        word_count_threshold: 5,
        screenshot: screenshot
    };
    fetch('https://crawl4ai.com/crawl', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify(data)
    })
    .then(response => {
        if (!response.ok) {
            // A 429 carries a JSON body with retry details; attach status and
            // details to the error so the catch handler can show a wait time.
            if (response.status === 429) {
                return response.json().then(err => {
                    throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
                });
            }
            throw new Error('Network response was not ok');
        }
        return response.json();
    })
    .then(data => {
        data = data.results[0]; // Only one URL is requested
        document.getElementById('loading').style.display = 'none';
        document.getElementById('response').style.display = 'block';
        // Refresh each tab's <code> element and re-run syntax highlighting.
        redo(document.getElementById('markdownContent'), data.markdown);
        redo(document.getElementById('cleanedHtmlContent'), data.cleaned_html);
        redo(document.getElementById('mediaContent'), JSON.stringify(data.media, null, 2));
        redo(document.getElementById('extractedContentContent'), data.extracted_content);
        if (screenshot) {
            // Screenshot arrives base64-encoded; embed it as a PNG data URL.
            document.getElementById('screenshotContent').src = `data:image/png;base64,${data.screenshot}`;
        }
        // Equivalent Python snippet shown in the "Python Code" tab.
        const pythonCode = `
from crawl4ai.web_crawler import WebCrawler
crawler = WebCrawler()
crawler.warmup()
result = crawler.run(
url='${url}',
screenshot=${screenshot}
)
print(result)
`;
        redo(document.getElementById('pythonCode'), pythonCode);
        document.getElementById('error').style.display = 'none';
    })
    .catch(error => {
        document.getElementById('loading').style.display = 'none';
        document.getElementById('error').style.display = 'block';
        let errorMessage = 'An unexpected error occurred. Please try again later.';
        if (error.status === 429) {
            // Prefer the server-supplied retry hint when one is present.
            const details = error.details;
            if (details.retry_after) {
                errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
            } else if (details.reset_at) {
                const resetTime = new Date(details.reset_at);
                const waitTime = Math.ceil((resetTime - new Date()) / 1000);
                errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
            } else {
                errorMessage = `Rate limit exceeded. Please try again later.`;
            }
        } else if (error.message) {
            errorMessage = error.message;
        }
        document.querySelector('#error .terminal-alert').textContent = errorMessage;
    });
});
</script>
</div>

158
main.py
View File

@@ -380,97 +380,97 @@ def read_root():
return {"message": "Crawl4AI API service is running"} return {"message": "Crawl4AI API service is running"}
@app.post("/crawl", dependencies=[Depends(verify_token)]) # @app.post("/crawl", dependencies=[Depends(verify_token)])
async def crawl(request: CrawlRequest) -> Dict[str, str]: # async def crawl(request: CrawlRequest) -> Dict[str, str]:
task_id = await crawler_service.submit_task(request) # task_id = await crawler_service.submit_task(request)
return {"task_id": task_id} # return {"task_id": task_id}
@app.get("/task/{task_id}", dependencies=[Depends(verify_token)]) # @app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
async def get_task_status(task_id: str): # async def get_task_status(task_id: str):
task_info = crawler_service.task_manager.get_task(task_id) # task_info = crawler_service.task_manager.get_task(task_id)
if not task_info: # if not task_info:
raise HTTPException(status_code=404, detail="Task not found") # raise HTTPException(status_code=404, detail="Task not found")
response = { # response = {
"status": task_info.status, # "status": task_info.status,
"created_at": task_info.created_at, # "created_at": task_info.created_at,
} # }
if task_info.status == TaskStatus.COMPLETED: # if task_info.status == TaskStatus.COMPLETED:
# Convert CrawlResult to dict for JSON response # # Convert CrawlResult to dict for JSON response
if isinstance(task_info.result, list): # if isinstance(task_info.result, list):
response["results"] = [result.dict() for result in task_info.result] # response["results"] = [result.dict() for result in task_info.result]
else: # else:
response["result"] = task_info.result.dict() # response["result"] = task_info.result.dict()
elif task_info.status == TaskStatus.FAILED: # elif task_info.status == TaskStatus.FAILED:
response["error"] = task_info.error # response["error"] = task_info.error
return response # return response
@app.post("/crawl_sync", dependencies=[Depends(verify_token)]) # @app.post("/crawl_sync", dependencies=[Depends(verify_token)])
async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: # async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
task_id = await crawler_service.submit_task(request) # task_id = await crawler_service.submit_task(request)
# Wait up to 60 seconds for task completion # # Wait up to 60 seconds for task completion
for _ in range(60): # for _ in range(60):
task_info = crawler_service.task_manager.get_task(task_id) # task_info = crawler_service.task_manager.get_task(task_id)
if not task_info: # if not task_info:
raise HTTPException(status_code=404, detail="Task not found") # raise HTTPException(status_code=404, detail="Task not found")
if task_info.status == TaskStatus.COMPLETED: # if task_info.status == TaskStatus.COMPLETED:
# Return same format as /task/{task_id} endpoint # # Return same format as /task/{task_id} endpoint
if isinstance(task_info.result, list): # if isinstance(task_info.result, list):
return {"status": task_info.status, "results": [result.dict() for result in task_info.result]} # return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
return {"status": task_info.status, "result": task_info.result.dict()} # return {"status": task_info.status, "result": task_info.result.dict()}
if task_info.status == TaskStatus.FAILED: # if task_info.status == TaskStatus.FAILED:
raise HTTPException(status_code=500, detail=task_info.error) # raise HTTPException(status_code=500, detail=task_info.error)
await asyncio.sleep(1) # await asyncio.sleep(1)
# If we get here, task didn't complete within timeout # # If we get here, task didn't complete within timeout
raise HTTPException(status_code=408, detail="Task timed out") # raise HTTPException(status_code=408, detail="Task timed out")
@app.post("/crawl_direct", dependencies=[Depends(verify_token)]) # @app.post("/crawl_direct", dependencies=[Depends(verify_token)])
async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]: # async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
try: # try:
crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params) # crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config) # extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
try: # try:
if isinstance(request.urls, list): # if isinstance(request.urls, list):
results = await crawler.arun_many( # results = await crawler.arun_many(
urls=[str(url) for url in request.urls], # urls=[str(url) for url in request.urls],
extraction_strategy=extraction_strategy, # extraction_strategy=extraction_strategy,
js_code=request.js_code, # js_code=request.js_code,
wait_for=request.wait_for, # wait_for=request.wait_for,
css_selector=request.css_selector, # css_selector=request.css_selector,
screenshot=request.screenshot, # screenshot=request.screenshot,
magic=request.magic, # magic=request.magic,
cache_mode=request.cache_mode, # cache_mode=request.cache_mode,
session_id=request.session_id, # session_id=request.session_id,
**request.extra, # **request.extra,
) # )
return {"results": [result.dict() for result in results]} # return {"results": [result.dict() for result in results]}
else: # else:
result = await crawler.arun( # result = await crawler.arun(
url=str(request.urls), # url=str(request.urls),
extraction_strategy=extraction_strategy, # extraction_strategy=extraction_strategy,
js_code=request.js_code, # js_code=request.js_code,
wait_for=request.wait_for, # wait_for=request.wait_for,
css_selector=request.css_selector, # css_selector=request.css_selector,
screenshot=request.screenshot, # screenshot=request.screenshot,
magic=request.magic, # magic=request.magic,
cache_mode=request.cache_mode, # cache_mode=request.cache_mode,
session_id=request.session_id, # session_id=request.session_id,
**request.extra, # **request.extra,
) # )
return {"result": result.dict()} # return {"result": result.dict()}
finally: # finally:
await crawler_service.crawler_pool.release(crawler) # await crawler_service.crawler_pool.release(crawler)
except Exception as e: # except Exception as e:
logger.error(f"Error in direct crawl: {str(e)}") # logger.error(f"Error in direct crawl: {str(e)}")
raise HTTPException(status_code=500, detail=str(e)) # raise HTTPException(status_code=500, detail=str(e))
@app.get("/health") @app.get("/health")
async def health_check(): async def health_check():

View File

@@ -8,7 +8,7 @@ docs_dir: docs/md_v2
nav: nav:
- Home: 'index.md' - Home: 'index.md'
- 'Installation': 'basic/installation.md' - 'Installation': 'basic/installation.md'
- 'Docker Deplotment': 'basic/docker-deploymeny.md' - 'Docker Deployment': 'basic/docker-deploymeny.md'
- 'Quick Start': 'basic/quickstart.md' - 'Quick Start': 'basic/quickstart.md'
- Changelog & Blog: - Changelog & Blog:
- 'Blog Home': 'blog/index.md' - 'Blog Home': 'blog/index.md'