- Extract all media tags
- Take screenshot of the page
This commit is contained in:
unclecode
2024-06-07 15:23:13 +08:00
parent aead6de888
commit 0533aeb814
8 changed files with 90 additions and 22 deletions

View File

@@ -1,13 +1,12 @@
import os
from pathlib import Path
import sqlite3
from typing import Optional
from typing import Optional, Tuple
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
def init_db():
global DB_PATH
conn = sqlite3.connect(DB_PATH)
@@ -19,22 +18,34 @@ def init_db():
cleaned_html TEXT,
markdown TEXT,
extracted_content TEXT,
success BOOLEAN
success BOOLEAN,
media TEXT
)
''')
conn.commit()
conn.close()
def check_db_path():
if not DB_PATH:
raise ValueError("Database path is not set or is empty.")
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
def alter_db_add_media():
    """Best-effort schema migration: add the 'media' TEXT column to crawled_data.

    Failures (most commonly "duplicate column name" when the column already
    exists) are printed rather than raised, so re-running is harmless.
    """
    check_db_path()
    try:
        connection = sqlite3.connect(DB_PATH)
        # Connection.execute is the sqlite3 shortcut for cursor().execute.
        connection.execute('ALTER TABLE crawled_data ADD COLUMN media TEXT DEFAULT ""')
        connection.commit()
        connection.close()
    except Exception as exc:
        print(f"Error altering database to add media column: {exc}")
def check_db_path():
    """Raise ValueError when the module-level DB_PATH is unset or empty."""
    if DB_PATH:
        return
    raise ValueError("Database path is not set or is empty.")
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]:
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media FROM crawled_data WHERE url = ?', (url,))
result = cursor.fetchone()
conn.close()
return result
@@ -42,21 +53,22 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
print(f"Error retrieving cached URL: {e}")
return None
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = ""):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('''
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
VALUES (?, ?, ?, ?, ?, ?)
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media)
VALUES (?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
html = excluded.html,
cleaned_html = excluded.cleaned_html,
markdown = excluded.markdown,
extracted_content = excluded.extracted_content,
success = excluded.success
''', (url, html, cleaned_html, markdown, extracted_content, success))
success = excluded.success,
media = excluded.media
''', (url, html, cleaned_html, markdown, extracted_content, success, media))
conn.commit()
conn.close()
except Exception as e:
@@ -95,4 +107,20 @@ def flush_db():
conn.commit()
conn.close()
except Exception as e:
print(f"Error flushing database: {e}")
print(f"Error flushing database: {e}")
def update_existing_records():
    """Backfill rows created before the migration: NULL 'media' becomes "".

    Errors are printed, not raised, matching the module's best-effort style.
    """
    check_db_path()
    try:
        connection = sqlite3.connect(DB_PATH)
        # Connection.execute is the sqlite3 shortcut for cursor().execute.
        connection.execute('UPDATE crawled_data SET media = "" WHERE media IS NULL')
        connection.commit()
        connection.close()
    except Exception as exc:
        print(f"Error updating existing records: {exc}")
# Manual migration entry point: run this module directly to create the
# database (if missing) and migrate older installs to the new schema.
if __name__ == "__main__":
    init_db()  # Create the crawled_data table if it does not already exist
    alter_db_add_media()  # Best-effort: add the 'media' column to older databases
    update_existing_records()  # Normalize NULL 'media' values to an empty string

View File

@@ -1,5 +1,5 @@
from pydantic import BaseModel, HttpUrl
from typing import List
from typing import List, Dict
class UrlModel(BaseModel):
url: HttpUrl
@@ -10,6 +10,7 @@ class CrawlResult(BaseModel):
html: str
success: bool
cleaned_html: str = None
media: Dict[str, List[Dict]] = {}
markdown: str = None
extracted_content: str = None
metadata: dict = None

View File

@@ -180,6 +180,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
if tag.name != 'img':
tag.attrs = {}
# Extract all img tags into [{src: '', alt: ''}]
media = {
'images': [],
'videos': [],
'audios': []
}
for img in body.find_all('img'):
media['images'].append({
'src': img.get('src'),
'alt': img.get('alt'),
"type": "image"
})
# Extract all video tags into [{src: '', alt: ''}]
for video in body.find_all('video'):
media['videos'].append({
'src': video.get('src'),
'alt': video.get('alt'),
"type": "video"
})
# Extract all audio tags into [{src: '', alt: ''}]
for audio in body.find_all('audio'):
media['audios'].append({
'src': audio.get('src'),
'alt': audio.get('alt'),
"type": "audio"
})
# Replace images with their alt text or remove them if no alt text is available
for img in body.find_all('img'):
alt_text = img.get('alt')
@@ -299,7 +328,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
return{
'markdown': markdown,
'cleaned_html': cleaned_html,
'success': True
'success': True,
'media': media
}
except Exception as e:

View File

@@ -110,6 +110,7 @@ class WebCrawler:
"markdown": cached[3],
"extracted_content": cached[4],
"success": cached[5],
"media": json.loads(cached[6]),
"error_message": "",
}
)
@@ -129,6 +130,7 @@ class WebCrawler:
cleaned_html = result.get("cleaned_html", html)
markdown = result.get("markdown", "")
media = result.get("media", [])
# Print a professional log-style message showing the time taken and that crawling is done
if verbose:
@@ -163,6 +165,7 @@ class WebCrawler:
markdown,
extracted_content,
success,
json.dumps(media),
)
return CrawlResult(
@@ -170,6 +173,7 @@ class WebCrawler:
html=html,
cleaned_html=cleaned_html,
markdown=markdown,
media=media,
extracted_content=extracted_content,
success=success,
error_message=error_message,

View File

@@ -136,7 +136,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
for result in results:
result.html = None
return {"results": [result.dict() for result in results]}
return {"results": [result.model_dump() for result in results]}
finally:
async with lock:
current_requests -= 1

View File

@@ -137,6 +137,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
document.getElementById("markdown-result").textContent = result.markdown;
document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
// Update code examples dynamically
const extractionStrategy = data.extraction_strategy;

View File

@@ -1,4 +1,4 @@
<section class="try-it py-8 px-16 pb-20 bg-zinc-900">
<section class="try-it py-8 px-16 pb-20 bg-zinc-900 overflow-hidden">
<div class="container mx-auto ">
<h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
<div class="flex gap-4">
@@ -135,7 +135,7 @@
<div id="loading" class="hidden">
<p class="text-white">Loading... Please wait.</p>
</div>
<div id="result" class="flex-1">
<div id="result" class="flex-1 overflow-x-auto">
<div class="tab-buttons flex gap-2">
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
JSON
@@ -149,15 +149,19 @@
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
Markdown
</button>
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
Media
</button>
</div>
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
<pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
<pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
<pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
</div>
</div>
<div id="code_help" class="flex-1">
<div id="code_help" class="flex-1 overflow-x-auto">
<div class="tab-buttons flex gap-2">
<button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
cURL

View File

@@ -26,7 +26,7 @@ class CustomInstallCommand(install):
setup(
name="Crawl4AI",
version="0.2.2",
version="0.2.3",
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
long_description=open("README.md").read(),
long_description_content_type="text/markdown",