- Extract all media tags
- Take screenshot of the page
This commit is contained in:
unclecode
2024-06-07 15:23:13 +08:00
parent aead6de888
commit 0533aeb814
8 changed files with 90 additions and 22 deletions

View File

@@ -1,7 +1,6 @@
import os import os
from pathlib import Path from pathlib import Path
import sqlite3 import sqlite3
from typing import Optional
from typing import Optional, Tuple from typing import Optional, Tuple
DB_PATH = os.path.join(Path.home(), ".crawl4ai") DB_PATH = os.path.join(Path.home(), ".crawl4ai")
@@ -19,22 +18,34 @@ def init_db():
cleaned_html TEXT, cleaned_html TEXT,
markdown TEXT, markdown TEXT,
extracted_content TEXT, extracted_content TEXT,
success BOOLEAN success BOOLEAN,
media TEXT
) )
''') ''')
conn.commit() conn.commit()
conn.close() conn.close()
def check_db_path(): def alter_db_add_media():
if not DB_PATH:
raise ValueError("Database path is not set or is empty.")
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
check_db_path() check_db_path()
try: try:
conn = sqlite3.connect(DB_PATH) conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,)) cursor.execute('ALTER TABLE crawled_data ADD COLUMN media TEXT DEFAULT ""')
conn.commit()
conn.close()
except Exception as e:
print(f"Error altering database to add media column: {e}")
def check_db_path():
if not DB_PATH:
raise ValueError("Database path is not set or is empty.")
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]:
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media FROM crawled_data WHERE url = ?', (url,))
result = cursor.fetchone() result = cursor.fetchone()
conn.close() conn.close()
return result return result
@@ -42,21 +53,22 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
print(f"Error retrieving cached URL: {e}") print(f"Error retrieving cached URL: {e}")
return None return None
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool): def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = ""):
check_db_path() check_db_path()
try: try:
conn = sqlite3.connect(DB_PATH) conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute(''' cursor.execute('''
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success) INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media)
VALUES (?, ?, ?, ?, ?, ?) VALUES (?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET ON CONFLICT(url) DO UPDATE SET
html = excluded.html, html = excluded.html,
cleaned_html = excluded.cleaned_html, cleaned_html = excluded.cleaned_html,
markdown = excluded.markdown, markdown = excluded.markdown,
extracted_content = excluded.extracted_content, extracted_content = excluded.extracted_content,
success = excluded.success success = excluded.success,
''', (url, html, cleaned_html, markdown, extracted_content, success)) media = excluded.media
''', (url, html, cleaned_html, markdown, extracted_content, success, media))
conn.commit() conn.commit()
conn.close() conn.close()
except Exception as e: except Exception as e:
@@ -96,3 +108,19 @@ def flush_db():
conn.close() conn.close()
except Exception as e: except Exception as e:
print(f"Error flushing database: {e}") print(f"Error flushing database: {e}")
def update_existing_records():
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('UPDATE crawled_data SET media = "" WHERE media IS NULL')
conn.commit()
conn.close()
except Exception as e:
print(f"Error updating existing records: {e}")
if __name__ == "__main__":
init_db() # Initialize the database if not already initialized
alter_db_add_media() # Add the new column to the table
update_existing_records() # Update existing records to set the new column to an empty string

View File

@@ -1,5 +1,5 @@
from pydantic import BaseModel, HttpUrl from pydantic import BaseModel, HttpUrl
from typing import List from typing import List, Dict
class UrlModel(BaseModel): class UrlModel(BaseModel):
url: HttpUrl url: HttpUrl
@@ -10,6 +10,7 @@ class CrawlResult(BaseModel):
html: str html: str
success: bool success: bool
cleaned_html: str = None cleaned_html: str = None
media: Dict[str, List[Dict]] = {}
markdown: str = None markdown: str = None
extracted_content: str = None extracted_content: str = None
metadata: dict = None metadata: dict = None

View File

@@ -180,6 +180,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
if tag.name != 'img': if tag.name != 'img':
tag.attrs = {} tag.attrs = {}
# Extract all img tags into [{src: '', alt: ''}]
media = {
'images': [],
'videos': [],
'audios': []
}
for img in body.find_all('img'):
media['images'].append({
'src': img.get('src'),
'alt': img.get('alt'),
"type": "image"
})
# Extract all video tags into [{src: '', alt: ''}]
for video in body.find_all('video'):
media['videos'].append({
'src': video.get('src'),
'alt': video.get('alt'),
"type": "video"
})
# Extract all audio tags into [{src: '', alt: ''}]
for audio in body.find_all('audio'):
media['audios'].append({
'src': audio.get('src'),
'alt': audio.get('alt'),
"type": "audio"
})
# Replace images with their alt text or remove them if no alt text is available # Replace images with their alt text or remove them if no alt text is available
for img in body.find_all('img'): for img in body.find_all('img'):
alt_text = img.get('alt') alt_text = img.get('alt')
@@ -299,7 +328,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
return{ return{
'markdown': markdown, 'markdown': markdown,
'cleaned_html': cleaned_html, 'cleaned_html': cleaned_html,
'success': True 'success': True,
'media': media
} }
except Exception as e: except Exception as e:

View File

@@ -110,6 +110,7 @@ class WebCrawler:
"markdown": cached[3], "markdown": cached[3],
"extracted_content": cached[4], "extracted_content": cached[4],
"success": cached[5], "success": cached[5],
"media": json.loads(cached[6]),
"error_message": "", "error_message": "",
} }
) )
@@ -129,6 +130,7 @@ class WebCrawler:
cleaned_html = result.get("cleaned_html", html) cleaned_html = result.get("cleaned_html", html)
markdown = result.get("markdown", "") markdown = result.get("markdown", "")
media = result.get("media", [])
# Print a professional LOG style message, show time taken and say crawling is done # Print a professional LOG style message, show time taken and say crawling is done
if verbose: if verbose:
@@ -163,6 +165,7 @@ class WebCrawler:
markdown, markdown,
extracted_content, extracted_content,
success, success,
json.dumps(media),
) )
return CrawlResult( return CrawlResult(
@@ -170,6 +173,7 @@ class WebCrawler:
html=html, html=html,
cleaned_html=cleaned_html, cleaned_html=cleaned_html,
markdown=markdown, markdown=markdown,
media=media,
extracted_content=extracted_content, extracted_content=extracted_content,
success=success, success=success,
error_message=error_message, error_message=error_message,

View File

@@ -136,7 +136,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
for result in results: for result in results:
result.html = None result.html = None
return {"results": [result.dict() for result in results]} return {"results": [result.model_dump() for result in results]}
finally: finally:
async with lock: async with lock:
current_requests -= 1 current_requests -= 1

View File

@@ -137,6 +137,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2); document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
document.getElementById("cleaned-html-result").textContent = result.cleaned_html; document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
document.getElementById("markdown-result").textContent = result.markdown; document.getElementById("markdown-result").textContent = result.markdown;
document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
// Update code examples dynamically // Update code examples dynamically
const extractionStrategy = data.extraction_strategy; const extractionStrategy = data.extraction_strategy;

View File

@@ -1,4 +1,4 @@
<section class="try-it py-8 px-16 pb-20 bg-zinc-900"> <section class="try-it py-8 px-16 pb-20 bg-zinc-900 overflow-hidden">
<div class="container mx-auto "> <div class="container mx-auto ">
<h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2> <h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
<div class="flex gap-4"> <div class="flex gap-4">
@@ -135,7 +135,7 @@
<div id="loading" class="hidden"> <div id="loading" class="hidden">
<p class="text-white">Loading... Please wait.</p> <p class="text-white">Loading... Please wait.</p>
</div> </div>
<div id="result" class="flex-1"> <div id="result" class="flex-1 overflow-x-auto">
<div class="tab-buttons flex gap-2"> <div class="tab-buttons flex gap-2">
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json"> <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
JSON JSON
@@ -149,15 +149,19 @@
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown"> <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
Markdown Markdown
</button> </button>
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
Media
</button>
</div> </div>
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm"> <div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre> <pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
<pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre> <pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
<pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre> <pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
<pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
</div> </div>
</div> </div>
<div id="code_help" class="flex-1"> <div id="code_help" class="flex-1 overflow-x-auto">
<div class="tab-buttons flex gap-2"> <div class="tab-buttons flex gap-2">
<button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl"> <button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
cURL cURL

View File

@@ -26,7 +26,7 @@ class CustomInstallCommand(install):
setup( setup(
name="Crawl4AI", name="Crawl4AI",
version="0.2.2", version="0.2.3",
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper", description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
long_description=open("README.md").read(), long_description=open("README.md").read(),
long_description_content_type="text/markdown", long_description_content_type="text/markdown",