v0.2.3:
- Extract all media tags
- Take screenshot of the page
This commit is contained in:
@@ -1,13 +1,12 @@
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from typing import Optional
|
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
|
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
|
||||||
os.makedirs(DB_PATH, exist_ok=True)
|
os.makedirs(DB_PATH, exist_ok=True)
|
||||||
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
|
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
|
||||||
|
|
||||||
def init_db():
|
def init_db():
|
||||||
global DB_PATH
|
global DB_PATH
|
||||||
conn = sqlite3.connect(DB_PATH)
|
conn = sqlite3.connect(DB_PATH)
|
||||||
@@ -19,22 +18,34 @@ def init_db():
|
|||||||
cleaned_html TEXT,
|
cleaned_html TEXT,
|
||||||
markdown TEXT,
|
markdown TEXT,
|
||||||
extracted_content TEXT,
|
extracted_content TEXT,
|
||||||
success BOOLEAN
|
success BOOLEAN,
|
||||||
|
media TEXT
|
||||||
)
|
)
|
||||||
''')
|
''')
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
def check_db_path():
|
def alter_db_add_media():
|
||||||
if not DB_PATH:
|
|
||||||
raise ValueError("Database path is not set or is empty.")
|
|
||||||
|
|
||||||
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
|
|
||||||
check_db_path()
|
check_db_path()
|
||||||
try:
|
try:
|
||||||
conn = sqlite3.connect(DB_PATH)
|
conn = sqlite3.connect(DB_PATH)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,))
|
cursor.execute('ALTER TABLE crawled_data ADD COLUMN media TEXT DEFAULT ""')
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error altering database to add media column: {e}")
|
||||||
|
|
||||||
|
def check_db_path():
|
||||||
|
if not DB_PATH:
|
||||||
|
raise ValueError("Database path is not set or is empty.")
|
||||||
|
|
||||||
|
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]:
|
||||||
|
check_db_path()
|
||||||
|
try:
|
||||||
|
conn = sqlite3.connect(DB_PATH)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media FROM crawled_data WHERE url = ?', (url,))
|
||||||
result = cursor.fetchone()
|
result = cursor.fetchone()
|
||||||
conn.close()
|
conn.close()
|
||||||
return result
|
return result
|
||||||
@@ -42,21 +53,22 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
|
|||||||
print(f"Error retrieving cached URL: {e}")
|
print(f"Error retrieving cached URL: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
|
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = ""):
|
||||||
check_db_path()
|
check_db_path()
|
||||||
try:
|
try:
|
||||||
conn = sqlite3.connect(DB_PATH)
|
conn = sqlite3.connect(DB_PATH)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
|
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media)
|
||||||
VALUES (?, ?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
ON CONFLICT(url) DO UPDATE SET
|
ON CONFLICT(url) DO UPDATE SET
|
||||||
html = excluded.html,
|
html = excluded.html,
|
||||||
cleaned_html = excluded.cleaned_html,
|
cleaned_html = excluded.cleaned_html,
|
||||||
markdown = excluded.markdown,
|
markdown = excluded.markdown,
|
||||||
extracted_content = excluded.extracted_content,
|
extracted_content = excluded.extracted_content,
|
||||||
success = excluded.success
|
success = excluded.success,
|
||||||
''', (url, html, cleaned_html, markdown, extracted_content, success))
|
media = excluded.media
|
||||||
|
''', (url, html, cleaned_html, markdown, extracted_content, success, media))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -95,4 +107,20 @@ def flush_db():
|
|||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error flushing database: {e}")
|
print(f"Error flushing database: {e}")
|
||||||
|
|
||||||
|
def update_existing_records():
|
||||||
|
check_db_path()
|
||||||
|
try:
|
||||||
|
conn = sqlite3.connect(DB_PATH)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute('UPDATE crawled_data SET media = "" WHERE media IS NULL')
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error updating existing records: {e}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
init_db() # Initialize the database if not already initialized
|
||||||
|
alter_db_add_media() # Add the new column to the table
|
||||||
|
update_existing_records() # Update existing records to set the new column to an empty string
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from pydantic import BaseModel, HttpUrl
|
from pydantic import BaseModel, HttpUrl
|
||||||
from typing import List
|
from typing import List, Dict
|
||||||
|
|
||||||
class UrlModel(BaseModel):
|
class UrlModel(BaseModel):
|
||||||
url: HttpUrl
|
url: HttpUrl
|
||||||
@@ -10,6 +10,7 @@ class CrawlResult(BaseModel):
|
|||||||
html: str
|
html: str
|
||||||
success: bool
|
success: bool
|
||||||
cleaned_html: str = None
|
cleaned_html: str = None
|
||||||
|
media: Dict[str, List[Dict]] = {}
|
||||||
markdown: str = None
|
markdown: str = None
|
||||||
extracted_content: str = None
|
extracted_content: str = None
|
||||||
metadata: dict = None
|
metadata: dict = None
|
||||||
|
|||||||
@@ -180,6 +180,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
|
|||||||
if tag.name != 'img':
|
if tag.name != 'img':
|
||||||
tag.attrs = {}
|
tag.attrs = {}
|
||||||
|
|
||||||
|
# Extract all img tags into [{src: '', alt: ''}]
|
||||||
|
media = {
|
||||||
|
'images': [],
|
||||||
|
'videos': [],
|
||||||
|
'audios': []
|
||||||
|
}
|
||||||
|
for img in body.find_all('img'):
|
||||||
|
media['images'].append({
|
||||||
|
'src': img.get('src'),
|
||||||
|
'alt': img.get('alt'),
|
||||||
|
"type": "image"
|
||||||
|
})
|
||||||
|
|
||||||
|
# Extract all video tags into [{src: '', alt: ''}]
|
||||||
|
for video in body.find_all('video'):
|
||||||
|
media['videos'].append({
|
||||||
|
'src': video.get('src'),
|
||||||
|
'alt': video.get('alt'),
|
||||||
|
"type": "video"
|
||||||
|
})
|
||||||
|
|
||||||
|
# Extract all audio tags into [{src: '', alt: ''}]
|
||||||
|
for audio in body.find_all('audio'):
|
||||||
|
media['audios'].append({
|
||||||
|
'src': audio.get('src'),
|
||||||
|
'alt': audio.get('alt'),
|
||||||
|
"type": "audio"
|
||||||
|
})
|
||||||
|
|
||||||
# Replace images with their alt text or remove them if no alt text is available
|
# Replace images with their alt text or remove them if no alt text is available
|
||||||
for img in body.find_all('img'):
|
for img in body.find_all('img'):
|
||||||
alt_text = img.get('alt')
|
alt_text = img.get('alt')
|
||||||
@@ -299,7 +328,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
|
|||||||
return{
|
return{
|
||||||
'markdown': markdown,
|
'markdown': markdown,
|
||||||
'cleaned_html': cleaned_html,
|
'cleaned_html': cleaned_html,
|
||||||
'success': True
|
'success': True,
|
||||||
|
'media': media
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -110,6 +110,7 @@ class WebCrawler:
|
|||||||
"markdown": cached[3],
|
"markdown": cached[3],
|
||||||
"extracted_content": cached[4],
|
"extracted_content": cached[4],
|
||||||
"success": cached[5],
|
"success": cached[5],
|
||||||
|
"media": json.loads(cached[6]),
|
||||||
"error_message": "",
|
"error_message": "",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -129,6 +130,7 @@ class WebCrawler:
|
|||||||
|
|
||||||
cleaned_html = result.get("cleaned_html", html)
|
cleaned_html = result.get("cleaned_html", html)
|
||||||
markdown = result.get("markdown", "")
|
markdown = result.get("markdown", "")
|
||||||
|
media = result.get("media", [])
|
||||||
|
|
||||||
# Print a professional LOG style message, show time taken and say crawling is done
|
# Print a professional LOG style message, show time taken and say crawling is done
|
||||||
if verbose:
|
if verbose:
|
||||||
@@ -163,6 +165,7 @@ class WebCrawler:
|
|||||||
markdown,
|
markdown,
|
||||||
extracted_content,
|
extracted_content,
|
||||||
success,
|
success,
|
||||||
|
json.dumps(media),
|
||||||
)
|
)
|
||||||
|
|
||||||
return CrawlResult(
|
return CrawlResult(
|
||||||
@@ -170,6 +173,7 @@ class WebCrawler:
|
|||||||
html=html,
|
html=html,
|
||||||
cleaned_html=cleaned_html,
|
cleaned_html=cleaned_html,
|
||||||
markdown=markdown,
|
markdown=markdown,
|
||||||
|
media=media,
|
||||||
extracted_content=extracted_content,
|
extracted_content=extracted_content,
|
||||||
success=success,
|
success=success,
|
||||||
error_message=error_message,
|
error_message=error_message,
|
||||||
|
|||||||
2
main.py
2
main.py
@@ -136,7 +136,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
|||||||
for result in results:
|
for result in results:
|
||||||
result.html = None
|
result.html = None
|
||||||
|
|
||||||
return {"results": [result.dict() for result in results]}
|
return {"results": [result.model_dump() for result in results]}
|
||||||
finally:
|
finally:
|
||||||
async with lock:
|
async with lock:
|
||||||
current_requests -= 1
|
current_requests -= 1
|
||||||
|
|||||||
@@ -137,6 +137,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
|
|||||||
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
||||||
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
||||||
document.getElementById("markdown-result").textContent = result.markdown;
|
document.getElementById("markdown-result").textContent = result.markdown;
|
||||||
|
document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
|
||||||
|
|
||||||
// Update code examples dynamically
|
// Update code examples dynamically
|
||||||
const extractionStrategy = data.extraction_strategy;
|
const extractionStrategy = data.extraction_strategy;
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
<section class="try-it py-8 px-16 pb-20 bg-zinc-900">
|
<section class="try-it py-8 px-16 pb-20 bg-zinc-900 overflow-hidden">
|
||||||
<div class="container mx-auto ">
|
<div class="container mx-auto ">
|
||||||
<h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
|
<h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
|
||||||
<div class="flex gap-4">
|
<div class="flex gap-4">
|
||||||
@@ -135,7 +135,7 @@
|
|||||||
<div id="loading" class="hidden">
|
<div id="loading" class="hidden">
|
||||||
<p class="text-white">Loading... Please wait.</p>
|
<p class="text-white">Loading... Please wait.</p>
|
||||||
</div>
|
</div>
|
||||||
<div id="result" class="flex-1">
|
<div id="result" class="flex-1 overflow-x-auto">
|
||||||
<div class="tab-buttons flex gap-2">
|
<div class="tab-buttons flex gap-2">
|
||||||
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
|
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
|
||||||
JSON
|
JSON
|
||||||
@@ -149,15 +149,19 @@
|
|||||||
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
|
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
|
||||||
Markdown
|
Markdown
|
||||||
</button>
|
</button>
|
||||||
|
<button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
|
||||||
|
Medias
|
||||||
|
</button>
|
||||||
</div>
|
</div>
|
||||||
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
||||||
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
|
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
|
||||||
<pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
|
<pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
|
||||||
<pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
|
<pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
|
||||||
|
<pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div id="code_help" class="flex-1">
|
<div id="code_help" class="flex-1 overflow-x-auto">
|
||||||
<div class="tab-buttons flex gap-2">
|
<div class="tab-buttons flex gap-2">
|
||||||
<button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
|
<button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
|
||||||
cURL
|
cURL
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -26,7 +26,7 @@ class CustomInstallCommand(install):
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="Crawl4AI",
|
name="Crawl4AI",
|
||||||
version="0.2.2",
|
version="0.2.3",
|
||||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||||
long_description=open("README.md").read(),
|
long_description=open("README.md").read(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
|
|||||||
Reference in New Issue
Block a user