From 0533aeb81461d64a2c1da2acc0f2ba09515af92b Mon Sep 17 00:00:00 2001
From: unclecode
Date: Fri, 7 Jun 2024 15:23:13 +0800
Subject: [PATCH] v0.2.3:
 - Extract all media tags
 - Take screenshot of the page

---
 crawl4ai/database.py      | 58 +++++++++++++++++++++++++++++----------
 crawl4ai/models.py        |  3 +-
 crawl4ai/utils.py         | 32 ++++++++++++++++++++-
 crawl4ai/web_crawler.py   |  4 +++
 main.py                   |  2 +-
 pages/app.js              |  1 +
 pages/partial/try_it.html | 10 +++++--
 setup.py                  |  2 +-
 8 files changed, 90 insertions(+), 22 deletions(-)

diff --git a/crawl4ai/database.py b/crawl4ai/database.py
index 391d3f4f..bcce8474 100644
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -1,13 +1,12 @@
 import os
 from pathlib import Path
 import sqlite3
-from typing import Optional
 from typing import Optional, Tuple
 
 DB_PATH = os.path.join(Path.home(), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
-
+
 def init_db():
     global DB_PATH
     conn = sqlite3.connect(DB_PATH)
@@ -19,22 +18,34 @@ def init_db():
             cleaned_html TEXT,
             markdown TEXT,
             extracted_content TEXT,
-            success BOOLEAN
+            success BOOLEAN,
+            media TEXT
         )
     ''')
     conn.commit()
     conn.close()
 
-def check_db_path():
-    if not DB_PATH:
-        raise ValueError("Database path is not set or is empty.")
-
-def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
+def alter_db_add_media():
     check_db_path()
     try:
         conn = sqlite3.connect(DB_PATH)
         cursor = conn.cursor()
-        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,))
+        cursor.execute('ALTER TABLE crawled_data ADD COLUMN media TEXT DEFAULT ""')
+        conn.commit()
+        conn.close()
+    except Exception as e:
+        print(f"Error altering database to add media column: {e}")
+
+def check_db_path():
+    if not DB_PATH:
+        raise ValueError("Database path is not set or is empty.")
+
+def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]:
+    check_db_path()
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        cursor = conn.cursor()
+        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media FROM crawled_data WHERE url = ?', (url,))
         result = cursor.fetchone()
         conn.close()
         return result
@@ -42,21 +53,22 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
         print(f"Error retrieving cached URL: {e}")
         return None
 
-def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
+def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = ""):
     check_db_path()
     try:
         conn = sqlite3.connect(DB_PATH)
         cursor = conn.cursor()
         cursor.execute('''
-            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
-            VALUES (?, ?, ?, ?, ?, ?)
+            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media)
+            VALUES (?, ?, ?, ?, ?, ?, ?)
             ON CONFLICT(url) DO UPDATE SET
                 html = excluded.html,
                 cleaned_html = excluded.cleaned_html,
                 markdown = excluded.markdown,
                 extracted_content = excluded.extracted_content,
-                success = excluded.success
-        ''', (url, html, cleaned_html, markdown, extracted_content, success))
+                success = excluded.success,
+                media = excluded.media
+        ''', (url, html, cleaned_html, markdown, extracted_content, success, media))
         conn.commit()
         conn.close()
     except Exception as e:
@@ -95,4 +107,20 @@ def flush_db():
         conn.commit()
         conn.close()
     except Exception as e:
-        print(f"Error flushing database: {e}")
\ No newline at end of file
+        print(f"Error flushing database: {e}")
+
+def update_existing_records():
+    check_db_path()
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        cursor = conn.cursor()
+        cursor.execute('UPDATE crawled_data SET media = "" WHERE media IS NULL')
+        conn.commit()
+        conn.close()
+    except Exception as e:
+        print(f"Error updating existing records: {e}")
+
+if __name__ == "__main__":
+    init_db() # Initialize the database if not already initialized
+    alter_db_add_media() # Add the new column to the table
+    update_existing_records() # Update existing records to set the new column to an empty string
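
Note (not part of the patch): the new __main__ block makes crawl4ai/database.py
runnable as a one-shot migration for databases created before this version. A
minimal sketch of the same flow from Python, using only the functions defined
in the hunk above:

    from crawl4ai.database import init_db, alter_db_add_media, update_existing_records

    init_db()                  # creates the table; fresh installs get the media column here
    alter_db_add_media()       # adds the media column to a pre-existing database
    update_existing_records()  # backfills NULL media values with ""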
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index c2c2d61e..bd791b1d 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -1,5 +1,5 @@
 from pydantic import BaseModel, HttpUrl
-from typing import List
+from typing import List, Dict
 
 class UrlModel(BaseModel):
     url: HttpUrl
@@ -10,6 +10,7 @@ class CrawlResult(BaseModel):
     html: str
     success: bool
     cleaned_html: str = None
+    media: Dict[str, List[Dict]] = {}
     markdown: str = None
     extracted_content: str = None
     metadata: dict = None

diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index cbeca812..3121ccdc 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -180,6 +180,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
         if tag.name != 'img':
             tag.attrs = {}
 
+    # Extract all img tags into [{src: '', alt: ''}]
+    media = {
+        'images': [],
+        'videos': [],
+        'audios': []
+    }
+    for img in body.find_all('img'):
+        media['images'].append({
+            'src': img.get('src'),
+            'alt': img.get('alt'),
+            "type": "image"
+        })
+
+    # Extract all video tags into [{src: '', alt: ''}]
+    for video in body.find_all('video'):
+        media['videos'].append({
+            'src': video.get('src'),
+            'alt': video.get('alt'),
+            "type": "video"
+        })
+
+    # Extract all audio tags into [{src: '', alt: ''}]
+    for audio in body.find_all('audio'):
+        media['audios'].append({
+            'src': audio.get('src'),
+            'alt': audio.get('alt'),
+            "type": "audio"
+        })
+
     # Replace images with their alt text or remove them if no alt text is available
     for img in body.find_all('img'):
         alt_text = img.get('alt')
@@ -299,7 +328,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
         return{
             'markdown': markdown,
             'cleaned_html': cleaned_html,
-            'success': True
+            'success': True,
+            'media': media
         }
 
     except Exception as e:
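
Note (not part of the patch): the media mapping built in get_content_of_website
above, and carried on CrawlResult.media as Dict[str, List[Dict]], has this
shape (the values here are illustrative, not from a real page):

    media = {
        'images': [{'src': '/logo.png',    'alt': 'Logo', 'type': 'image'}],
        'videos': [{'src': '/intro.mp4',   'alt': None,   'type': 'video'}],
        'audios': [{'src': '/podcast.mp3', 'alt': None,   'type': 'audio'}],
    }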
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index 4535930c..a8f1f818 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -110,6 +110,7 @@ class WebCrawler:
                     "markdown": cached[3],
                     "extracted_content": cached[4],
                     "success": cached[5],
+                    "media": json.loads(cached[6]),
                     "error_message": "",
                 }
             )
@@ -129,6 +130,7 @@ class WebCrawler:
 
         cleaned_html = result.get("cleaned_html", html)
         markdown = result.get("markdown", "")
+        media = result.get("media", [])
 
         # Print a profession LOG style message, show time taken and say crawling is done
         if verbose:
@@ -163,6 +165,7 @@ class WebCrawler:
             markdown,
             extracted_content,
             success,
+            json.dumps(media),
         )
 
         return CrawlResult(
@@ -170,6 +173,7 @@ class WebCrawler:
             html=html,
             cleaned_html=cleaned_html,
             markdown=markdown,
+            media=media,
             extracted_content=extracted_content,
             success=success,
             error_message=error_message,

diff --git a/main.py b/main.py
index 5dca8771..26cc1880 100644
--- a/main.py
+++ b/main.py
@@ -136,7 +136,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
             for result in results:
                 result.html = None
 
-        return {"results": [result.dict() for result in results]}
+        return {"results": [result.model_dump() for result in results]}
     finally:
         async with lock:
             current_requests -= 1

diff --git a/pages/app.js b/pages/app.js
index 1a09969e..e56dd899 100644
--- a/pages/app.js
+++ b/pages/app.js
@@ -137,6 +137,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
         document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
         document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
         document.getElementById("markdown-result").textContent = result.markdown;
+        document.getElementById("media-result").textContent = JSON.stringify(result.media, null, 2);
 
         // Update code examples dynamically
         const extractionStrategy = data.extraction_strategy;
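
Note (not part of the patch): the main.py change swaps Pydantic v1's .dict()
for the equivalent v2 method. A self-contained check of the rename, using a
simplified stand-in for CrawlResult:

    from pydantic import BaseModel

    class Result(BaseModel):  # stand-in for crawl4ai's CrawlResult
        url: str
        success: bool

    r = Result(url="https://example.com", success=True)
    assert r.model_dump() == {"url": "https://example.com", "success": True}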
diff --git a/pages/partial/try_it.html b/pages/partial/try_it.html
index 544e69dd..3674944e 100644
--- a/pages/partial/try_it.html
+++ b/pages/partial/try_it.html
[The two hunks of this file (@@ -1,4 +1,4 @@ and @@ -135,7 +135,7 @@) lost
their HTML markup in extraction; only a "Try It Now" heading and what is
likely the media-result panel that pages/app.js writes into are recoverable.
The setup.py hunk listed in the diffstat is also missing from this excerpt.]
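
Note (not part of the patch): taken together, a caller would see the new field
roughly as sketched below. This assumes the WebCrawler.run() entry point from
crawl4ai/web_crawler.py; treat it as illustrative rather than a verified example:

    from crawl4ai.web_crawler import WebCrawler

    crawler = WebCrawler()
    result = crawler.run(url="https://example.com")
    for image in result.media.get('images', []):
        print(image['src'], image['alt'])  # each entry also carries 'type'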