v0.2.3:

- Extract all media tags - Take screenshot of the page
2024-06-07 15:23:13 +08:00
parent aead6de888
commit 0533aeb814
8 changed files with 90 additions and 22 deletions
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -1,13 +1,12 @@
 import os
 from pathlib import Path
 import sqlite3
 from typing import Optional
 from typing import Optional, Tuple
 DB_PATH = os.path.join(Path.home(), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
-        
+
 def init_db():
    global DB_PATH
    conn = sqlite3.connect(DB_PATH)
@@ -19,22 +18,34 @@ def init_db():
            cleaned_html TEXT,
            markdown TEXT,
            extracted_content TEXT,
-            success BOOLEAN
+            success BOOLEAN,
            media TEXT
        )
    ''')
    conn.commit()
    conn.close()
-def check_db_path():
+def alter_db_add_media():
    if not DB_PATH:
        raise ValueError("Database path is not set or is empty.")
 def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
-        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,))
+        cursor.execute('ALTER TABLE crawled_data ADD COLUMN media TEXT DEFAULT ""')
        conn.commit()
        conn.close()
    except Exception as e:
        print(f"Error altering database to add media column: {e}")
 def check_db_path():
    """Verify that the module-level DB_PATH has been configured.

    Raises:
        ValueError: If DB_PATH is empty or otherwise falsy.
    """
    # Guard clause: a usable path means there is nothing to report.
    if DB_PATH:
        return
    raise ValueError("Database path is not set or is empty.")
 def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool, str]]:
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media FROM crawled_data WHERE url = ?', (url,))
        result = cursor.fetchone()
        conn.close()
        return result
@@ -42,21 +53,22 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
        print(f"Error retrieving cached URL: {e}")
        return None
-def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
+def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = ""):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('''
-            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
+            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media)
-            VALUES (?, ?, ?, ?, ?, ?)
+            VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
                markdown = excluded.markdown,
                extracted_content = excluded.extracted_content,
-                success = excluded.success
+                success = excluded.success,
-        ''', (url, html, cleaned_html, markdown, extracted_content, success))
+                media = excluded.media
        ''', (url, html, cleaned_html, markdown, extracted_content, success, media))
        conn.commit()
        conn.close()
    except Exception as e:
@@ -95,4 +107,20 @@ def flush_db():
        conn.commit()
        conn.close()
    except Exception as e:
-        print(f"Error flushing database: {e}")
+        print(f"Error flushing database: {e}")
 def update_existing_records():
    """Backfill the ``media`` column of rows stored before it existed.

    Every row of ``crawled_data`` whose ``media`` value is NULL is set to
    an empty string. Database errors are reported with ``print`` and
    swallowed, matching the best-effort style of the other helpers here.

    Raises:
        ValueError: Propagated from ``check_db_path`` when DB_PATH is unset.
    """
    check_db_path()
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        # Use a single-quoted SQL string literal: in SQLite double quotes
        # denote identifiers and only fall back to strings via a legacy
        # quirk that can be disabled at build time.
        cursor.execute("UPDATE crawled_data SET media = '' WHERE media IS NULL")
        conn.commit()
    except Exception as e:
        print(f"Error updating existing records: {e}")
    finally:
        # Close the connection on the failure path too, so a failed
        # statement does not leak the handle.
        if conn is not None:
            conn.close()
 if __name__ == "__main__":
    # One-off migration when run as a script, in this order:
    #   1. init_db                 - initialize the database if needed
    #   2. alter_db_add_media      - add the new media column to the table
    #   3. update_existing_records - set media to "" on pre-existing rows
    for migration_step in (init_db, alter_db_add_media, update_existing_records):
        migration_step()
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -1,5 +1,5 @@
 from pydantic import BaseModel, HttpUrl
-from typing import List
+from typing import List, Dict
 class UrlModel(BaseModel):
    url: HttpUrl
@@ -10,6 +10,7 @@ class CrawlResult(BaseModel):
    html: str
    success: bool
    cleaned_html: str = None
    media: Dict[str, List[Dict]] = {}
    markdown: str = None
    extracted_content: str = None
    metadata: dict = None
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -180,6 +180,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
            if tag.name != 'img':
                tag.attrs = {}
        # Extract all img tags into [{src: '', alt: ''}]
        media = {
            'images': [],
            'videos': [],
            'audios': []
        }
        for img in body.find_all('img'):
            media['images'].append({
                'src': img.get('src'),
                'alt': img.get('alt'),
                "type": "image"
            })
        # Extract all video tags into [{src: '', alt: ''}]
        for video in body.find_all('video'):
            media['videos'].append({
                'src': video.get('src'),
                'alt': video.get('alt'),
                "type": "video"
            })
        # Extract all audio tags into [{src: '', alt: ''}]
        for audio in body.find_all('audio'):
            media['audios'].append({
                'src': audio.get('src'),
                'alt': audio.get('alt'),
                "type": "audio"
            })
        # Replace images with their alt text or remove them if no alt text is available
        for img in body.find_all('img'):
            alt_text = img.get('alt')
@@ -299,7 +328,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
        return{
            'markdown': markdown,
            'cleaned_html': cleaned_html,
-            'success': True
+            'success': True,
            'media': media
        }
    except Exception as e:
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -110,6 +110,7 @@ class WebCrawler:
                        "markdown": cached[3],
                        "extracted_content": cached[4],
                        "success": cached[5],
                        "media": json.loads(cached[6]),
                        "error_message": "",
                    }
                )
@@ -129,6 +130,7 @@ class WebCrawler:
        cleaned_html = result.get("cleaned_html", html)
        markdown = result.get("markdown", "")
        media = result.get("media", [])
        # Print a professional log-style message showing the time taken and that crawling is done
        if verbose:
@@ -163,6 +165,7 @@ class WebCrawler:
            markdown,
            extracted_content,
            success,
            json.dumps(media),
        )
        return CrawlResult(
@@ -170,6 +173,7 @@ class WebCrawler:
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown,
            media=media,
            extracted_content=extracted_content,
            success=success,
            error_message=error_message,
--- a/main.py
+++ b/main.py
@@ -136,7 +136,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
            for result in results:
                result.html = None
-        return {"results": [result.dict() for result in results]}
+        return {"results": [result.model_dump() for result in results]}
    finally:
        async with lock:
            current_requests -= 1
--- a/pages/app.js
+++ b/pages/app.js
@@ -137,6 +137,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
            document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
            document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
            document.getElementById("markdown-result").textContent = result.markdown;
            document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
            // Update code examples dynamically
            const extractionStrategy = data.extraction_strategy;
--- a/pages/partial/try_it.html
+++ b/pages/partial/try_it.html
@@ -1,4 +1,4 @@
-<section class="try-it py-8 px-16 pb-20 bg-zinc-900">
+<section class="try-it py-8 px-16 pb-20 bg-zinc-900 overflow-hidden">
    <div class="container mx-auto ">
        <h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
        <div class="flex gap-4">
@@ -135,7 +135,7 @@
            <div id="loading" class="hidden">
                <p class="text-white">Loading... Please wait.</p>
            </div>
-            <div id="result" class="flex-1">
+            <div id="result" class="flex-1  overflow-x-auto">
                <div class="tab-buttons flex gap-2">
                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
                        JSON
@@ -149,15 +149,19 @@
                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
                        Markdown
                    </button>
                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
                        Medias
                    </button>
                </div>
                <div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
                    <pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
                    <pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
                    <pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
                    <pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
                </div>
            </div>
-            <div id="code_help" class="flex-1">
+            <div id="code_help" class="flex-1  overflow-x-auto">
                <div class="tab-buttons flex gap-2">
                    <button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
                        cURL
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@ class CustomInstallCommand(install):
 setup(
    name="Crawl4AI",
-    version="0.2.2",
+    version="0.2.3",
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
    long_description=open("README.md").read(),
    long_description_content_type="text/markdown",