Files
crawl4ai/crawler/database.py
2024-05-09 19:10:25 +08:00

53 lines
1.7 KiB
Python

import sqlite3
from typing import Optional
def init_db(db_path: str):
    """Create the ``crawled_data`` table in the SQLite database if it is missing.

    The table is keyed by ``url`` and stores the ``html``, ``cleaned_html``,
    ``markdown`` and ``parsed_json`` text columns plus a ``success`` flag.
    Calling this on a database that already has the table is a no-op.

    Args:
        db_path: Path passed to ``sqlite3.connect``.

    Raises:
        sqlite3.Error: If the database cannot be opened or the statement fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('''
CREATE TABLE IF NOT EXISTS crawled_data (
url TEXT PRIMARY KEY,
html TEXT,
cleaned_html TEXT,
markdown TEXT,
parsed_json TEXT,
success BOOLEAN
)
''')
        conn.commit()
    finally:
        # Close even when execute/commit raises so the handle is not leaked.
        conn.close()
def get_cached_url(db_path: str, url: str) -> Optional[tuple]:
    """Fetch the stored row for ``url`` from the ``crawled_data`` table.

    Args:
        db_path: Path passed to ``sqlite3.connect``.
        url: Value matched against the ``url`` column.

    Returns:
        A tuple ``(url, html, cleaned_html, markdown, parsed_json, success)``
        for the matching row, or ``None`` when no row matches.

    Raises:
        sqlite3.Error: If the database cannot be opened or the query fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('SELECT url, html, cleaned_html, markdown, parsed_json, success FROM crawled_data WHERE url = ?', (url,))
        return cursor.fetchone()
    finally:
        # Close even when the query raises so the handle is not leaked.
        conn.close()
def cache_url(db_path: str, url: str, html: str, cleaned_html: str, markdown: str, parsed_json: str, success: bool):
    """Insert or overwrite the ``crawled_data`` row for ``url``.

    If a row with the same ``url`` already exists, every other column is
    replaced with the newly supplied value (upsert on the primary key).

    Args:
        db_path: Path passed to ``sqlite3.connect``.
        url: Row key; converted with ``str()`` before being bound.
        html: Value for the ``html`` column.
        cleaned_html: Value for the ``cleaned_html`` column.
        markdown: Value for the ``markdown`` column.
        parsed_json: Value for the ``parsed_json`` column.
        success: Value for the ``success`` column.

    Raises:
        sqlite3.Error: If the database cannot be opened or the statement fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('''
INSERT INTO crawled_data (url, html, cleaned_html, markdown, parsed_json, success)
VALUES (?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
html = excluded.html,
cleaned_html = excluded.cleaned_html,
markdown = excluded.markdown,
parsed_json = excluded.parsed_json,
success = excluded.success
''', (str(url), html, cleaned_html, markdown, parsed_json, success))
        conn.commit()
    finally:
        # Close even when execute/commit raises so the handle is not leaked.
        conn.close()
def get_total_count(db_path: str) -> int:
    """Return the number of rows in the ``crawled_data`` table.

    This is a best-effort count: any failure while opening the database or
    running the query (for example, the table does not exist yet) yields ``0``
    instead of raising.

    Args:
        db_path: Path passed to ``sqlite3.connect``.

    Returns:
        The row count, or ``0`` if it could not be determined.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute('SELECT COUNT(*) FROM crawled_data')
        result = cursor.fetchone()
        return result[0]
    except Exception:
        # Deliberately broad: callers rely on getting 0 rather than an error.
        return 0
    finally:
        # Close on the failure path too; conn stays None if connect() failed.
        if conn is not None:
            conn.close()