53 lines
1.7 KiB
Python
53 lines
1.7 KiB
Python
import sqlite3
|
|
from typing import Optional
|
|
|
|
def init_db(db_path: str):
|
|
conn = sqlite3.connect(db_path)
|
|
cursor = conn.cursor()
|
|
cursor.execute('''
|
|
CREATE TABLE IF NOT EXISTS crawled_data (
|
|
url TEXT PRIMARY KEY,
|
|
html TEXT,
|
|
cleaned_html TEXT,
|
|
markdown TEXT,
|
|
parsed_json TEXT,
|
|
success BOOLEAN
|
|
)
|
|
''')
|
|
conn.commit()
|
|
conn.close()
|
|
|
|
def get_cached_url(db_path: str, url: str) -> Optional[tuple]:
|
|
conn = sqlite3.connect(db_path)
|
|
cursor = conn.cursor()
|
|
cursor.execute('SELECT url, html, cleaned_html, markdown, parsed_json, success FROM crawled_data WHERE url = ?', (url,))
|
|
result = cursor.fetchone()
|
|
conn.close()
|
|
return result
|
|
|
|
def cache_url(db_path: str, url: str, html: str, cleaned_html: str, markdown: str, parsed_json: str, success: bool):
|
|
conn = sqlite3.connect(db_path)
|
|
cursor = conn.cursor()
|
|
cursor.execute('''
|
|
INSERT INTO crawled_data (url, html, cleaned_html, markdown, parsed_json, success)
|
|
VALUES (?, ?, ?, ?, ?, ?)
|
|
ON CONFLICT(url) DO UPDATE SET
|
|
html = excluded.html,
|
|
cleaned_html = excluded.cleaned_html,
|
|
markdown = excluded.markdown,
|
|
parsed_json = excluded.parsed_json,
|
|
success = excluded.success
|
|
''', (str(url), html, cleaned_html, markdown, parsed_json, success))
|
|
conn.commit()
|
|
conn.close()
|
|
|
|
def get_total_count(db_path: str) -> int:
|
|
try:
|
|
conn = sqlite3.connect(db_path)
|
|
cursor = conn.cursor()
|
|
cursor.execute('SELECT COUNT(*) FROM crawled_data')
|
|
result = cursor.fetchone()
|
|
conn.close()
|
|
return result[0]
|
|
except Exception as e:
|
|
return 0 |