Initial Commit
This commit is contained in:
53
crawler/database.py
Normal file
53
crawler/database.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import sqlite3
|
||||
from typing import Optional
|
||||
|
||||
def init_db(db_path: str):
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
CREATE TABLE IF NOT EXISTS crawled_data (
|
||||
url TEXT PRIMARY KEY,
|
||||
html TEXT,
|
||||
cleaned_html TEXT,
|
||||
markdown TEXT,
|
||||
parsed_json TEXT,
|
||||
success BOOLEAN
|
||||
)
|
||||
''')
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def get_cached_url(db_path: str, url: str) -> Optional[tuple]:
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT url, html, cleaned_html, markdown, parsed_json, success FROM crawled_data WHERE url = ?', (url,))
|
||||
result = cursor.fetchone()
|
||||
conn.close()
|
||||
return result
|
||||
|
||||
def cache_url(db_path: str, url: str, html: str, cleaned_html: str, markdown: str, parsed_json: str, success: bool):
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, parsed_json, success)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
html = excluded.html,
|
||||
cleaned_html = excluded.cleaned_html,
|
||||
markdown = excluded.markdown,
|
||||
parsed_json = excluded.parsed_json,
|
||||
success = excluded.success
|
||||
''', (str(url), html, cleaned_html, markdown, parsed_json, success))
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def get_total_count(db_path: str) -> int:
|
||||
try:
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT COUNT(*) FROM crawled_data')
|
||||
result = cursor.fetchone()
|
||||
conn.close()
|
||||
return result[0]
|
||||
except Exception as e:
|
||||
return 0
|
||||
Reference in New Issue
Block a user