chore: Add function to clear the database
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -163,4 +163,5 @@ cython_debug/
|
|||||||
|
|
||||||
Crawl4AI.egg-info/
|
Crawl4AI.egg-info/
|
||||||
Crawl4AI.egg-info/*
|
Crawl4AI.egg-info/*
|
||||||
crawler_data.db
|
crawler_data.db
|
||||||
|
.vscode/
|
||||||
@@ -50,4 +50,12 @@ def get_total_count(db_path: str) -> int:
|
|||||||
conn.close()
|
conn.close()
|
||||||
return result[0]
|
return result[0]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
# Create function to clear the database
|
||||||
|
def clear_db(db_path: str):
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute('DELETE FROM crawled_data')
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
@@ -8,13 +8,13 @@ def main():
|
|||||||
crawler = WebCrawler(db_path='crawler_data.db')
|
crawler = WebCrawler(db_path='crawler_data.db')
|
||||||
|
|
||||||
# Fetch a single page
|
# Fetch a single page
|
||||||
single_url = UrlModel(url='https://kidocode.com', forced=True)
|
single_url = UrlModel(url='https://techcrunch.com/', forced=True)
|
||||||
result = crawler.fetch_page(
|
result = crawler.fetch_page(
|
||||||
single_url,
|
single_url,
|
||||||
provider= "openai/gpt-3.5-turbo",
|
provider= "openai/gpt-3.5-turbo",
|
||||||
api_token = os.getenv('OPENAI_API_KEY'),
|
api_token = os.getenv('OPENAI_API_KEY'),
|
||||||
extract_blocks_flag=True,
|
extract_blocks_flag=True,
|
||||||
word_count_threshold=5
|
word_count_threshold=10
|
||||||
)
|
)
|
||||||
print(result.model_dump())
|
print(result.model_dump())
|
||||||
|
|
||||||
|
|||||||
8
main.py
8
main.py
@@ -10,7 +10,7 @@ import asyncio
|
|||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
import chromedriver_autoinstaller
|
import chromedriver_autoinstaller
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from crawler.database import get_total_count
|
from crawler.database import get_total_count, clear_db
|
||||||
import os
|
import os
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
@@ -56,6 +56,12 @@ async def get_total_url_count():
|
|||||||
count = get_total_count(db_path='crawler_data.db')
|
count = get_total_count(db_path='crawler_data.db')
|
||||||
return JSONResponse(content={"count": count})
|
return JSONResponse(content={"count": count})
|
||||||
|
|
||||||
|
# Add endpoint to clear db
|
||||||
|
@app.get("/clear-db")
|
||||||
|
async def clear_database():
|
||||||
|
clear_db(db_path='crawler_data.db')
|
||||||
|
return JSONResponse(content={"message": "Database cleared."})
|
||||||
|
|
||||||
@app.post("/crawl")
|
@app.post("/crawl")
|
||||||
async def crawl_urls(urls_input: UrlsInput, request: Request):
|
async def crawl_urls(urls_input: UrlsInput, request: Request):
|
||||||
global current_requests
|
global current_requests
|
||||||
|
|||||||
Reference in New Issue
Block a user