From 181250cb93f8e2754974c16d82cbde11ad593b67 Mon Sep 17 00:00:00 2001 From: unclecode Date: Thu, 9 May 2024 19:42:43 +0800 Subject: [PATCH] `chore: Add function to clear the database` --- .gitignore | 3 ++- crawler/database.py | 10 +++++++++- examples/quickstart.py | 4 ++-- main.py | 8 +++++++- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 020ec7c4..59f39306 100644 --- a/.gitignore +++ b/.gitignore @@ -163,4 +163,5 @@ cython_debug/ Crawl4AI.egg-info/ Crawl4AI.egg-info/* -crawler_data.db \ No newline at end of file +crawler_data.db +.vscode/ \ No newline at end of file diff --git a/crawler/database.py b/crawler/database.py index 294d894f..89048d05 100644 --- a/crawler/database.py +++ b/crawler/database.py @@ -50,4 +50,12 @@ def get_total_count(db_path: str) -> int: conn.close() return result[0] except Exception as e: - return 0 \ No newline at end of file + return 0 + +# Create function to clear the database +def clear_db(db_path: str): + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + cursor.execute('DELETE FROM crawled_data') + conn.commit() + conn.close() \ No newline at end of file diff --git a/examples/quickstart.py b/examples/quickstart.py index 9fc26a30..57f71eaa 100644 --- a/examples/quickstart.py +++ b/examples/quickstart.py @@ -8,13 +8,13 @@ def main(): crawler = WebCrawler(db_path='crawler_data.db') # Fetch a single page - single_url = UrlModel(url='https://kidocode.com', forced=True) + single_url = UrlModel(url='https://techcrunch.com/', forced=True) result = crawler.fetch_page( single_url, provider= "openai/gpt-3.5-turbo", api_token = os.getenv('OPENAI_API_KEY'), extract_blocks_flag=True, - word_count_threshold=5 + word_count_threshold=10 ) print(result.model_dump()) diff --git a/main.py b/main.py index 6191c9ce..c8a4bdcf 100644 --- a/main.py +++ b/main.py @@ -10,7 +10,7 @@ import asyncio from concurrent.futures import ThreadPoolExecutor, as_completed import chromedriver_autoinstaller from 
functools import lru_cache -from crawler.database import get_total_count +from crawler.database import get_total_count, clear_db import os import uuid @@ -56,6 +56,12 @@ async def get_total_url_count(): count = get_total_count(db_path='crawler_data.db') return JSONResponse(content={"count": count}) +# Add endpoint to clear db +@app.get("/clear-db") +async def clear_database(): + clear_db(db_path='crawler_data.db') + return JSONResponse(content={"message": "Database cleared."}) + @app.post("/crawl") async def crawl_urls(urls_input: UrlsInput, request: Request): global current_requests