From 597fe8bdb7a7a1df154cc149a4ffcbcba848c57c Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 5 Jul 2024 17:04:57 +0800 Subject: [PATCH] chore: Delete existing database file and initialize new database This commit deletes the existing database file and initializes a new database in `crawl4ai/database.py`: the `os.remove()` function deletes the file if it exists, and then `init_db()` is called to initialize the new database, so that work starts from a clean database state. It also renames the `link` column to `links` in the table schema, removes debug `print` statements from the LLM extraction error paths in `crawl4ai/extraction_strategy.py` and `crawl4ai/utils.py`, and makes `setup.py` clear the existing `.crawl4ai/cache` folder before recreating it. --- crawl4ai/database.py | 11 +++++++---- crawl4ai/extraction_strategy.py | 1 - crawl4ai/utils.py | 2 -- setup.py | 5 +++++ 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/crawl4ai/database.py b/crawl4ai/database.py index 47f41748..37d94463 100644 --- a/crawl4ai/database.py +++ b/crawl4ai/database.py @@ -20,7 +20,7 @@ def init_db(): extracted_content TEXT, success BOOLEAN, media TEXT DEFAULT "{}", - link TEXT DEFAULT "{}", + links TEXT DEFAULT "{}", metadata TEXT DEFAULT "{}", screenshot TEXT DEFAULT "" ) @@ -127,6 +127,9 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}" print(f"Error updating existing records: {e}") if __name__ == "__main__": - init_db() # Initialize the database if not already initialized - alter_db_add_screenshot("metadata") # Add the new column to the table - update_existing_records("metadata") # Update existing records to set the new column to an empty string + # Delete the existing database file + if os.path.exists(DB_PATH): + os.remove(DB_PATH) + init_db() + # alter_db_add_screenshot("COL_NAME") + diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index d4415c88..5d5ac836 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -116,7 +116,6 @@ class LLMExtractionStrategy(ExtractionStrategy): for block in blocks: block['error'] = False except Exception as e: - print("Error extracting blocks:", str(e)) parsed, unparsed = 
split_and_parse_json_objects(response.choices[0].message.content) blocks = parsed if unparsed: diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 474ce395..c8d4b993 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -664,7 +664,6 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None): for block in blocks: block['error'] = False except Exception as e: - print("Error extracting blocks:", str(e)) parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content) blocks = parsed # Append all unparsed segments as onr error block and content is list of unparsed segments @@ -710,7 +709,6 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke blocks = json.loads(blocks) except Exception as e: - print("Error extracting blocks:", str(e)) blocks = [{ "index": 0, "tags": ["error"], diff --git a/setup.py b/setup.py index 468dc56e..674d628e 100644 --- a/setup.py +++ b/setup.py @@ -5,10 +5,15 @@ import subprocess from setuptools.command.install import install # Create the .crawl4ai folder in the user's home directory if it doesn't exist +# If the folder already exists, remove the cache folder crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") +if os.path.exists(f"{crawl4ai_folder}/cache"): + subprocess.run(["rm", "-rf", f"{crawl4ai_folder}/cache"]) os.makedirs(crawl4ai_folder, exist_ok=True) os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True) + + # Read the requirements from requirements.txt with open("requirements.txt") as f: requirements = f.read().splitlines()