Enhance installation and migration processes

- Added a post-installation setup script for initialization.
- Updated README with installation notes for Playwright setup.
- Enhanced migration logging for better error visibility.
- Added 'pydantic' to requirements.
- Bumped version to 0.3.746.
This commit is contained in:
UncleCode
2024-11-29 18:48:44 +08:00
parent 12e73d4898
commit d202f3539b
8 changed files with 90 additions and 102 deletions

View File

@@ -27,6 +27,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
1. Install Crawl4AI: 1. Install Crawl4AI:
```bash ```bash
pip install crawl4ai pip install crawl4ai
crawl4ai-setup # Setup the browser
``` ```
2. Run a simple web crawl: 2. Run a simple web crawl:
@@ -125,34 +126,6 @@ if __name__ == "__main__":
✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/) ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)
## Features ✨
- 🆓 Completely free and open-source
- 🚀 Blazing fast performance, outperforming many paid services
- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
- 🌐 Multi-browser support (Chromium, Firefox, WebKit)
- 🌍 Supports crawling multiple URLs simultaneously
- 🎨 Extracts and returns all media tags (Images, Audio, and Video)
- 🔗 Extracts all external and internal links
- 📚 Extracts metadata from the page
- 🔄 Custom hooks for authentication, headers, and page modifications
- 🕵️ User-agent customization
- 🖼️ Takes screenshots of pages with enhanced error handling
- 📜 Executes multiple custom JavaScripts before crawling
- 📊 Generates structured output without LLM using JsonCssExtractionStrategy
- 📚 Various chunking strategies: topic-based, regex, sentence, and more
- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more
- 🎯 CSS selector support for precise data extraction
- 📝 Passes instructions/keywords to refine extraction
- 🔒 Proxy support with authentication for enhanced access
- 🔄 Session management for complex multi-page crawling
- 🌐 Asynchronous architecture for improved performance
- 🖼️ Improved image processing with lazy-loading detection
- 🕰️ Enhanced handling of delayed content loading
- 🔑 Custom headers support for LLM interactions
- 🖼️ iframe content extraction for comprehensive analysis
- ⏱️ Flexible timeout and delayed content retrieval options
## Installation 🛠️ ## Installation 🛠️
Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.
@@ -168,11 +141,12 @@ For basic web crawling and scraping tasks:
```bash ```bash
pip install crawl4ai pip install crawl4ai
crawl4ai-setup # Setup the browser
``` ```
By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling. By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.
👉 **Note**: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: 👉 **Note**: When you install Crawl4AI, the `crawl4ai-setup` should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:
1. Through the command line: 1. Through the command line:

View File

@@ -4,7 +4,6 @@ from .async_webcrawler import AsyncWebCrawler, CacheMode
from .models import CrawlResult from .models import CrawlResult
from .__version__ import __version__ from .__version__ import __version__
# __version__ = "0.3.73"
__all__ = [ __all__ = [
"AsyncWebCrawler", "AsyncWebCrawler",

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py # crawl4ai/_version.py
__version__ = "0.3.745" __version__ = "0.3.746"

44
crawl4ai/install.py Normal file
View File

@@ -0,0 +1,44 @@
import subprocess
import sys
import asyncio
from .async_logger import AsyncLogger, LogLevel
# Initialize logger
# Module-level logger shared by all post-install steps below; DEBUG level
# with verbose=True so the one-time setup prints full progress to the console.
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
def post_install():
    """Entry point for the `crawl4ai-setup` console script.

    Runs every post-installation task in order: first the Playwright
    browser download, then a best-effort database initialization. Each
    task handles (and logs) its own failures, so this function completes
    even when an individual step cannot.
    """
    logger.info("Running post-installation setup...", tag="INIT")
    for task in (install_playwright, run_migration):
        task()
    logger.success("Post-installation setup completed!", tag="COMPLETE")
def install_playwright():
    """Install the Playwright browser binaries.

    Shells out to ``python -m playwright install`` using the current
    interpreter. Failures are logged but never raised, so a broken
    Playwright setup does not abort the rest of the post-install flow;
    the user is told how to finish the installation manually instead.
    """
    logger.info("Installing Playwright browsers...", tag="INIT")
    # Single shared fallback hint — previously duplicated verbatim in both
    # handlers, which risked the two copies drifting apart.
    manual_hint = "Please run 'python -m playwright install' manually after the installation."
    try:
        subprocess.check_call([sys.executable, "-m", "playwright", "install"])
        logger.success("Playwright installation completed successfully.", tag="COMPLETE")
    except subprocess.CalledProcessError as e:
        # The playwright CLI ran but exited non-zero.
        logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
        logger.warning(manual_hint)
    except Exception as e:
        # Anything else (e.g. playwright module missing entirely).
        logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
        logger.warning(manual_hint)
def run_migration():
    """Initialize database during installation"""
    try:
        logger.info("Starting database initialization...", tag="INIT")
        # Deferred import: the DB layer may not be importable in a partial
        # install, in which case we fall back to first-use initialization.
        from crawl4ai.async_database import async_db_manager

        asyncio.run(async_db_manager.initialize())
        logger.success("Database initialization completed successfully.", tag="COMPLETE")
    except ImportError:
        logger.warning("Database module not found. Will initialize on first use.")
    except Exception as exc:
        # Best-effort only — a failed init here is not fatal to installation.
        logger.warning(f"Database initialization failed: {exc}")
        logger.warning("Database will be initialized on first use")

View File

@@ -9,9 +9,13 @@ import aiofiles
import shutil import shutil
import time import time
from datetime import datetime from datetime import datetime
from .async_logger import AsyncLogger, LogLevel
logging.basicConfig(level=logging.INFO) # Initialize logger
logger = logging.getLogger(__name__) logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)
class DatabaseMigration: class DatabaseMigration:
def __init__(self, db_path: str): def __init__(self, db_path: str):
@@ -55,7 +59,8 @@ class DatabaseMigration:
async def migrate_database(self): async def migrate_database(self):
"""Migrate existing database to file-based storage""" """Migrate existing database to file-based storage"""
logger.info("Starting database migration...") # logger.info("Starting database migration...")
logger.info("Starting database migration...", tag="INIT")
try: try:
async with aiosqlite.connect(self.db_path) as db: async with aiosqlite.connect(self.db_path) as db:
@@ -91,19 +96,25 @@ class DatabaseMigration:
migrated_count += 1 migrated_count += 1
if migrated_count % 100 == 0: if migrated_count % 100 == 0:
logger.info(f"Migrated {migrated_count} records...") logger.info(f"Migrated {migrated_count} records...", tag="INIT")
await db.commit() await db.commit()
logger.info(f"Migration completed. {migrated_count} records processed.") logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE")
except Exception as e: except Exception as e:
logger.error(f"Migration failed: {e}") # logger.error(f"Migration failed: {e}")
raise logger.error(
message="Migration failed: {error}",
tag="ERROR",
params={"error": str(e)}
)
raise e
async def backup_database(db_path: str) -> str: async def backup_database(db_path: str) -> str:
"""Create backup of existing database""" """Create backup of existing database"""
if not os.path.exists(db_path): if not os.path.exists(db_path):
logger.info("No existing database found. Skipping backup.") logger.info("No existing database found. Skipping backup.", tag="INIT")
return None return None
# Create backup with timestamp # Create backup with timestamp
@@ -116,11 +127,16 @@ async def backup_database(db_path: str) -> str:
# Create backup # Create backup
shutil.copy2(db_path, backup_path) shutil.copy2(db_path, backup_path)
logger.info(f"Database backup created at: {backup_path}") logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE")
return backup_path return backup_path
except Exception as e: except Exception as e:
logger.error(f"Backup failed: {e}") # logger.error(f"Backup failed: {e}")
raise logger.error(
message="Backup failed: {error}",
tag="ERROR",
params={"error": str(e)}
)
raise e
async def run_migration(db_path: Optional[str] = None): async def run_migration(db_path: Optional[str] = None):
"""Run database migration""" """Run database migration"""
@@ -128,7 +144,7 @@ async def run_migration(db_path: Optional[str] = None):
db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db") db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")
if not os.path.exists(db_path): if not os.path.exists(db_path):
logger.info("No existing database found. Skipping migration.") logger.info("No existing database found. Skipping migration.", tag="INIT")
return return
# Create backup first # Create backup first

View File

@@ -562,18 +562,18 @@ async def fit_markdown_remove_overlay():
async def main(): async def main():
# await simple_crawl() await simple_crawl()
# await simple_example_with_running_js_code() await simple_example_with_running_js_code()
# await simple_example_with_css_selector() await simple_example_with_css_selector()
# await use_proxy() # await use_proxy()
# await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
# await extract_structured_data_using_css_extractor() await extract_structured_data_using_css_extractor()
# LLM extraction examples # LLM extraction examples
# await extract_structured_data_using_llm() # await extract_structured_data_using_llm()
# await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
# await extract_structured_data_using_llm("ollama/llama3.2") # await extract_structured_data_using_llm("ollama/llama3.2")
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# You always can pass custom headers to the extraction strategy # You always can pass custom headers to the extraction strategy
# custom_headers = { # custom_headers = {
@@ -582,9 +582,9 @@ async def main():
# } # }
# await extract_structured_data_using_llm(extra_headers=custom_headers) # await extract_structured_data_using_llm(extra_headers=custom_headers)
# await crawl_dynamic_content_pages_method_1() await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2() await crawl_dynamic_content_pages_method_2()
# await crawl_dynamic_content_pages_method_3() await crawl_dynamic_content_pages_method_3()
await crawl_custom_browser_type() await crawl_custom_browser_type()

View File

@@ -1,5 +1,4 @@
aiosqlite~=0.20 aiosqlite~=0.20
html2text~=2024.2
lxml~=5.3 lxml~=5.3
litellm>=1.53.1 litellm>=1.53.1
numpy>=1.26.0,<3 numpy>=1.26.0,<3
@@ -13,4 +12,5 @@ xxhash~=3.4
rank-bm25~=0.2 rank-bm25~=0.2
aiofiles>=24.1.0 aiofiles>=24.1.0
colorama~=0.4 colorama~=0.4
snowballstemmer~=2.2 snowballstemmer~=2.2
pydantic>=2.10

View File

@@ -1,11 +1,8 @@
from setuptools import setup, find_packages from setuptools import setup, find_packages
from setuptools.command.install import install
import os import os
from pathlib import Path from pathlib import Path
import shutil import shutil
import subprocess
import sys
import asyncio
# Create the .crawl4ai folder in the user's home directory if it doesn't exist # Create the .crawl4ai folder in the user's home directory if it doesn't exist
# If the folder already exists, remove the cache folder # If the folder already exists, remove the cache folder
@@ -49,46 +46,6 @@ transformer_requirements = ["transformers", "tokenizers"]
cosine_similarity_requirements = ["torch", "transformers", "nltk"] cosine_similarity_requirements = ["torch", "transformers", "nltk"]
sync_requirements = ["selenium"] sync_requirements = ["selenium"]
def install_playwright():
print("Installing Playwright browsers...")
try:
subprocess.check_call([sys.executable, "-m", "playwright", "install"])
print("Playwright installation completed successfully.")
except subprocess.CalledProcessError as e:
print(f"Error during Playwright installation: {e}")
print(
"Please run 'python -m playwright install' manually after the installation."
)
except Exception as e:
print(f"Unexpected error during Playwright installation: {e}")
print(
"Please run 'python -m playwright install' manually after the installation."
)
def run_migration():
"""Initialize database during installation"""
try:
print("Starting database initialization...")
from crawl4ai.async_database import async_db_manager
asyncio.run(async_db_manager.initialize())
print("Database initialization completed successfully.")
except ImportError:
print("Warning: Database module not found. Will initialize on first use.")
except Exception as e:
print(f"Warning: Database initialization failed: {e}")
print("Database will be initialized on first use")
class PostInstallCommand(install):
def run(self):
install.run(self)
install_playwright()
# run_migration()
setup( setup(
name="Crawl4AI", name="Crawl4AI",
version=version, version=version,
@@ -116,7 +73,8 @@ setup(
entry_points={ entry_points={
"console_scripts": [ "console_scripts": [
"crawl4ai-download-models=crawl4ai.model_loader:main", "crawl4ai-download-models=crawl4ai.model_loader:main",
"crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command "crawl4ai-migrate=crawl4ai.migrations:main",
'crawl4ai-setup=crawl4ai.install:post_install',
], ],
}, },
classifiers=[ classifiers=[
@@ -130,7 +88,4 @@ setup(
"Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.10",
], ],
python_requires=">=3.7", python_requires=">=3.7",
cmdclass={
"install": PostInstallCommand,
},
) )